diff --git a/contrib/arm-optimized-routines/README b/contrib/arm-optimized-routines/README
index a2143a28488a..651ebdc84bc8 100644
--- a/contrib/arm-optimized-routines/README
+++ b/contrib/arm-optimized-routines/README
@@ -1,60 +1,60 @@
 Arm Optimized Routines
 ----------------------
 
 This repository contains implementations of library functions
 provided by Arm. The outbound license is available under a dual
 license, at the user’s election, as reflected in the LICENSE file.
 Contributions to this project are accepted, but Contributors have
 to sign an Assignment Agreement, please follow the instructions in
 contributor-agreement.pdf. This is needed so upstreaming code
 to projects that require copyright assignment is possible. Further
 contribution requirements are documented in README.contributors of
 the appropriate subdirectory.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v23.01.
+release is v24.01.
 
 Source code layout:
 
 build/          - build directory (created by make).
 math/           - math subproject sources.
 math/include/   - math library public headers.
 math/test/      - math test and benchmark related sources.
 math/tools/     - tools used for designing the algorithms.
 networking/     - networking subproject sources.
 networking/include/ - networking library public headers.
 networking/test/ - networking test and benchmark related sources.
 string/         - string routines subproject sources.
 string/include/ - string library public headers.
 string/test/    - string test and benchmark related sources.
 pl/...          - separately maintained performance library code.
 
 The steps to build the target libraries and run the tests:
 
 cp config.mk.dist config.mk
 # edit config.mk if necessary ...
 make
 make check
 
 Or building outside of the source directory:
 
 ln -s path/to/src/Makefile Makefile
 cp path/to/src/config.mk.dist config.mk
 echo 'srcdir = path/to/src' >> config.mk
 # further edits to config.mk
 make
 make check
 
 Or building and testing the math subproject only:
 
 make all-math
 make check-math
 
 The test system requires libmpfr and libmpc.
 For example on debian linux they can be installed as:
 
 sudo apt-get install libmpfr-dev libmpc-dev
 
 For cross build, CROSS_COMPILE should be set in config.mk and
 EMULATOR should be set for cross testing (e.g. using qemu-user or
 remote access to a target machine), see the examples in config.mk.dist.
diff --git a/contrib/arm-optimized-routines/config.mk.dist b/contrib/arm-optimized-routines/config.mk.dist
index 7a8497507a81..03fb54db52fa 100644
--- a/contrib/arm-optimized-routines/config.mk.dist
+++ b/contrib/arm-optimized-routines/config.mk.dist
@@ -1,93 +1,94 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2023, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 # Subprojects to build
 SUBS = math string networking
 
 # Subsubprojects to build if subproject pl is built
 PLSUBS = math
 
 # Target architecture: aarch64, arm or x86_64
 ARCH = aarch64
 
 # Use for cross compilation with gcc.
 #CROSS_COMPILE = aarch64-none-linux-gnu-
 
 # Compiler for the target
 CC = $(CROSS_COMPILE)gcc
 CFLAGS = -std=c99 -pipe -O3
 CFLAGS += -Wall -Wno-missing-braces
 CFLAGS += -Werror=implicit-function-declaration
 
 # Used for test case generator that is executed on the host
 HOST_CC = gcc
 HOST_CFLAGS = -std=c99 -O2
 HOST_CFLAGS += -Wall -Wno-unused-function
 
 # Enable debug info.
 HOST_CFLAGS += -g
 CFLAGS += -g
 
 # Optimize the shared libraries on aarch64 assuming they fit in 1M.
 #CFLAGS_SHARED = -fPIC -mcmodel=tiny
 
 # Enable MTE support.
 #CFLAGS += -march=armv8.5-a+memtag -DWANT_MTE_TEST=1
 
 # Use with cross testing.
 #EMULATOR = qemu-aarch64-static
 #EMULATOR = sh -c 'scp $$1 user@host:/dir && ssh user@host /dir/"$$@"' --
 
 # Additional flags for subprojects.
 math-cflags =
 math-ldlibs =
 math-ulpflags =
 math-testflags =
 string-cflags =
 networking-cflags =
 
 # Use if mpfr is available on the target for ulp error checking.
 #math-ldlibs += -lmpfr -lgmp
 #math-cflags += -DUSE_MPFR
 
 # Use with gcc.
 math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector
 math-cflags += -ffp-contract=fast -fno-math-errno
 
 # Use with clang.
 #math-cflags += -ffp-contract=fast
 
-# Disable vector math code
-#math-cflags += -DWANT_VMATH=0
-
-# Disable/enable SVE vector math code and tests
+# Disable/enable SVE vector math code and tests.
+# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
+# routines only so that SVE code does not leak into scalar
+# routines. It is also necessary to add it for tools (e.g. ulp,
+# mathbench)
 WANT_SVE_MATH = 0
 ifeq ($(WANT_SVE_MATH), 1)
-  math-cflags += -march=armv8.2-a+sve
+  math-sve-cflags = -march=armv8-a+sve
 endif
 math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
 
 # If defined to 1, set errno in math functions according to ISO C.  Many math
 # libraries do not set errno, so this is 0 by default.  It may need to be
 # set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
 WANT_ERRNO = 0
 math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)
 
 # If set to 1, set fenv in vector math routines.
 WANT_SIMD_EXCEPT = 0
 math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
 
 # Disable fenv checks
 #math-ulpflags = -q -f
 #math-testflags = -nostatus
 
 # Remove GNU Property Notes from asm files.
 #string-cflags += -DWANT_GNU_PROPERTY=0
 
 # Enable assertion checks.
 #networking-cflags += -DWANT_ASSERT
 
 # Avoid auto-vectorization of scalar code and unroll loops
 networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk
index 2a9cad10d96a..5e9494a7bd3c 100644
--- a/contrib/arm-optimized-routines/math/Dir.mk
+++ b/contrib/arm-optimized-routines/math/Dir.mk
@@ -1,115 +1,119 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019-2022, Arm Limited.
+# Copyright (c) 2019-2023, Arm Limited.
 # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/math
 B := build/math
 
 math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
 math-test-srcs := \
 	$(S)/test/mathtest.c \
 	$(S)/test/mathbench.c \
 	$(S)/test/ulp.c \
 
 math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
 
 math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
 math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
 
 math-libs := \
 	build/lib/libmathlib.so \
 	build/lib/libmathlib.a \
 
 math-tools := \
 	build/bin/mathtest \
 	build/bin/mathbench \
 	build/bin/mathbench_libc \
 	build/bin/runulp.sh \
 	build/bin/ulp \
 
 math-host-tools := \
 	build/bin/rtest \
 
 math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
 math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs)))
 math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
 math-target-objs := $(math-lib-objs) $(math-test-objs)
 math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
 
 math-files := \
 	$(math-objs) \
 	$(math-libs) \
 	$(math-tools) \
 	$(math-host-tools) \
 	$(math-includes) \
 	$(math-test-includes) \
 
 all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
 
 $(math-objs): $(math-includes) $(math-test-includes)
 $(math-objs): CFLAGS_ALL += $(math-cflags)
 $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
 $(math-host-objs): CC = $(HOST_CC)
 $(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS)
 
 $(B)/test/ulp.o: $(S)/test/ulp.h
 
 build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
 	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
 
 build/lib/libmathlib.a: $(math-lib-objs)
 	rm -f $@
 	$(AR) rc $@ $^
 	$(RANLIB) $@
 
 $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
 $(math-tools): LDLIBS += $(math-ldlibs) -lm
 
+# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
+$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
+
 build/bin/rtest: $(math-host-objs)
 	$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
 
 build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a
 	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
 
 build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a
 	$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
 
 # This is not ideal, but allows custom symbols in mathbench to get resolved.
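For reference, a minimal consumer of the library these rules produce can be built against build/lib/libmathlib.a once `make all-math` has run. A sketch, not part of the patch: it hand-declares the _ZGVnN2v_exp symbol for brevity (a real caller should include the installed build/include/mathlib.h) and assumes the GCC/clang vector-lane subscript extension. Compile with something like `cc demo.c build/lib/libmathlib.a -lm`.

    #include <stdio.h>
    #include <arm_neon.h>

    /* Exported by libmathlib; declared by hand here for brevity.  */
    __attribute__ ((aarch64_vector_pcs)) float64x2_t
    _ZGVnN2v_exp (float64x2_t);

    int
    main (void)
    {
      float64x2_t x = { 1.0, -708.0 };
      float64x2_t y = _ZGVnN2v_exp (x);
      printf ("exp(%g) = %a\n", x[0], y[0]);
      printf ("exp(%g) = %a\n", x[1], y[1]);
      return 0;
    }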
build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/include/%.h: $(S)/include/%.h cp $< $@ build/include/test/%.h: $(S)/test/%.h cp $< $@ build/bin/%.sh: $(S)/test/%.sh cp $< $@ math-tests := $(wildcard $(S)/test/testcases/directed/*.tst) math-rtests := $(wildcard $(S)/test/testcases/random/*.tst) check-math-test: $(math-tools) cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-ulp: $(math-tools) ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp install-math: \ $(math-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ $(math-includes:build/include/%=$(DESTDIR)$(includedir)/%) clean-math: rm -f $(math-files) .PHONY: all-math check-math-test check-math-rtest check-math-ulp check-math install-math clean-math diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c new file mode 100644 index 000000000000..9a73575bce89 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; +} data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + n = vsubq_f64 (n, v_f64 (0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
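The quadrant computation in v_cos.c above relies on the classic shift trick: adding 0x1.8p52 forces rounding at integer granularity, leaving the integer part of (|x| + pi/2)/pi in the low mantissa bits. A scalar sketch of the same steps (it assumes the default round-to-nearest mode and the asuint64 helper from math/math_config.h; the real code fuses the multiply-add and refines r with the pi_2/pi_3 terms):

    static double
    reduce_cos (double ax, uint64_t *odd)
    {
      const double shift = 0x1.8p52;
      const double inv_pi = 0x1.45f306dc9c883p-2;
      const double half_pi = 0x1.921fb54442d18p+0;
      const double pi_1 = 0x1.921fb54442d18p+1;
      /* n = rint ((ax + pi/2)/pi) - 0.5; the biased sum keeps the
         quadrant's integer part in the low mantissa bits.  */
      double nb = shift + (ax + half_pi) * inv_pi;
      *odd = asuint64 (nb) << 63;  /* quadrant parity -> sign flip */
      double n = (nb - shift) - 0.5;
      return ax - n * pi_1;        /* r, before pi_2/pi_3 refinement */
    }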
*/ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c new file mode 100644 index 000000000000..b9890b2998ad --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cosf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, r3, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f32 (x); + cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), + vreinterpretq_u32_f32 (d->range_val)); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, v_f32 (1.0f), r); +#else + cmp = vcageq_f32 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + r3 = vmulq_f32 (r2, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, y, r3); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c new file mode 100644 index 000000000000..bc5609faf4fc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c @@ -0,0 +1,125 @@ +/* + * Double-precision vector e^x function. 
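Both cos variants above share the WANT_SIMD_EXCEPT idiom: special lanes are overwritten with a harmless input before the fast path runs, so it raises no spurious fenv flags, and the saved original lanes are recomputed by the scalar fallback. The shape of the idiom, reduced to a sketch using the v_math.h helpers (main_path and scalar_fn are hypothetical stand-ins for a concrete routine):

    #include "v_math.h"

    float32x4_t main_path (float32x4_t);   /* hypothetical */
    float scalar_fn (float);               /* hypothetical */

    float32x4_t
    masked_eval (float32x4_t x, uint32x4_t special)
    {
      float32x4_t xm = x;                        /* keep original lanes */
      x = vbslq_f32 (special, v_f32 (1.0f), x);  /* 1.0 raises no flags */
      float32x4_t y = main_path (x);
      /* Recompute only the masked lanes with the scalar routine.  */
      return v_call_f32 (scalar_fn, xm, y, special);
    }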
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+  float64x2_t poly[3];
+  float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+  float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.88 +0.5 ulp
+     rel error: 1.4337*2^-53
+     abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+  .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+	    V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
+  .special_bound = V2 (704.0),
+#endif
+  .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
+  .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
+  .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+  .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes. */
+  return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint (1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+  /* 2^(n/N) may overflow, break it up into s1*s2. */
+  uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (
+      vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+  uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+  float64x2_t r1 = vmulq_f64 (s1, s1);
+  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+  return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+  float64x2_t n, r, r2, s, y, z;
+  uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special_case to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly. */
+  float64x2_t xm = x;
+  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+  if (unlikely (v_any_u64 (cmp)))
+    x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+  cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+  /* n = round(x/(ln2/N)). */
+  z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+  u = vreinterpretq_u64_f64 (z);
+  n = vsubq_f64 (z, data.shift);
+
+  /* r = x - n*ln2/N. */
+  r = x;
+  r = vfmsq_f64 (r, data.ln2_hi, n);
+  r = vfmsq_f64 (r, data.ln2_lo, n);
+
+  e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (C (0), C (1), r);
+  y = vfmaq_f64 (y, C (2), r2);
+  y = vfmaq_f64 (r, y, r2);
+
+  /* s = 2^(n/N).
*/ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c new file mode 100644 index 000000000000..e402205e98e6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.962 ulp. */ + .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), + V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
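The table step that closes v_exp.c above, and the scale computation in the exp2f code that follows, both construct powers of two by integer arithmetic on the IEEE exponent field. Scalar equivalents of the two patterns (asfloat/asdouble as defined in math/math_config.h; valid only while the result stays normal):

    static float
    exp2_scale (int n)
    {
      /* 2^n = asfloat ((n + 127) << 23) for -126 <= n <= 127.  */
      return asfloat ((uint32_t) (n + 127) << 23);
    }

    static double
    exp_scale (uint64_t tab_entry, int64_t k)
    {
      /* s = 2^k * 2^(j/N): add k straight into the exponent bits of
         the table value for index j.  */
      return asdouble (tab_entry + ((uint64_t) k << 52));
    }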
*/ + n = vrndaq_f32 (x); + r = vsubq_f32 (x, n); + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c new file mode 100644 index 000000000000..ba6b02fbb4bc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c @@ -0,0 +1,72 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.878 ulp. */ + 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) +#define C5 v_f32 (Poly[5]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ + float32x4_t n, r, scale, poly, absn; + uint32x4_t cmp, e; + + /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ +#if 0 + float32x4_t z; + z = x + Shift; + n = z - Shift; + r = x - n; + e = vreinterpretq_u32_f32 (z) << 23; +#else + n = vrndaq_f32 (x); + r = x - n; + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; +#endif + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (C5, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c new file mode 100644 index 000000000000..45f0848cac5b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c @@ -0,0 +1,146 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019-2023, Arm Limited. 
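The non-fenv special_case handlers in these exp-family files all use the same device: when 2^n would overflow on its own, the scale is split into two representable factors applied on either side of the polynomial. A scalar sketch of the idea (reusing the hypothetical exp2_scale helper from the previous sketch; the vector code picks the split with bit tricks rather than a division):

    /* result ~= p * 2^n without overflowing the intermediate 2^n.  */
    static float
    scaled_result (float p, int n)
    {
      int n1 = n / 2;
      float s1 = exp2_scale (n1);
      float s2 = exp2_scale (n - n1);
      return (p * s2) * s1;   /* multiply by the larger factor last */
    }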
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +# define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. */ +const uint64_t __v_exp_data[] = { +# if N == 128 + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +# elif N == 256 + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, 
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 
0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +# endif +}; diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf.c b/contrib/arm-optimized-routines/math/aarch64/v_expf.c new file mode 100644 index 000000000000..34e8b6081bcd --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_expf.c @@ -0,0 +1,122 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
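The __v_exp_data entries above appear to be asuint64(2^(j/N)) with j's contribution to the exponent field folded out, so the rounded shift value in the main routine can serve as both table index and exponent increment. A generator sketch under that assumption (the shipped table may differ in the last bit if it was produced with higher-precision rounding; BITS matches the N == 128 variant):

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>
    #include <string.h>
    #include <math.h>

    int
    main (void)
    {
      enum { BITS = 7, N = 1 << BITS };   /* V_EXP_TABLE_BITS == 7 */
      for (int j = 0; j < N; j++)
        {
          double d = exp2 ((double) j / N);
          uint64_t u;
          memcpy (&u, &d, sizeof u);
          u -= (uint64_t) j << (52 - BITS); /* fold j out of the exponent */
          printf ("0x%016" PRIx64 ",\n", u);
        }
      return 0;
    }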
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly, z; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = vfmaq_f32 (d->shift, x, d->inv_ln2); + n = vsubq_f32 (z, d->shift); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); + e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c new file mode 100644 index 000000000000..43d03fa34efa --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c @@ -0,0 +1,77 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.36565 +0.5 ulp. */ + 0x1.6a6000p-10f, + 0x1.12718ep-7f, + 0x1.555af0p-5f, + 0x1.555430p-3f, + 0x1.fffff4p-2f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) +{ + float32x4_t n, r, scale, poly, absn, z; + uint32x4_t cmp, e; + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
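The reduction in both expf variants subtracts n*ln2 in two pieces, the usual Cody-Waite scheme: Ln2hi carries the leading bits exactly enough that its product rounds harmlessly, Ln2lo carries the remainder. In scalar form, with the constants from the file:

    #include <math.h>

    static float
    reduce_exp (float x, float n)
    {
      const float ln2_hi = 0x1.62e4p-1f;    /* leading bits of ln2 */
      const float ln2_lo = 0x1.7f7d1cp-20f; /* remainder           */
      float r = fmaf (n, -ln2_hi, x);
      return fmaf (n, -ln2_lo, r);
    }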
*/ +#if 1 + z = vfmaq_f32 (Shift, x, InvLn2); + n = z - Shift; + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = vrndaq_f32 (z); + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; +#endif + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log.c b/contrib/arm-optimized-routines/math/aarch64/v_log.c new file mode 100644 index 000000000000..1d1c1fa62c04 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_log.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), + V2 (-0x1.554e550bd501ep-3) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .min_norm = V2 (0x0010000000000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ + .sign_exp_mask = V2 (0xfff0000000000000) +}; + +#define A(i) d->poly[i] +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) +#define Off v_u64 (0x3fe6900900000000) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + uint32x2_t cmp) +{ + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t z, r, r2, p, y, kd, hi; + uint64x2_t ix, iz, tmp; + uint32x2_t cmp; + int64x2_t k; + struct entry e; + + ix = vreinterpretq_u64_f64 (x); + cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = vsubq_u64 (ix, Off); + k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + z = vreinterpretq_f64_u64 (iz); + e = lookup (tmp); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
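The x = 2^k * z split performed by v_log.c above reads more easily in scalar form (asuint64/asdouble as in math/math_config.h; Off is roughly 0.7051, so z lands in [Off, 2*Off) and the table index comes from the top mantissa bits of tmp):

    static double
    log_split (double x, int64_t *k)
    {
      const uint64_t off = 0x3fe6900900000000;
      uint64_t ix = asuint64 (x);
      uint64_t tmp = ix - off;
      *k = (int64_t) tmp >> 52;               /* arithmetic shift      */
      uint64_t iz = ix - (tmp & (0xfffULL << 52)); /* strip k, keep z  */
      return asdouble (iz);
    }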
*/ + r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (A (2), A (3), r); + p = vfmaq_f64 (A (0), A (1), r); + y = vfmaq_f64 (y, A (4), r2); + y = vfmaq_f64 (p, y, r2); + + if (unlikely (v_any_u32h (cmp))) + return special_case (x, y, hi, r2, cmp); + return vfmaq_f64 (hi, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log_data.c b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c new file mode 100644 index 000000000000..82351bb14766 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c @@ -0,0 +1,156 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define N (1 << V_LOG_TABLE_BITS) + +const struct v_log_data __v_log_data = { + /* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: + + table[i].invc = 1/c + table[i].logc = (double)log(c) + + where c is near the center of the subinterval and is chosen by trying several + floating point invc candidates around 1/center and selecting one for which + the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval + that contains 1 and the previous one got tweaked to avoid cancellation. */ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + 
{ 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + 
{ 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/contrib/arm-optimized-routines/math/aarch64/v_logf.c b/contrib/arm-optimized-routines/math/aarch64/v_logf.c new file mode 100644 index 000000000000..66ebbbcd2b5a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_logf.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + float32x4_t poly[7]; + float32x4_t ln2, tiny_bound; + uint32x4_t off, mantissa_mask; +} data = { + /* 3.34 ulp error. */ + .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), + V4 (-0x1.ffffc8p-2f) }, + .ln2 = V4 (0x1.62e43p-1f), + .tiny_bound = V4 (0x1p-126), + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) +}; + +#define P(i) d->poly[7 - i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, + uint16x4_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, p, q, r, r2, y; + uint32x4_t u; + uint16x4_t cmp; + + u = vreinterpretq_u32_f32 (x); + cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = vsubq_u32 (u, d->off); + n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vandq_u32 (u, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
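The evaluation that follows in v_logf.c is pairwise Horner: coefficients are first paired against r, then the pairs are folded with powers of r2 = r*r, giving three short fma chains instead of one serial recurrence. A scalar rendering of the same order of operations (the caller still forms n*ln2 + r + r2*y afterwards, as the code below does):

    #include <math.h>

    /* c[0] = P7 (highest order) ... c[6] = P1.  */
    static float
    log1p_series (const float c[7], float r)
    {
      float r2 = r * r;
      float p = fmaf (c[1], r, c[2]);   /* P6*r + P5 */
      float q = fmaf (c[3], r, c[4]);   /* P4*r + P3 */
      float y = fmaf (c[5], r, c[6]);   /* P2*r + P1 */
      p = fmaf (c[0], r2, p);           /* + P7*r2   */
      q = fmaf (p, r2, q);
      return fmaf (q, r2, y);
    }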
*/ + p = vfmaq_f32 (P (5), P (6), r); + q = vfmaq_f32 (P (3), P (4), r); + y = vfmaq_f32 (P (1), P (2), r); + p = vfmaq_f32 (p, P (7), r2); + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + if (unlikely (v_any_u16h (cmp))) + return special_case (x, y, r2, p, cmp); + return vfmaq_f32 (p, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_math.h b/contrib/arm-optimized-routines/math/aarch64/v_math.h new file mode 100644 index 000000000000..1dc9916c6fb0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_math.h @@ -0,0 +1,135 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#if !__aarch64__ +# error "Cannot build without AArch64" +#endif + +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun + +#include +#include "../math_config.h" +#include + +/* Shorthand helpers for declaring constants. */ +# define V2(X) { X, X } +# define V4(X) { X, X, X, X } +# define V8(X) { X, X, X, X, X, X, X, X } + +static inline int +v_any_u16h (uint16x4_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; +} + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline float32x4_t +v_f32 (float x) +{ + return (float32x4_t) V4 (x); +} +static inline uint32x4_t +v_u32 (uint32_t x) +{ + return (uint32x4_t) V4 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (uint32x4_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +static inline int +v_any_u32h (uint32x2_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; +} +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) +{ + return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) +{ + return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (uint64x2_t x) +{ + /* assume elements in x are either 0 or -1u. 
*/ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/v_pow.c b/contrib/arm-optimized-routines/math/aarch64/v_pow.c new file mode 100644 index 000000000000..734f1663a283 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_pow.c @@ -0,0 +1,22 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + float64x2_t z; + for (int lane = 0; lane < v_lanes64 (); lane++) + { + double sx = x[lane]; + double sy = y[lane]; + double sz = pow (sx, sy); + z[lane] = sz; + } + return z; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_powf.c b/contrib/arm-optimized-routines/math/aarch64/v_powf.c new file mode 100644 index 000000000000..3a4163ab0558 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_powf.c @@ -0,0 +1,148 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A data.log2_poly +#define C data.exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
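v_pow.c above is intentionally a thin per-lane wrapper over scalar pow, but it still exports the standard AArch64 vector-ABI symbol, so callers see it like any other two-argument lane routine. A usage sketch (hand-declared prototype for brevity; link against libmathlib):

    #include <stdio.h>
    #include <arm_neon.h>

    __attribute__ ((aarch64_vector_pcs)) float64x2_t
    _ZGVnN2vv_pow (float64x2_t, float64x2_t);

    int
    main (void)
    {
      float64x2_t x = { 2.0, 10.0 };
      float64x2_t y = { 0.5, -3.0 };
      float64x2_t z = _ZGVnN2vv_pow (x, y);
      printf ("%g %g\n", z[0], z[1]);   /* 1.41421... and 0.001 */
      return 0;
    }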
*/ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + double log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + double exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, + -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
*/ + 0x1.c6af84b912394p-5 / Scale / Scale / Scale, + 0x1.ebfce50fac4f3p-3 / Scale / Scale, + 0x1.62e42ff0c52d6p-1 / Scale}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + Log2IdxMask); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + uint32x4_t iz = vsubq_u32 (u, top); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + float32x4_t ret; + for (int lane = 0; lane < 4; lane++) + { + /* Use double precision for each lane. */ + double invc = data.log2_tab[i[lane]].invc; + double logc = data.log2_tab[i[lane]].logc; + double z = (double) asfloat (iz[lane]); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + double r = __builtin_fma (z, invc, -1.0); + double y0 = logc + (double) k[lane]; + + /* Polynomial to approximate log1p(r)/ln2. */ + double logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; + double ylogx = y[lane] * logx; + cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) + >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 + ? 1 + : cmp[lane]; + + /* N*x = k + r with r in [-1/2, 1/2]. */ + double kd = round (ylogx); + uint64_t ki = lround (ylogx); + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; + t += ki << (52 - V_EXP2F_TABLE_BITS); + double s = asdouble (t); + double p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + ret[lane] = p; + } + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, ret, cmp); + return ret; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sin.c b/contrib/arm-optimized-routines/math/aarch64/v_sin.c new file mode 100644 index 000000000000..04129c31133d --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_sin.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + + .range_val = V2 (0x1p23), + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. 
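The lane loop above follows the standard decomposition pow(x,y) = 2^(y*log2(x)), evaluating both stages in double precision so that the final rounding to float absorbs the intermediate error. A plain-C sketch of the same decomposition, with libm calls standing in for the table-plus-polynomial stages (illustrative only; it ignores negative x, zero, and the other inputs the cmp mask routes to the scalar fallback):

    #include <math.h>

    /* Illustrative: powf via the identity the vector kernel uses, valid
       for positive finite x.  The real kernel replaces log2/exp2 with
       the table lookups and polynomials shown above.  */
    static float
    powf_sketch (float x, float y)
    {
      double logx = log2 ((double) x);   /* table + polynomial in the kernel.  */
      double ylogx = (double) y * logx;
      return (float) exp2 (ylogx);       /* table + polynomial in the kernel.  */
    }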
*/ +#endif + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sin, x, y, cmp); +} + +/* Vector (AdvSIMD) sin approximation. + Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); +#else + r = x; + cmp = vcageq_f64 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vfmaq_f64 (d->shift, d->inv_pi, r); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sinf.c b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c new file mode 100644 index 000000000000..336879844459 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
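The reduction above is worth spelling out in scalar form: n = rint(x/pi) picks the nearest multiple of pi, subtracting n*pi in three pieces keeps the remainder accurate, and the parity of n supplies the sign, since sin(x) = (-1)^n sin(x - n*pi). A sketch using the same constants as the table above (sin_reduced is an illustrative name; the kernel evaluates its own polynomial rather than calling sin):

    #include <math.h>

    /* Illustrative scalar version of the reduction used above.  */
    static double
    sin_reduced (double x)
    {
      const double inv_pi = 0x1.45f306dc9c883p-2;
      const double pi_1 = 0x1.921fb54442d18p+1;
      const double pi_2 = 0x1.1a62633145c06p-53;
      const double pi_3 = 0x1.c1cd129024e09p-106;
      double n = rint (x * inv_pi);      /* nearest multiple of pi.  */
      double r = fma (-n, pi_1, x);      /* r = x - n*pi, subtracted in  */
      r = fma (-n, pi_2, r);             /* three steps so pi is carried  */
      r = fma (-n, pi_3, r);             /* to extended precision.  */
      double s = sin (r);                /* polynomial in the kernel.  */
      return ((long long) n & 1) ? -s : s;
    }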
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); +#else + r = x; + cmp = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(|x|/pi) */ + n = vfmaq_f32 (d->shift, d->inv_pi, r); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r) */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/exp10.c b/contrib/arm-optimized-routines/math/exp10.c new file mode 100644 index 000000000000..0fbec4c694ca --- /dev/null +++ b/contrib/arm-optimized-routines/math/exp10.c @@ -0,0 +1,129 @@ +/* + * Double-precision 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ +#define UFlowBound -0x1.5ep+8 /* -350. */ +#define SmallTop 0x3c6 /* top12(0x1p-57). */ +#define BigTop 0x407 /* top12(0x1p8). */ +#define Thresh 0x41 /* BigTop - SmallTop. */ +#define Shift __exp_data.shift +#define C(i) __exp_data.exp10_poly[i] + +static double +special_case (uint64_t sbits, double_t tmp, uint64_t ki) +{ + double_t scale, y; + + if (ki - (1ull << 16) < 0x80000000) + { + /* The exponent of scale might have overflowed by 1. */ + sbits -= 1ull << 52; + scale = asdouble (sbits); + y = 2 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + + /* n < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t lo = scale - y + scale * tmp; + double_t hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + + return check_uflow (y); +} + +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. 
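Both sin kernels extract that sign without a branch or select: odd is taken from n while the 0x1.8p23 shift is still added, so the float's lowest mantissa bit holds the parity of n, and shifting it to bit 31 yields a mask that is simply XORed into the result. A scalar sketch of the bit trick (flip_if_odd is an illustrative name; n_shifted is n before the shift constant has been subtracted):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative: after n = x/pi + 0x1.8p23f, bit 0 of the float's
       representation is the parity of n, so shifting it to the sign
       position gives a mask that flips the result for odd n.  */
    static float
    flip_if_odd (float y, float n_shifted)
    {
      uint32_t nbits, ybits;
      memcpy (&nbits, &n_shifted, sizeof nbits);
      memcpy (&ybits, &y, sizeof ybits);
      ybits ^= nbits << 31;       /* sign flip iff n was odd.  */
      memcpy (&y, &ybits, sizeof y);
      return y;
    }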
*/ +double +exp10 (double x) +{ + uint64_t ix = asuint64 (x); + uint32_t abstop = (ix >> 52) & 0x7ff; + + if (unlikely (abstop - SmallTop >= Thresh)) + { + if (abstop - SmallTop >= 0x80000000) + /* Avoid spurious underflow for tiny x. + Note: 0 is common input. */ + return x + 1; + if (abstop == 0x7ff) + return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0; + if (x >= OFlowBound) + return __math_oflow (0); + if (x < UFlowBound) + return __math_uflow (0); + + /* Large x is special-cased below. */ + abstop = 0; + } + + /* Reduce x: z = x * N / log10(2), k = round(z). */ + double_t z = __exp_data.invlog10_2N * x; + double_t kd; + int64_t ki; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#else + kd = eval_as_double (z + Shift); + kd -= Shift; + ki = kd; +#endif + + /* r = x - k * log10(2), r in [-0.5, 0.5]. */ + double_t r = x; + r = __exp_data.neglog10_2hiN * kd + r; + r = __exp_data.neglog10_2loN * kd + r; + + /* exp10(x) = 2^(k/N) * 2^(r/N). + Approximate the two components separately. */ + + /* s = 2^(k/N), using lookup table. */ + uint64_t e = ki << (52 - EXP_TABLE_BITS); + uint64_t i = (ki & IndexMask) * 2; + uint64_t u = __exp_data.tab[i + 1]; + uint64_t sbits = u + e; + + double_t tail = asdouble (__exp_data.tab[i]); + + /* 2^(r/N) ~= 1 + r * Poly(r). */ + double_t r2 = r * r; + double_t p = C (0) + r * C (1); + double_t y = C (2) + r * C (3); + y = y + r2 * C (4); + y = p + r2 * y; + y = tail + y * r; + + if (unlikely (abstop == 0)) + return special_case (sbits, y, ki); + + /* Assemble components: + y = 2^(r/N) * 2^(k/N) + ~= (y + 1) * s. */ + double_t s = asdouble (sbits); + return eval_as_double (s * y + s); +} diff --git a/contrib/arm-optimized-routines/math/exp_data.c b/contrib/arm-optimized-routines/math/exp_data.c index 714c845709aa..c20b1b2d3e06 100644 --- a/contrib/arm-optimized-routines/math/exp_data.c +++ b/contrib/arm-optimized-routines/math/exp_data.c @@ -1,1120 +1,1141 @@ /* * Shared data between exp, exp2 and pow. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << EXP_TABLE_BITS) const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, +.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, .negln2loN = -0x1.cf79abc9e3b3ap-46, #elif N == 128 .negln2hiN = -0x1.62e42fefa0000p-8, .negln2loN = -0x1.cf79abc9e3b3ap-47, #elif N == 256 .negln2hiN = -0x1.62e42fefc0000p-9, .negln2loN = 0x1.c610ca86c3899p-45, #elif N == 512 .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif +.neglog10_2hiN = -0x1.3441350ap-2 / N, +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, #else .shift = 0x1.8p52, #endif // exp polynomial coefficients. 
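When TOINT_INTRINSICS is unavailable, the kd = z + Shift; kd -= Shift sequence above rounds without a conversion instruction: adding 0x1.8p52 forces the integer part of z into the low mantissa bits, so the subtraction leaves round(z) and the integer assignment extracts it exactly. A standalone sketch (assumes round-to-nearest mode and |z| well below 2^51; the volatile stands in for the library's eval_as_double barrier, which keeps the compiler from folding the add/subtract away):

    #include <stdint.h>

    /* Illustrative shift-based round-to-nearest integer.  */
    static int64_t
    round_to_int (double z, double *kd)
    {
      volatile double shift = 0x1.8p52;
      double t = z + shift;   /* integer part lands in low mantissa bits.  */
      t -= shift;             /* t == (double) round(z).  */
      *kd = t;
      return (int64_t) t;
    }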
.poly = { #if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE // abs error: 1.5543*2^-60 // ulp error: 0.529 (0.533 without fma) // if |x| < ln2/128+eps // abs error if |x| < ln2/64: 1.7157*2^-50 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, 0x1.1111266d28935p-7, #elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE // abs error: 1.6735*2^-64 // ulp error: 0.518 (0.522 without fma) // if |x| < ln2/64 0x1.5555555548f9ap-3, 0x1.555555554bf5dp-5, 0x1.11115b75f0f4dp-7, 0x1.6c171a6b6303ep-10, #elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE // abs error: 1.555*2^-66 // ulp error: 0.509 (0.511 without fma) // if |x| < ln2/256+eps // abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 // abs error if |x| < ln2/128: 1.7145*2^-56 0x1.ffffffffffdbdp-2, 0x1.555555555543cp-3, 0x1.55555cf172b91p-5, 0x1.1111167a4d017p-7, #elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE // abs error: 1.5542*2^-60 // ulp error: 0.521 (0.523 without fma) // if |x| < ln2/128 0x1.fffffffffdbcep-2, 0x1.55555555543c2p-3, 0x1.555573c64f2e3p-5, 0x1.111126b4eff73p-7, #elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE // abs error: 1.6861*2^-71 // ulp error: 0.509 (0.511 without fma) // if |x| < ln2/128 0x1.55555555548fdp-3, 0x1.555555555658fp-5, 0x1.111123a859bb6p-7, 0x1.6c16ba6920cabp-10, #elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE // abs error: 1.43*2^-58 // ulp error: 0.549 (0.550 without fma) // if |x| < ln2/512 0x1p0, // unused 0x1.fffffffffffd4p-2, 0x1.5555571d6ef9p-3, 0x1.5555576a5adcep-5, #elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE // abs error: 1.5547*2^-66 // ulp error: 0.505 (0.506 without fma) // if |x| < ln2/256 0x1.ffffffffffdbdp-2, 0x1.555555555543cp-3, 0x1.55555cf16e1edp-5, 0x1.1111167a4b553p-7, #elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE // abs error: 1.4300*2^-63 // ulp error: 0.504 // if |x| < ln2/1024 // abs error if |x| < ln2/512: 1.0689*2^-55 0x1p0, // unused 0x1.ffffffffffffdp-2, 0x1.555555c75bb6p-3, 0x1.555555dec04a8p-5, #endif }, .exp2_shift = 0x1.8p52 / N, // exp2 polynomial coefficients. .exp2_poly = { #if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE // abs error: 1.3054*2^-63 // ulp error: 0.515 // if |x| < 1/64 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c58fp-3, 0x1.c6b08d7045cf1p-5, 0x1.3b2ab6fb8fd0ep-7, 0x1.5d884afec48d7p-10, 0x1.43097dc684ae1p-13, #elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE // abs error: 1.2195*2^-65 // ulp error: 0.507 (0.511 without fma) // if |x| < 1/256 // abs error if |x| < 1/128: 1.9941*2^-56 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c424p-3, 0x1.c6b08d70cf4b5p-5, 0x1.3b2abd24650ccp-7, 0x1.5d7e09b4e3a84p-10, #elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE // abs error: 1.2195*2^-65 // ulp error: 0.504 (0.508 without fma) // if |x| < 1/256 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c424p-3, 0x1.c6b08d70cf4b5p-5, 0x1.3b2abd24650ccp-7, 0x1.5d7e09b4e3a84p-10, #elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE // abs error: 1.4411*2^-64 // ulp error: 0.5024 (0.5063 without fma) // if |x| < 1/1024 // abs error if |x| < 1/512: 1.9430*2^-56 0x1.62e42fefa39ecp-1, 0x1.ebfbdff82c58bp-3, 0x1.c6b08e46de41fp-5, 0x1.3b2ab786ee1dap-7, #endif }, +.exp10_poly = { +#if EXP10_POLY_WIDE +/* Range is wider if using shift-based reduction: coeffs generated + using Remez in [-log10(2)/128, log10(2)/128 ]. */ +0x1.26bb1bbb55515p1, +0x1.53524c73cd32bp1, +0x1.0470591e1a108p1, +0x1.2bd77b12fe9a8p0, +0x1.14289fef24b78p-1 +#else +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. 
*/ +0x1.26bb1bbb55516p1, +0x1.53524c73ce9fep1, +0x1.0470591ce4b26p1, +0x1.2bd76577fe684p0, +0x1.1446eeccd0efbp-1 +#endif +}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N .tab = { #if N == 64 0x0, 0x3ff0000000000000, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0xbc93cedd78565858, 0x3feea23882552225, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0xbc87c50422622263, 0x3feecc667b5de565, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, #elif N == 128 0x0, 0x3ff0000000000000, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c8b898c3f1353bf, 
0x3fef635beb6fcb75, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc93cedd78565858, 0x3feea23882552225, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 
0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0xbc82919e2040220f, 0x3fef60e316c98398, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, #elif N == 256 0x0, 0x3ff0000000000000, 0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0xbc82985dd8521d32, 0x3feff168143b0281, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c845fad437fa426, 0x3fefde5f72f654b1, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0xbc954529642b232f, 0x3fefd50a0e3c1f89, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0x3c8293708ef5c32e, 0x3fefcbd42b72a836, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c9407fb30d06420, 0x3fefb0f145e46c85, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0x3c8b8268b04ef0a5, 0x3fef973028d7233e, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc65704e90c9f860, 0x3fef86a814f204ab, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc897cea57e46280, 0x3fef7e95934f312e, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c6e653b2459034b, 0x3fef6ed48695bbc0, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c92cc7ea345b7dc, 0x3fef672658375d2f, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c8cdc1873af2155, 0x3fef582f95281c6b, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc9493684653a131, 0x3fef50e75eb44027, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0xbc98e2899077520a, 0x3fef49c18438ce4d, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c9120fcd4f59273, 0x3fef42be3578a819, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c89b788c188c9b8, 0x3fef3bdda27912d1, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c877afbca90ef84, 0x3fef351ffb82140a, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c91512f082876ee, 0x3fef2e85711ece75, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0xbc803297e78260bf, 0x3fef21ba7591bb70, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0xbc91e75c40b4251e, 0x3fef157e39771b2f, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c98a911f1f7785a, 0x3fef0f961f641589, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c85425c11faadf4, 0x3fef0432edeeb2fd, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0xbc979517a03e2847, 
0x3feefeb83ba8ea32, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc800e2a46da4bee, 0x3feef96266e3fa2d, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0xbc87430803972b34, 0x3feef431a2de883b, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc954de30ae02d94, 0x3feeef26231e754a, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0xbc94f2487e1c03ec, 0x3feeea401b7140ef, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c79c3bba5562a2f, 0x3feee0e544ede173, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc85a71612e21658, 0x3feedc70df1c5175, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0x3c86421f6f1d24d6, 0x3feed822c367a024, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc9348a6815fce65, 0x3feed3fb2709468a, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0x3c835c43984d9871, 0x3feecffa3f84b9d4, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0xbc632afc8d9473a0, 0x3feecc2042a7d232, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0xbc95fc5e44de020e, 0x3feec86d668b3237, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0xbc7ea0148327c42f, 0x3feec4e1e192aed2, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0xbc7a843ad1a88022, 0x3feec17dea6db7d7, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c892ca3bf144e63, 0x3feebe41b817c114, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0x3c73e34f67e67118, 0x3feeb8417f4531ee, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0xbc896be8ae89ef8f, 0x3feeb070dde910d2, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0xbc943a3540d1898a, 0x3feeaa11fba87a03, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0xbc951f58ddaa8090, 0x3feea84590998b93, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc82e1648e50a17c, 0x3feea6a320dceb71, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0x3c917ecda8a72159, 0x3feea3dd1d1929fd, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, 0xbc93cedd78565858, 0x3feea23882552225, 0xbc85c33fdf910406, 0x3feea1c1c70833f6, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0x3c81079ab5789604, 0x3feea0f4b19e9538, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0x3c727df161cd7778, 0x3feea052fa75173e, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0x3c93f9924a05b767, 0x3fee9fdcddd47645, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc87557939a8b5ef, 0x3fee9f9298593ae5, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc88e67a9006c909, 0x3fee9f8286ead08a, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0x3c87d51410fd15c2, 0x3feea26a62ff86f0, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc760a3629969871, 0x3feea3878491c491, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, 
0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c7c88549b958471, 0x3feea9cad931a436, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c931143962f7877, 0x3feeabd0a478580f, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c93e9e96f112479, 0x3feeae05bad61778, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0xbc51669428996971, 0x3feebbdd9a7670b3, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0xbc8a1e58414c07d3, 0x3feec674194bb8d5, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0xbc79740b58a20091, 0x3feeca5e8d07f29e, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0xbc903d5cbe27874b, 0x3feed2c980460ad8, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c5986178980fce0, 0x3feed74a8af46052, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0xbc89472975b1f2a5, 0x3feedbfe53c12e59, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c7d8157a34b7e7f, 0x3feee0e521356eba, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc7274aedac8ff80, 0x3feef68415b749b1, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0x3c85c620ce76df06, 0x3feefc6e29f1c52a, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc8fda52e1b51e41, 0x3fef028cf22749e4, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc302899507554e5, 0x3fef0f69c3f3a207, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0xbc80dda2d4c0010c, 0x3fef16286141b33d, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0x3c836909391181d3, 0x3fef244778fafb22, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0xbc811cd7dbdf9547, 0x3fef2ba88988c933, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0xbc7ac28b7bef6621, 0x3fef33405751c4db, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0xbc8cc734592af7fc, 0x3fef43155b5bab74, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c87752a44f587e8, 0x3fef4b532b08c968, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, 0xbc82919e2040220f, 0x3fef60e316c98398, 0x3c8c254d16117a68, 0x3fef655d71ff6075, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc776caa4c2ff1cf, 0x3fef953924676d76, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc81d5fc525d9940, 0x3fef9f7977cdb740, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 0x3c8ec3bc41aa2008, 
0x3fefba1bee615a27, 0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0xbc69fa74878ba7c7, 0x3fefcac948dd7274, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c901f3a75ee0efe, 0x3fefd632798844f8, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0xbc699c7db2effc76, 0x3fefedba3692d514, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, 0x3c64b458677f9840, 0x3feff9d96b2a23d9, #elif N == 512 0x0, 0x3ff0000000000000, 0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a, 0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, 0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0x3c75c18e5ae0563a, 0x3feff3d1e77170b4, 0xbc82985dd8521d32, 0x3feff168143b0281, 0xbc705b1125cf49a5, 0x3fefef003103b10e, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c9f879abbff3f87, 0x3fefea363d42b027, 0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, 0x3c9b14003824712a, 0x3fefe57411915a8a, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c61cbf0f38af658, 0x3fefe0b9b35659d8, 0x3c845fad437fa426, 0x3fefde5f72f654b1, 0xbc9a3316383dcbc5, 0x3fefdc0727fc1762, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2, 0xbc954529642b232f, 0x3fefd50a0e3c1f89, 0xbc89b3236d111646, 0x3fefd2b99fa6407c, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0xbc8cb191be99b1b0, 0x3fefce1ead925493, 0x3c8293708ef5c32e, 0x3fefcbd42b72a836, 0xbc9acb71e83765b7, 0x3fefc98ba42e7d30, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c5cd3e58b03697e, 0x3fefc50088f8093f, 0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, 0xbc8bfb07d4755452, 0x3fefc07d61701716, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715, 0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, 0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0xbc85b9eb0402507b, 0x3fefb323d833d93f, 0x3c9407fb30d06420, 0x3fefb0f145e46c85, 0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba, 0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, 0x3c9c8be44bf4cde8, 0x3fefa612a7b26300, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0x3c820c5444c93c44, 0x3fefa1c7c55189c6, 0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, 0xbc84c6baeb580d7a, 0x3fef9d8503328e6d, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0x3c8657aa1b0d9f83, 0x3fef994a66f951ce, 0x3c8b8268b04ef0a5, 0x3fef973028d7233e, 0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc6b0b2789925e90, 0x3fef90edb6db2dc1, 0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, 0xbc93aad17d197fae, 0x3fef8ccbae51a5c8, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc989c464a07ad70, 0x3fef88b1e264a0e9, 0xbc65704e90c9f860, 0x3fef86a814f204ab, 0xbc72c338fce197f4, 0x3fef84a058cbae1e, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc6dca724cea0eb6, 0x3fef809717425438, 0xbc897cea57e46280, 0x3fef7e95934f312e, 0x3c464770b955d34d, 0x3fef7c962388149e, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0xbc962811c114424f, 0x3fef789d83606e12, 0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, 0x3c8ec58e74904dd4, 0x3fef74ad3c92df73, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89, 0x3c6e653b2459034b, 0x3fef6ed48695bbc0, 0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c8bda920de0f6e2, 0x3fef690eba4df41f, 0x3c92cc7ea345b7dc, 0x3fef672658375d2f, 0xbc9a597f9a5ff71c, 0x3fef654013041dc2, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c50835b125aa573, 0x3fef6179e2363cf8, 0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, 0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 
0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f, 0x3c8cdc1873af2155, 0x3fef582f95281c6b, 0xbc6817fd6a313e3e, 0x3fef565a51860746, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc96236af85fd26a, 0x3fef52b6358e15e8, 0xbc9493684653a131, 0x3fef50e75eb44027, 0x3c7795eb4523abe7, 0x3fef4f1aad999e82, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0x3c8fe58b91b40095, 0x3fef4b87bf9cda38, 0xbc98e2899077520a, 0x3fef49c18438ce4d, 0x3c91ecaa860c614a, 0x3fef47fd7190241e, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18, 0x3c9120fcd4f59273, 0x3fef42be3578a819, 0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c87f1c7350e256d, 0x3fef3d9282fc1f27, 0x3c89b788c188c9b8, 0x3fef3bdda27912d1, 0x3c420dac6c124f4f, 0x3fef3a2af0b63bff, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0xbc99501d09bc09fd, 0x3fef36cc1c78903a, 0x3c877afbca90ef84, 0x3fef351ffb82140a, 0x3c73baf864dc8675, 0x3fef33760c547f15, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff, 0x3c91512f082876ee, 0x3fef2e85711ece75, 0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0xbc7548165d85ed32, 0x3fef29a8b16f0a30, 0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, 0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0xbc93a255f697ecfe, 0x3fef234c0ea83f36, 0xbc803297e78260bf, 0x3fef21ba7591bb70, 0x3c8d2d19edc1e550, 0x3fef202b17779965, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc76b2173113dd8c, 0x3fef1d130f50d65c, 0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, 0x3c811aa5f853590b, 0x3fef1a03fc675d1f, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0x3c61d61a34c8aa02, 0x3fef16fde4f2e280, 0xbc91e75c40b4251e, 0x3fef157e39771b2f, 0xbc91f892bf6b286d, 0x3fef1400cf2f6c18, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c7590c65c20e680, 0x3fef110cc15d5346, 0x3c98a911f1f7785a, 0x3fef0f961f641589, 0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2, 0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 0x3c7b3bf786a54a87, 0x3fef08670653dfe4, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c74bb6c41732885, 0x3fef05975721b004, 0x3c85425c11faadf4, 0x3fef0432edeeb2fd, 0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0xbc5451d60c6ac9eb, 0x3fef001375752b40, 0xbc979517a03e2847, 0x3feefeb83ba8ea32, 0x3c8787a210ceafd9, 0x3feefd5f4fb45e20, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc888d1e4629943d, 0x3feefab46484ebb4, 0xbc800e2a46da4bee, 0x3feef96266e3fa2d, 0xbc93369c544088b6, 0x3feef812ba4ea77d, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0x3c85373ce4eb6dfb, 0x3feef57a577dd72b, 0xbc87430803972b34, 0x3feef431a2de883b, 0x3c83adec8265a67f, 0x3feef2eb428335b4, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc835388bcac6bc5, 0x3feef06581d3f669, 0xbc954de30ae02d94, 0x3feeef26231e754a, 0x3c727cdb4e4b6640, 0x3feeede91be9c811, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0x3c86c2696a26af35, 0x3feeeb761742d808, 0xbc94f2487e1c03ec, 0x3feeea401b7140ef, 0x3c888f6ff06b979a, 0x3feee90c7a61d55b, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea, 0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, 0xbc76b8867f91c9d6, 0x3feee4559212ef89, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c94c9c0b5157fe6, 0x3feee20853c10f28, 0x3c79c3bba5562a2f, 0x3feee0e544ede173, 0xbc62455345b51c8e, 0x3feedfc4976d27fa, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc93331de45477d0, 0x3feedd8a63b0a09b, 0xbc85a71612e21658, 0x3feedc70df1c5175, 0xbc95f84d39b39b16, 0x3feedb59bf29743f, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0xbc72ba4dc7c4d562, 
0x3feed932b07a35df, 0x3c86421f6f1d24d6, 0x3feed822c367a024, 0xbc844f25dc02691f, 0x3feed7153e4a136a, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc888d328eb9b501, 0x3feed5016f44d8f5, 0xbc9348a6815fce65, 0x3feed3fb2709468a, 0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0xbc615f0a2b9cd452, 0x3feed0f6d5817663, 0x3c835c43984d9871, 0x3feecffa3f84b9d4, 0xbc8c2e465a919e1d, 0x3feecf0018321a1a, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0xbc865dfd02bd08f1, 0x3feecd1318eb43ec, 0xbc632afc8d9473a0, 0x3feecc2042a7d232, 0xbc8e68cec89b1762, 0x3feecb2fde7006f4, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0xbc48ae858eb682ca, 0x3feec9566f8827d0, 0xbc95fc5e44de020e, 0x3feec86d668b3237, 0x3c5dd71277c0915f, 0x3feec786d3001fe5, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c92001325ecd7fb, 0x3feec5c10fa920a1, 0xbc7ea0148327c42f, 0x3feec4e1e192aed2, 0x3c65ace6e2870332, 0x3feec4052c5916c4, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0xbc9595c55690ffaf, 0x3feec2532feaada6, 0xbc7a843ad1a88022, 0x3feec17dea6db7d7, 0xbc8b401ba9fb5199, 0x3feec0ab213d5283, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c6df82bf324cc57, 0x3feebf0d073537ca, 0x3c892ca3bf144e63, 0x3feebe41b817c114, 0x3c97cae38641c7bb, 0x3feebd78e8bb586b, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a, 0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, 0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc80b582d74a55d9, 0x3feeb8f8b804f127, 0x3c73e34f67e67118, 0x3feeb8417f4531ee, 0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc592dca38593e20, 0x3feeb62b00da3b14, 0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, 0xbc85daca9994833e, 0x3feeb4d359dfd53d, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0xbc980b4321bc6dae, 0x3feeb385df598d78, 0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, 0xbc8390afec5241c5, 0x3feeb24298571b06, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf, 0xbc896be8ae89ef8f, 0x3feeb070dde910d2, 0xbc910aa91ae9b67f, 0x3feeafdac1351819, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c957e1b67462375, 0x3feeaeb63f4d854c, 0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, 0x3c8124d5051552a7, 0x3feead9c0d59ca07, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0xbc3ca103952ecf1f, 0x3feeac8c32824135, 0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, 0x3c773345c02a4fd6, 0x3feeab86b5f43d92, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e, 0xbc943a3540d1898a, 0x3feeaa11fba87a03, 0xbc924f2cb4f81746, 0x3feea99af482fc8f, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0xbc943592a0a9846b, 0x3feea8b4be135acc, 0xbc951f58ddaa8090, 0x3feea84590998b93, 0xbc956bc85d444f4f, 0x3feea7d902d47c65, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0x3c914d1e4218319f, 0x3feea707ca0cbf0f, 0xbc82e1648e50a17c, 0x3feea6a320dceb71, 0x3c971c93709313f4, 0x3feea6411b078d26, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0x3c7f88303b60d222, 0x3feea584fd15612a, 0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, 0x3c70125ca18d4b5b, 0x3feea4d3778bc944, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0x3c9592ea73798b11, 0x3feea42c91c56acd, 0x3c917ecda8a72159, 0x3feea3dd1d1929fd, 0xbc9371d6d7d75739, 0x3feea390532205d8, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc8ac05fd996f807, 0x3feea2fec30678b7, 0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, 0xbc91f5067d03653a, 0x3feea277e8dcc390, 0xbc93cedd78565858, 0x3feea23882552225, 0x3c917339c86ce3ad, 0x3feea1fbcc140be7, 0xbc85c33fdf910406, 0x3feea1c1c70833f6, 0xbc77e66065ba2500, 0x3feea18a7420a036, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0x3c964c827ee6b49a, 0x3feea123e87bfb7a, 
0x3c81079ab5789604, 0x3feea0f4b19e9538, 0xbc928311a3c73480, 0x3feea0c830a4c8d4, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0x3c882c79e185e981, 0x3feea077541ee718, 0x3c727df161cd7778, 0x3feea052fa75173e, 0xbc8b48cea80b043b, 0x3feea0315a736c75, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09, 0x3c93f9924a05b767, 0x3fee9fdcddd47645, 0x3c954835dd4b7548, 0x3fee9fc62dea2f8a, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8, 0xbc87557939a8b5ef, 0x3fee9f9298593ae5, 0xbc8f652fde52775c, 0x3fee9f86e7ba9fef, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0xbc7b0300defbcf98, 0x3fee9f77ce1303f6, 0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, 0xbc89dab646035dc0, 0x3fee9f73c4eaa988, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc91f0c230588dde, 0x3fee9f7ad3ef9011, 0xbc88e67a9006c909, 0x3fee9f8286ead08a, 0x3c9106450507a28c, 0x3fee9f8d02d50b8f, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0xbc9129729a10f3a0, 0x3fee9faa5953c849, 0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, 0x3c781a70a5124f67, 0x3fee9fd2df29ce7c, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c941626ea62646d, 0x3feea0069c1a861d, 0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, 0xbc940b9f54365b7c, 0x3feea04597eeba8f, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0x3c873455e0e826c1, 0x3feea08fda749e5d, 0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, 0x3c94f006ad874e3e, 0x3feea0e56b7fcf03, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0xbc8f6d693d0973bb, 0x3feea14652e958aa, 0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, 0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0xbc88b25e045d207b, 0x3feea22a4456e7a3, 0x3c87d51410fd15c2, 0x3feea26a62ff86f0, 0xbc69cb3314060ca7, 0x3feea2ad5e2850ac, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9, 0xbc760a3629969871, 0x3feea3878491c491, 0x3c94aa7212bfa73c, 0x3feea3d5fbab091f, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0xbc81e688272a8a12, 0x3feea47b8f4abaa9, 0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, 0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0x3c8a1e274eed4476, 0x3feea5e968443d9a, 0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, 0x3c94a533a59324da, 0x3feea6b1bdadb46d, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0x3c7a56d2760d087d, 0x3feea785b91e07f1, 0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, 0x3c91682c1c6e8b05, 0x3feea86562ab00ec, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c89ea99cf7a9591, 0x3feea950c27004c2, 0x3c7c88549b958471, 0x3feea9cad931a436, 0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c909b176e05a9cd, 0x3feeab4ac52be8f7, 0x3c931143962f7877, 0x3feeabd0a478580f, 0x3c711607f1952c95, 0x3feeac597875c644, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c869608f0f86431, 0x3feead74029db01e, 0x3c93e9e96f112479, 0x3feeae05bad61778, 0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6, 0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, 0x3c81c1701c359530, 0x3feeb10afc931857, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc8edb1bf6809287, 0x3feeb2553499284b, 0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, 0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc93fc025e1db9ce, 0x3feeb50dad829e70, 0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, 0xbc8d737c7d71382e, 0x3feeb67bff148396, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0x3c6ae88c43905293, 0x3feeb7f669e2802b, 0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, 0xbc93d1f7661fe51b, 0x3feeb97cf65253d1, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0x3c651b68797ffc1c, 0x3feebb0faccf9243, 0xbc51669428996971, 
0x3feebbdd9a7670b3, 0x3c54579c5ceed70b, 0x3feebcae95cba768, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c87298413381667, 0x3feebe59b9bddb5b, 0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, 0xbc905000be64e965, 0x3feec01121235681, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0xbc89fb12e3454b73, 0x3feec1d4d47f2598, 0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, 0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0x3c90622b15810eea, 0x3feec581414380f2, 0xbc8a1e58414c07d3, 0x3feec674194bb8d5, 0x3be9a5ecc875d327, 0x3feec76a0bcfc15e, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0x3c88ea486a3350ef, 0x3feec95f4499c647, 0xbc79740b58a20091, 0x3feeca5e8d07f29e, 0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c89c31f7e38028b, 0x3feecd6f23701b15, 0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, 0xbc5fac13f4e005a3, 0x3feecf89dacfe68c, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0x3c7d8aced7162e89, 0x3feed1b1231475f7, 0xbc903d5cbe27874b, 0x3feed2c980460ad8, 0xbc848f50cea7269f, 0x3feed3e504f696b1, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c821eb9a08a0542, 0x3feed625893523d4, 0x3c5986178980fce0, 0x3feed74a8af46052, 0xbc6133a953131cfd, 0x3feed872b8950a73, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca, 0xbc89472975b1f2a5, 0x3feedbfe53c12e59, 0xbc90260cf07cb311, 0x3feedd333beb0b7e, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c1bca400a7b939d, 0x3feedfa6a1897fd2, 0x3c7d8157a34b7e7f, 0x3feee0e521356eba, 0x3c9140bc34dfc19f, 0x3feee226d59a09ee, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0xbc8c9b1da461ab87, 0x3feee4b3e100301e, 0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, 0x3c8c115f23ebea8e, 0x3feee74dcca5a413, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc6dcab99f23f84e, 0x3feee9f4a17a4735, 0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, 0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0x3c915b1397075f04, 0x3feeef692a8fa8cd, 0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, 0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc86a510f31e13e6, 0x3feef511c43bbd62, 0xbc7274aedac8ff80, 0x3feef68415b749b1, 0xbc92887ea88e7340, 0x3feef7f9ade433c6, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87, 0x3c85c620ce76df06, 0x3feefc6e29f1c52a, 0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc8d1bf10460dba0, 0x3fef01004b3a7804, 0xbc8fda52e1b51e41, 0x3fef028cf22749e4, 0x3c8e5d80813dddfc, 0x3fef041ce8e77680, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7, 0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, 0x3c7a77557fd62db3, 0x3fef0a7df9285775, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc651ba6128db749, 0x3fef0dc27e2cb5e5, 0xbc302899507554e5, 0x3fef0f69c3f3a207, 0xbc7c0ffefdc5e251, 0x3fef111462c95b60, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30, 0xbc80dda2d4c0010c, 0x3fef16286141b33d, 0x3c923759b8aca76d, 0x3fef17e06ff301f4, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0xbc895498a73dac7d, 0x3fef1b5aab23e61e, 0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, 0x3c851de924583108, 0x3fef1ee26b34e065, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0xbc8c5fe4051ba06c, 0x3fef2277b9881650, 0x3c836909391181d3, 0x3fef244778fafb22, 0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0xbc7af5c67c4e8235, 0x3fef29cb269e601f, 0xbc811cd7dbdf9547, 0x3fef2ba88988c933, 0xbc8304ef0045d575, 0x3fef2d89584661a1, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0x3c8725f94f910375, 0x3fef31553dfa8313, 0xbc7ac28b7bef6621, 0x3fef33405751c4db, 
0x3c7b53e99f9191e8, 0x3fef352ee13da7cb, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc810a79e6d7e2b8, 0x3fef39164b994d23, 0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, 0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0x3c549eeef9ec910c, 0x3fef410e9be12cb9, 0xbc8cc734592af7fc, 0x3fef43155b5bab74, 0xbc8335827ffb9dce, 0x3fef451f95018d17, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c645563980ef762, 0x3fef493e7ba2c38c, 0x3c87752a44f587e8, 0x3fef4b532b08c968, 0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0xbc8aab80ceab2b4a, 0x3fef51a638197a3c, 0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, 0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c83c119f18464c5, 0x3fef5a461eec14be, 0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, 0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b, 0xbc82919e2040220f, 0x3fef60e316c98398, 0xbc72550d76be719a, 0x3fef631e7e2d479d, 0x3c8c254d16117a68, 0x3fef655d71ff6075, 0xbc82090274667d12, 0x3fef679ff37adb4a, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd, 0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, 0x3c890de9296f4cd1, 0x3fef70cd9ab294e4, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0x3c832ff9978b34bc, 0x3fef7579e065807d, 0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, 0xbc7303b63dda1980, 0x3fef7a347f63c159, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1, 0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, 0x3c768d9144ae12fc, 0x3fef83d4f11f8220, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0x3c853687f542403b, 0x3fef88bad7dcee90, 0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, 0xbc736ed2de40b407, 0x3fef8daf3fe592e8, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc614ef56c770f3b, 0x3fef92b2334ac7ee, 0xbc776caa4c2ff1cf, 0x3fef953924676d76, 0x3c8df7d1353d8e88, 0x3fef97c3bc24e350, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc850bed64091b8a, 0x3fef9ce3e4933c7e, 0xbc81d5fc525d9940, 0x3fef9f7977cdb740, 0x3c89d852381c317f, 0x3fefa212b6bc3181, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5, 0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, 0xbc5a1f25ce94cae7, 0x3fefac9c80faa594, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2, 0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 0x3c737e8ae802b851, 0x3fefb7616ca06dd6, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c875119560e34af, 0x3fefbcda28a52e59, 0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, 0xbc7431c3840929c6, 0x3fefc261cbdf5be7, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0xbc8cb472d2e86b99, 0x3fefc7f860a70c22, 0xbc69fa74878ba7c7, 0x3fefcac948dd7274, 0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c8eef18336b62e3, 0x3fefd35288633625, 0x3c901f3a75ee0efe, 0x3fefd632798844f8, 0x3c80d23f87b50a2a, 0x3fefd916302bd526, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0x3c8302dee657c8e6, 0x3fefdee8f32a4b45, 0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, 0xbc7b0caa080df170, 0x3fefe4cadbdac61d, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54, 0xbc699c7db2effc76, 0x3fefedba3692d514, 0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, 0x3c8e70b094fa075a, 0x3feff6cbe15f6314, 0x3c64b458677f9840, 0x3feff9d96b2a23d9, 0xbc72ec9a3e5d680a, 0x3feffceaca4391b6, #endif }, }; diff --git a/contrib/arm-optimized-routines/math/include/mathlib.h b/contrib/arm-optimized-routines/math/include/mathlib.h index c520c3772f7f..64cbb9c1f850 100644 --- a/contrib/arm-optimized-routines/math/include/mathlib.h +++ b/contrib/arm-optimized-routines/math/include/mathlib.h @@ -1,100 +1,59 @@ 
/* * Public API. * - * Copyright (c) 2015-2020, Arm Limited. + * Copyright (c) 2015-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H #define _MATHLIB_H float expf (float); float exp2f (float); float logf (float); float log2f (float); float powf (float, float); float sinf (float); float cosf (float); void sincosf (float, float*, float*); double exp (double); +double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); -/* Scalar functions using the vector algorithm with identical result. */ -float __s_sinf (float); -float __s_cosf (float); -float __s_expf (float); -float __s_expf_1u (float); -float __s_exp2f (float); -float __s_exp2f_1u (float); -float __s_logf (float); -float __s_powf (float, float); -double __s_sin (double); -double __s_cos (double); -double __s_exp (double); -double __s_log (double); -double __s_pow (double, double); - #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 +# elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif - -/* Vector functions following the base PCS. */ -__f32x4_t __v_sinf (__f32x4_t); -__f32x4_t __v_cosf (__f32x4_t); -__f32x4_t __v_expf (__f32x4_t); -__f32x4_t __v_expf_1u (__f32x4_t); -__f32x4_t __v_exp2f (__f32x4_t); -__f32x4_t __v_exp2f_1u (__f32x4_t); -__f32x4_t __v_logf (__f32x4_t); -__f32x4_t __v_powf (__f32x4_t, __f32x4_t); -__f64x2_t __v_sin (__f64x2_t); -__f64x2_t __v_cos (__f64x2_t); -__f64x2_t __v_exp (__f64x2_t); -__f64x2_t __v_log (__f64x2_t); -__f64x2_t __v_pow (__f64x2_t, __f64x2_t); +# else +# error Unsupported compiler +# endif -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) - -/* Vector functions following the vector PCS. */ -__vpcs __f32x4_t __vn_sinf (__f32x4_t); -__vpcs __f32x4_t __vn_cosf (__f32x4_t); -__vpcs __f32x4_t __vn_expf (__f32x4_t); -__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); -__vpcs __f32x4_t __vn_logf (__f32x4_t); -__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_sin (__f64x2_t); -__vpcs __f64x2_t __vn_cos (__f64x2_t); -__vpcs __f64x2_t __vn_exp (__f64x2_t); -__vpcs __f64x2_t __vn_log (__f64x2_t); -__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# undef __vpcs +# define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS using ABI names. 
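   The names follow the AArch64 vector function ABI mangling: after the
   _ZGV prefix, 'n' selects AdvSIMD, 'N' means unmasked, the digit gives
   the lane count, and each trailing 'v' marks one vector argument, so
   _ZGVnN4v_sinf is the 4-lane single-precision sinf and _ZGVnN2vv_pow
   the 2-lane, two-argument pow.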
*/ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -#endif +# endif #endif #endif diff --git a/contrib/arm-optimized-routines/math/math_config.h b/contrib/arm-optimized-routines/math/math_config.h index 7ffc0cd2796a..faf77b31fc99 100644 --- a/contrib/arm-optimized-routines/math/math_config.h +++ b/contrib/arm-optimized-routines/math/math_config.h @@ -1,462 +1,521 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2020, Arm Limited. + * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H #define _MATH_CONFIG_H #include #include #ifndef WANT_ROUNDING /* If defined to 1, return correct results for special cases in non-nearest rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). This may be set to 0 if there is no fenv support or if math functions only get called in round to nearest mode. */ # define WANT_ROUNDING 1 #endif #ifndef WANT_ERRNO /* If defined to 1, set errno in math functions according to ISO C. Many math libraries do not set errno, so this is 0 by default. It may need to be set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ # define WANT_ERRNO 0 #endif #ifndef WANT_ERRNO_UFLOW /* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */ # define WANT_ERRNO_UFLOW (WANT_ROUNDING && WANT_ERRNO) #endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND # if __aarch64__ # define HAVE_FAST_ROUND 1 # else # define HAVE_FAST_ROUND 0 # endif #endif /* Compiler can inline lround, but not (long)round(x). */ #ifndef HAVE_FAST_LROUND # if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ # define HAVE_FAST_LROUND 1 # else # define HAVE_FAST_LROUND 0 # endif #endif /* Compiler can inline fma as a single instruction. */ #ifndef HAVE_FAST_FMA # if defined FP_FAST_FMA || __aarch64__ # define HAVE_FAST_FMA 1 # else # define HAVE_FAST_FMA 0 # endif #endif /* Provide *_finite symbols and some of the glibc hidden symbols so libmathlib can be used with binaries compiled against glibc to interpose math functions with both static and dynamic linking. */ #ifndef USE_GLIBC_ABI # if __GNUC__ # define USE_GLIBC_ABI 1 # else # define USE_GLIBC_ABI 0 # endif #endif /* Optionally used extensions. 
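   These wrap GCC/Clang attributes and builtins; when __GNUC__ is not
   defined the fallbacks below reduce them to no-ops, e.g. likely(x)
   becomes plain (x).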
*/ #ifdef __GNUC__ # define HIDDEN __attribute__ ((__visibility__ ("hidden"))) # define NOINLINE __attribute__ ((noinline)) # define UNUSED __attribute__ ((unused)) # define likely(x) __builtin_expect (!!(x), 1) # define unlikely(x) __builtin_expect (x, 0) # if __GNUC__ >= 9 # define attribute_copy(f) __attribute__ ((copy (f))) # else # define attribute_copy(f) # endif # define strong_alias(f, a) \ extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); # define hidden_alias(f, a) \ extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ attribute_copy (f); #else # define HIDDEN # define NOINLINE # define UNUSED # define likely(x) (x) # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. */ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ +#define __math_oflowf arm_math_oflowf +#define __math_uflowf arm_math_uflowf +#define __math_may_uflowf arm_math_may_uflowf +#define __math_divzerof arm_math_divzerof +#define __math_oflow arm_math_oflow +#define __math_uflow arm_math_uflow +#define __math_may_uflow arm_math_may_uflow +#define __math_divzero arm_math_divzero +#define __math_invalidf arm_math_invalidf +#define __math_invalid arm_math_invalid +#define __math_check_oflow arm_math_check_oflow +#define __math_check_uflow arm_math_check_uflow +#define __math_check_oflowf arm_math_check_oflowf +#define __math_check_uflowf arm_math_check_uflowf + +#define __sincosf_table arm_math_sincosf_table +#define __inv_pio4 arm_math_inv_pio4 +#define __exp2f_data arm_math_exp2f_data +#define __logf_data arm_math_logf_data +#define __log2f_data arm_math_log2f_data +#define __powf_log2_data arm_math_powf_log2_data +#define __exp_data arm_math_exp_data +#define __log_data arm_math_log_data +#define __log2_data arm_math_log2_data +#define __pow_log_data arm_math_pow_log_data +#define __erff_data arm_math_erff_data +#define __erf_data arm_math_erf_data +#define __v_exp_data arm_math_v_exp_data +#define __v_log_data arm_math_v_log_data + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ # define TOINT_INTRINSICS 1 /* Round x to nearest int in all rounding modes, ties have to be rounded consistently with converttoint so the results match. If the result would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ static inline double_t roundtoint (double_t x) { return round (x); } /* Convert x to nearest int in all rounding modes, ties have to be rounded consistently with roundtoint. If the result is not representable in an int32_t then the semantics is unspecified.
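   For instance, round (2.5) and lround (2.5) both resolve the tie away
   from zero, to 3; pairing a ties-to-even conversion with round here
   would break callers that derive a table index and a scale factor from
   the same rounded value.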
*/ static inline int32_t converttoint (double_t x) { # if HAVE_FAST_LROUND return lround (x); # else return (long) round (x); # endif } #endif static inline uint32_t asuint (float f) { union { float f; uint32_t i; } u = {f}; return u.i; } static inline float asfloat (uint32_t i) { union { uint32_t i; float f; } u = {i}; return u.f; } static inline uint64_t asuint64 (double f) { union { double f; uint64_t i; } u = {f}; return u.i; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } #ifndef IEEE_754_2008_SNAN # define IEEE_754_2008_SNAN 1 #endif static inline int issignalingf_inline (float x) { uint32_t ix = asuint (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7fc00000) == 0x7fc00000; return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; } static inline int issignaling_inline (double x) { uint64_t ix = asuint64 (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } #if __aarch64__ && __GNUC__ /* Prevent the optimization of a floating-point expression. */ static inline float opt_barrier_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } static inline double opt_barrier_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } /* Force the evaluation of a floating-point expression for its side-effect. */ static inline void force_eval_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); } static inline void force_eval_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); } #else static inline float opt_barrier_float (float x) { volatile float y = x; return y; } static inline double opt_barrier_double (double x) { volatile double y = x; return y; } static inline void force_eval_float (float x) { volatile float y UNUSED = x; } static inline void force_eval_double (double x) { volatile double y UNUSED = x; } #endif /* Evaluate an expression as the specified type, normally a type cast should be enough, but compilers implement non-standard excess-precision handling, so when FLT_EVAL_METHOD != 0 then these functions may need to be customized. */ static inline float eval_as_float (float x) { return x; } static inline double eval_as_double (double x) { return x; } /* Error handling tail calls for special cases, with a sign argument. The sign of the return value is set if the argument is non-zero. */ /* The result overflows. */ HIDDEN float __math_oflowf (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN float __math_uflowf (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN float __math_may_uflowf (uint32_t); /* Division by zero. */ HIDDEN float __math_divzerof (uint32_t); /* The result overflows. */ HIDDEN double __math_oflow (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN double __math_uflow (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN double __math_may_uflow (uint32_t); /* Division by zero. */ HIDDEN double __math_divzero (uint32_t); /* Error handling using input checking. */ /* Invalid input unless it is a quiet NaN. */ HIDDEN float __math_invalidf (float); /* Invalid input unless it is a quiet NaN. */ HIDDEN double __math_invalid (double); /* Error handling using output checking, only for errno setting. */ /* Check if the result overflowed to infinity. */ HIDDEN double __math_check_oflow (double); /* Check if the result underflowed to 0. 
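   As with the overflow variant this only matters for errno setting: the
   check_* inline wrappers below skip the call unless WANT_ERRNO is set.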
*/ HIDDEN double __math_check_uflow (double); /* Check if the result overflowed to infinity. */ static inline double check_oflow (double x) { return WANT_ERRNO ? __math_check_oflow (x) : x; } /* Check if the result underflowed to 0. */ static inline double check_uflow (double x) { return WANT_ERRNO ? __math_check_uflow (x) : x; } /* Check if the result overflowed to infinity. */ HIDDEN float __math_check_oflowf (float); /* Check if the result underflowed to 0. */ HIDDEN float __math_check_uflowf (float); /* Check if the result overflowed to infinity. */ static inline float check_oflowf (float x) { return WANT_ERRNO ? __math_check_oflowf (x) : x; } /* Check if the result underflowed to 0. */ static inline float check_uflowf (float x) { return WANT_ERRNO ? __math_check_uflowf (x) : x; } /* Shared between expf, exp2f and powf. */ #define EXP2F_TABLE_BITS 5 #define EXP2F_POLY_ORDER 3 extern const struct exp2f_data { uint64_t tab[1 << EXP2F_TABLE_BITS]; double shift_scaled; double poly[EXP2F_POLY_ORDER]; double shift; double invln2_scaled; double poly_scaled[EXP2F_POLY_ORDER]; } __exp2f_data HIDDEN; #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 extern const struct logf_data { struct { double invc, logc; } tab[1 << LOGF_TABLE_BITS]; double ln2; double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ } __logf_data HIDDEN; #define LOG2F_TABLE_BITS 4 #define LOG2F_POLY_ORDER 4 extern const struct log2f_data { struct { double invc, logc; } tab[1 << LOG2F_TABLE_BITS]; double poly[LOG2F_POLY_ORDER]; } __log2f_data HIDDEN; #define POWF_LOG2_TABLE_BITS 4 #define POWF_LOG2_POLY_ORDER 5 #if TOINT_INTRINSICS # define POWF_SCALE_BITS EXP2F_TABLE_BITS #else # define POWF_SCALE_BITS 0 #endif #define POWF_SCALE ((double) (1 << POWF_SCALE_BITS)) extern const struct powf_log2_data { struct { double invc, logc; } tab[1 << POWF_LOG2_TABLE_BITS]; double poly[POWF_LOG2_POLY_ORDER]; } __powf_log2_data HIDDEN; #define EXP_TABLE_BITS 7 #define EXP_POLY_ORDER 5 /* Use polynomial that is optimized for a wider input range. This may be needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */ #define EXP_POLY_WIDE 0 /* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be needed for good precision in non-nearest rounding and !EXP_POLY_WIDE. */ #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 +/* Wider exp10 polynomial necessary for good precision in non-nearest rounding + and !TOINT_INTRINSICS. */ +#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; + double invlog10_2N; double shift; double negln2hiN; double negln2loN; + double neglog10_2hiN; + double neglog10_2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; + double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; #define LOG_TABLE_BITS 7 #define LOG_POLY_ORDER 6 #define LOG_POLY1_ORDER 12 extern const struct log_data { double ln2hi; double ln2lo; double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1.
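   (the linear term of log(1+r) has coefficient exactly 1, so it is not
   stored; the evaluation is expected to add r separately from the stored
   higher-order terms).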
*/ double poly1[LOG_POLY1_ORDER - 1]; struct {double invc, logc;} tab[1 << LOG_TABLE_BITS]; #if !HAVE_FAST_FMA struct {double chi, clo;} tab2[1 << LOG_TABLE_BITS]; #endif } __log_data HIDDEN; #define LOG2_TABLE_BITS 6 #define LOG2_POLY_ORDER 7 #define LOG2_POLY1_ORDER 11 extern const struct log2_data { double invln2hi; double invln2lo; double poly[LOG2_POLY_ORDER - 1]; double poly1[LOG2_POLY1_ORDER - 1]; struct {double invc, logc;} tab[1 << LOG2_TABLE_BITS]; #if !HAVE_FAST_FMA struct {double chi, clo;} tab2[1 << LOG2_TABLE_BITS]; #endif } __log2_data HIDDEN; #define POW_LOG_TABLE_BITS 7 #define POW_LOG_POLY_ORDER 8 extern const struct pow_log_data { double ln2hi; double ln2lo; double poly[POW_LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ /* Note: the pad field is unused, but allows slightly faster indexing. */ struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS]; } __pow_log_data HIDDEN; extern const struct erff_data { float erff_poly_A[6]; float erff_poly_B[7]; } __erff_data HIDDEN; #define ERF_POLY_A_ORDER 19 #define ERF_POLY_A_NCOEFFS 10 #define ERFC_POLY_C_NCOEFFS 16 #define ERFC_POLY_D_NCOEFFS 18 #define ERFC_POLY_E_NCOEFFS 14 #define ERFC_POLY_F_NCOEFFS 17 extern const struct erf_data { double erf_poly_A[ERF_POLY_A_NCOEFFS]; double erf_ratio_N_A[5]; double erf_ratio_D_A[5]; double erf_ratio_N_B[7]; double erf_ratio_D_B[6]; double erfc_poly_C[ERFC_POLY_C_NCOEFFS]; double erfc_poly_D[ERFC_POLY_D_NCOEFFS]; double erfc_poly_E[ERFC_POLY_E_NCOEFFS]; double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; + +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; + #endif diff --git a/contrib/arm-optimized-routines/math/s_cos.c b/contrib/arm-optimized-routines/math/s_cos.c deleted file mode 100644 index e66d563d15b5..000000000000 --- a/contrib/arm-optimized-routines/math/s_cos.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cos.c" diff --git a/contrib/arm-optimized-routines/math/s_cosf.c b/contrib/arm-optimized-routines/math/s_cosf.c deleted file mode 100644 index f615d260b39b..000000000000 --- a/contrib/arm-optimized-routines/math/s_cosf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cosf.c" diff --git a/contrib/arm-optimized-routines/math/s_exp.c b/contrib/arm-optimized-routines/math/s_exp.c deleted file mode 100644 index 5da0099e3c65..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f.c b/contrib/arm-optimized-routines/math/s_exp2f.c deleted file mode 100644 index dcbfea9e1e79..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp2f.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp2f.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f_1u.c b/contrib/arm-optimized-routines/math/s_exp2f_1u.c deleted file mode 100644 index bf387e44cfb2..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp2f_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp2f_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_expf.c b/contrib/arm-optimized-routines/math/s_expf.c deleted file mode 100644 index dacda7fb4fd5..000000000000 --- a/contrib/arm-optimized-routines/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/math/s_expf_1u.c b/contrib/arm-optimized-routines/math/s_expf_1u.c deleted file mode 100644 index 00096449f7a5..000000000000 --- a/contrib/arm-optimized-routines/math/s_expf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expf_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_log.c b/contrib/arm-optimized-routines/math/s_log.c deleted file mode 100644 index 27d2eb290f56..000000000000 --- a/contrib/arm-optimized-routines/math/s_log.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log.c" diff --git a/contrib/arm-optimized-routines/math/s_logf.c b/contrib/arm-optimized-routines/math/s_logf.c deleted file mode 100644 index 7d98b2ba15c4..000000000000 --- a/contrib/arm-optimized-routines/math/s_logf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_logf.c" diff --git a/contrib/arm-optimized-routines/math/s_pow.c b/contrib/arm-optimized-routines/math/s_pow.c deleted file mode 100644 index 6eca2b2b17f1..000000000000 --- a/contrib/arm-optimized-routines/math/s_pow.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_pow.c" diff --git a/contrib/arm-optimized-routines/math/s_powf.c b/contrib/arm-optimized-routines/math/s_powf.c deleted file mode 100644 index 1d55d90df7b2..000000000000 --- a/contrib/arm-optimized-routines/math/s_powf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_powf.c" diff --git a/contrib/arm-optimized-routines/math/s_sin.c b/contrib/arm-optimized-routines/math/s_sin.c deleted file mode 100644 index 0c6171259c0c..000000000000 --- a/contrib/arm-optimized-routines/math/s_sin.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sin.c" diff --git a/contrib/arm-optimized-routines/math/s_sinf.c b/contrib/arm-optimized-routines/math/s_sinf.c deleted file mode 100644 index 3aae61149618..000000000000 --- a/contrib/arm-optimized-routines/math/s_sinf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sinf.c" diff --git a/contrib/arm-optimized-routines/math/test/mathbench.c b/contrib/arm-optimized-routines/math/test/mathbench.c index 6e18e36fbcb2..ed7e89bb7710 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench.c +++ b/contrib/arm-optimized-routines/math/test/mathbench.c @@ -1,702 +1,642 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2022, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #undef _GNU_SOURCE #define _GNU_SOURCE 1 #include #include #include #include #include #include #include "mathlib.h" -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. */ #define N 8000 /* Iterations over the array. */ #define ITER 125 static double *Trace; static size_t trace_size; static double A[N]; static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#if __aarch64__ && WANT_VMATH -typedef __f64x2_t v_double; +#ifdef __vpcs +#include <arm_neon.h> +typedef float64x2_t v_double; #define v_double_len() 2 static inline v_double v_double_load (const double *p) { return (v_double){p[0], p[1]}; } static inline v_double v_double_dup (double x) { return (v_double){x, x}; } -typedef __f32x4_t v_float; +typedef float32x4_t v_float; #define v_float_len() 4 static inline v_float v_float_load (const float *p) { return (v_float){p[0], p[1], p[2], p[3]}; } static inline v_float v_float_dup (float x) { return (v_float){x, x, x, x}; } +#else +/* dummy definitions to make things compile. */ +typedef double v_double; +typedef float v_float; +#define v_double_len(x) 1 +#define v_double_load(x) (x)[0] +#define v_double_dup(x) (x) +#define v_float_len(x) 1 +#define v_float_load(x) (x)[0] +#define v_float_dup(x) (x) + +#endif + #if WANT_SVE_MATH #include <arm_sve.h> typedef svbool_t sv_bool; typedef svfloat64_t sv_double; #define sv_double_len() svcntd() static inline sv_double sv_double_load (const double *p) { svbool_t pg = svptrue_b64(); return svld1(pg, p); } static inline sv_double sv_double_dup (double x) { return svdup_n_f64(x); } typedef svfloat32_t sv_float; #define sv_float_len() svcntw() static inline sv_float sv_float_load (const float *p) { svbool_t pg = svptrue_b32(); return svld1(pg, p); } static inline sv_float sv_float_dup (float x) { return svdup_n_f32(x); } -#endif #else /* dummy definitions to make things compile.
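   With SVE math disabled only the vector-length macros are needed:
   bench1 below reads sv_double_len/sv_float_len unconditionally when
   computing vlen, while the SVE runners and table entries stay behind
   WANT_SVE_MATH.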
*/ -typedef double v_double; -typedef float v_float; -#define v_double_len(x) 1 -#define v_double_load(x) (x)[0] -#define v_double_dup(x) (x) -#define v_float_len(x) 1 -#define v_float_load(x) (x)[0] -#define v_float_dup(x) (x) +#define sv_double_len(x) 1 +#define sv_float_len(x) 1 #endif static double dummy (double x) { return x; } static float dummyf (float x) { return x; } -#if WANT_VMATH -#if __aarch64__ -static v_double -__v_dummy (v_double x) -{ - return x; -} - -static v_float -__v_dummyf (v_float x) -{ - return x; -} - #ifdef __vpcs __vpcs static v_double __vn_dummy (v_double x) { return x; } __vpcs static v_float __vn_dummyf (v_float x) { return x; } #endif #if WANT_SVE_MATH static sv_double __sv_dummy (sv_double x, sv_bool pg) { return x; } static sv_float __sv_dummyf (sv_float x, sv_bool pg) { return x; } -#endif -#endif #endif #include "test/mathbench_wrappers.h" static const struct fun { const char *name; int prec; int vec; double lo; double hi; union { double (*d) (double); float (*f) (float); - v_double (*vd) (v_double); - v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); #endif #if WANT_SVE_MATH sv_double (*svd) (sv_double, sv_bool); sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, -#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, -#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) F (dummyf, 1.0, 2.0) -#if WANT_VMATH -#if __aarch64__ -VD (__v_dummy, 1.0, 2.0) -VF (__v_dummyf, 1.0, 2.0) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) VNF (__vn_dummyf, 1.0, 2.0) #endif #if WANT_SVE_MATH SVD (__sv_dummy, 1.0, 2.0) SVF (__sv_dummyf, 1.0, 2.0) #endif -#endif -#endif #include "test/mathbench_funcs.h" {0}, #undef F #undef D -#undef VF -#undef VD #undef VNF #undef VND #undef SVF #undef SVD }; static void gen_linear (double lo, double hi) { for (int i = 0; i < N; i++) A[i] = (lo * (N - i) + hi * i) / N; } static void genf_linear (double lo, double hi) { for (int i = 0; i < N; i++) Af[i] = (float)(lo * (N - i) + hi * i) / N; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } static uint64_t seed = 0x0123456789abcdef; static double frand (double lo, double hi) { seed = 6364136223846793005ULL * seed + 1; return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); } static void gen_rand (double lo, double hi) { for (int i = 0; i < N; i++) A[i] = frand (lo, hi); } static void genf_rand (double lo, double hi) { for (int i = 0; i < N; i++) Af[i] = (float)frand (lo, hi); } static void gen_trace (int index) { for (int i = 0; i < N; i++) A[i] = Trace[index + i]; } static void genf_trace (int index) { for (int i = 0; i < N; i++) Af[i] = (float)Trace[index + i]; } static void run_thruput (double f (double)) { for (int i = 0; i < N; i++) f (A[i]); } static void runf_thruput (float f (float)) { for (int i = 0; i < N; i++) f (Af[i]); } volatile double zero = 0; static void run_latency (double f (double)) { double z = zero; double prev = z; for (int i = 0; i < N; i++) prev = f (A[i] + prev * z); } static void runf_latency (float f 
(float)) { float z = (float)zero; float prev = z; for (int i = 0; i < N; i++) prev = f (Af[i] + prev * z); } -static void -run_v_thruput (v_double f (v_double)) -{ - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); -} - -static void -runf_v_thruput (v_float f (v_float)) -{ - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); -} - -static void -run_v_latency (v_double f (v_double)) -{ - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); -} - -static void -runf_v_latency (v_float f (v_float)) -{ - v_float z = v_float_dup (zero); - v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); -} - #ifdef __vpcs static void run_vn_thruput (__vpcs v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void runf_vn_thruput (__vpcs v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void run_vn_latency (__vpcs v_double f (v_double)) { - v_double z = v_double_dup (zero); - v_double prev = z; + volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; + uint64x2_t sel = vsel; + v_double prev = v_double_dup (0); for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); } static void runf_vn_latency (__vpcs v_float f (v_float)) { - v_float z = v_float_dup (zero); - v_float prev = z; + volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t sel = vsel; + v_float prev = v_float_dup (0); for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); } #endif #if WANT_SVE_MATH static void run_sv_thruput (sv_double f (sv_double, sv_bool)) { for (int i = 0; i < N; i += sv_double_len ()) f (sv_double_load (A+i), svptrue_b64 ()); } static void runf_sv_thruput (sv_float f (sv_float, sv_bool)) { for (int i = 0; i < N; i += sv_float_len ()) f (sv_float_load (Af+i), svptrue_b32 ()); } static void run_sv_latency (sv_double f (sv_double, sv_bool)) { - sv_double z = sv_double_dup (zero); - sv_double prev = z; + volatile sv_bool vsel = svptrue_b64 (); + sv_bool sel = vsel; + sv_double prev = sv_double_dup (0); for (int i = 0; i < N; i += sv_double_len ()) - prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ()); + prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); } static void runf_sv_latency (sv_float f (sv_float, sv_bool)) { - sv_float z = sv_float_dup (zero); - sv_float prev = z; + volatile sv_bool vsel = svptrue_b32 (); + sv_bool sel = vsel; + sv_float prev = sv_float_dup (0); for (int i = 0; i < N; i += sv_float_len ()) - prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ()); + prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); } #endif static uint64_t tic (void) { struct timespec ts; if (clock_gettime (CLOCK_REALTIME, &ts)) abort (); return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } #define TIMEIT(run, f) do { \ dt = -1; \ run (f); /* Warm up. 
*/ \ for (int j = 0; j < measurecount; j++) \ { \ uint64_t t0 = tic (); \ for (int i = 0; i < itercount; i++) \ run (f); \ uint64_t t1 = tic (); \ if (t1 - t0 < dt) \ dt = t1 - t0; \ } \ } while (0) static void bench1 (const struct fun *f, int type, double lo, double hi) { uint64_t dt = 0; uint64_t ns100; const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec && f->prec == 'd') - vlen = v_double_len(); - else if (f->vec && f->prec == 'f') - vlen = v_float_len(); + if (f->vec == 'n') + vlen = f->prec == 'd' ? v_double_len() : v_float_len(); + else if (f->vec == 's') + vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); else if (f->prec == 'd' && type == 'l' && f->vec == 0) TIMEIT (run_latency, f->fun.d); else if (f->prec == 'f' && type == 't' && f->vec == 0) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - else if (f->prec == 'd' && type == 't' && f->vec == 'v') - TIMEIT (run_v_thruput, f->fun.vd); - else if (f->prec == 'd' && type == 'l' && f->vec == 'v') - TIMEIT (run_v_latency, f->fun.vd); - else if (f->prec == 'f' && type == 't' && f->vec == 'v') - TIMEIT (runf_v_thruput, f->fun.vf); - else if (f->prec == 'f' && type == 'l' && f->vec == 'v') - TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); else if (f->prec == 'd' && type == 'l' && f->vec == 'n') TIMEIT (run_vn_latency, f->fun.vnd); else if (f->prec == 'f' && type == 't' && f->vec == 'n') TIMEIT (runf_vn_thruput, f->fun.vnf); else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif #if WANT_SVE_MATH else if (f->prec == 'd' && type == 't' && f->vec == 's') TIMEIT (run_sv_thruput, f->fun.svd); else if (f->prec == 'd' && type == 'l' && f->vec == 's') TIMEIT (run_sv_latency, f->fun.svd); else if (f->prec == 'f' && type == 't' && f->vec == 's') TIMEIT (runf_sv_thruput, f->fun.svf); else if (f->prec == 'f' && type == 'l' && f->vec == 's') TIMEIT (runf_sv_latency, f->fun.svf); #endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } fflush (stdout); } static void bench (const struct fun *f, double lo, double hi, int type, int gen) { if (f->prec == 'd' && gen == 'r') gen_rand (lo, hi); else if (f->prec == 'd' && gen == 'l') gen_linear (lo, hi); else if (f->prec == 'd' && gen == 't') gen_trace (0); else if (f->prec == 'f' && gen == 'r') genf_rand (lo, hi); else if (f->prec == 'f' && gen == 'l') genf_linear (lo, hi); else if (f->prec == 'f' && gen == 't') genf_trace (0); if (gen == 't') hi = trace_size / N; if (type == 'b' || type == 't') bench1 (f, 't', lo, hi); if (type == 'b' || type == 'l') bench1 (f, 'l', lo, hi); for (int i = N; 
i < trace_size; i += N) { if (f->prec == 'd') gen_trace (i); else genf_trace (i); lo = i / N; if (type == 'b' || type == 't') bench1 (f, 't', lo, hi); if (type == 'b' || type == 'l') bench1 (f, 'l', lo, hi); } } static void readtrace (const char *name) { int n = 0; FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r"); if (!f) { printf ("opening \"%s\" failed: %m\n", name); exit (1); } for (;;) { if (n >= trace_size) { trace_size += N; Trace = realloc (Trace, trace_size * sizeof (Trace[0])); if (Trace == NULL) { printf ("out of memory\n"); exit (1); } } if (fscanf (f, "%lf", Trace + n) != 1) break; n++; } if (ferror (f) || n == 0) { printf ("reading \"%s\" failed: %m\n", name); exit (1); } fclose (f); if (n % N == 0) trace_size = n; for (int i = 0; n < trace_size; n++, i++) Trace[n] = Trace[i]; } static void usage (void) { printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " "[func2 ..]\n"); printf ("func:\n"); printf ("%7s [run all benchmarks]\n", "all"); for (const struct fun *f = funtab; f->name; f++) printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); exit (1); } int main (int argc, char *argv[]) { int usergen = 0, gen = 'r', type = 'b', all = 0; double lo = 0, hi = 0; const char *tracefile = "-"; argv++; argc--; for (;;) { if (argc <= 0) usage (); if (argv[0][0] != '-') break; else if (argc >= 3 && strcmp (argv[0], "-i") == 0) { usergen = 1; lo = strtod (argv[1], 0); hi = strtod (argv[2], 0); argv += 3; argc -= 3; } else if (argc >= 2 && strcmp (argv[0], "-m") == 0) { measurecount = strtol (argv[1], 0, 0); argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-c") == 0) { itercount = strtol (argv[1], 0, 0); argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-g") == 0) { gen = argv[1][0]; if (strchr ("rlt", gen) == 0) usage (); argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-f") == 0) { gen = 't'; /* -f implies -g trace. */ tracefile = argv[1]; argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-t") == 0) { type = argv[1][0]; if (strchr ("ltb", type) == 0) usage (); argv += 2; argc -= 2; } else usage (); } if (gen == 't') { readtrace (tracefile); lo = hi = 0; usergen = 1; } while (argc > 0) { int found = 0; all = strcmp (argv[0], "all") == 0; for (const struct fun *f = funtab; f->name; f++) if (all || strcmp (argv[0], f->name) == 0) { found = 1; if (!usergen) { lo = f->lo; hi = f->hi; } bench (f, lo, hi, type, gen); if (usergen && !all) break; } if (!found) printf ("unknown function: %s\n", argv[0]); argv++; argc--; } return 0; } diff --git a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h index ad6dd2a2313d..84c4e68650ac 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h +++ b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h @@ -1,100 +1,62 @@ /* * Function entries for mathbench. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ D (exp, -9.9, 9.9) D (exp, 0.5, 1.0) +D (exp10, -9.9, 9.9) D (exp2, -9.9, 9.9) D (log, 0.01, 11.1) D (log, 0.999, 1.001) D (log2, 0.01, 11.1) D (log2, 0.999, 1.001) {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, D (xpow, 0.01, 11.1) D (ypow, -9.9, 9.9) D (erf, -6.0, 6.0) F (expf, -9.9, 9.9) F (exp2f, -9.9, 9.9) F (logf, 0.01, 11.1) F (log2f, 0.01, 11.1) {"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, F (xpowf, 0.01, 11.1) F (ypowf, -9.9, 9.9) {"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, F (sinf, 0.1, 0.7) F (sinf, 0.8, 3.1) F (sinf, -3.1, 3.1) F (sinf, 3.3, 33.3) F (sinf, 100, 1000) F (sinf, 1e6, 1e32) F (cosf, 0.1, 0.7) F (cosf, 0.8, 3.1) F (cosf, -3.1, 3.1) F (cosf, 3.3, 33.3) F (cosf, 100, 1000) F (cosf, 1e6, 1e32) F (erff, -4.0, 4.0) -#if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) -#if __aarch64__ -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, -VF (__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs -VND (__vn_exp, -9.9, 9.9) VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, {"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) VND (_ZGVnN2v_cos, -3.1, 3.1) -VNF (__vn_expf, -9.9, 9.9) VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) +VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, {"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) VNF (_ZGVnN4v_cosf, -3.1, 3.1) #endif -#endif -#endif + /* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h index 8311f0f4e173..062b9db56de5 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h +++ b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h @@ -1,104 +1,66 @@ /* * Function wrappers for mathbench. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#if WANT_VMATH -#if __aarch64__ #ifdef __vpcs -__vpcs static v_float -xy__vn_powf (v_float x) -{ - return __vn_powf (x, x); -} __vpcs static v_float xy_Z_powf (v_float x) { return _ZGVnN4vv_powf (x, x); } -__vpcs static v_double -xy__vn_pow (v_double x) -{ - return __vn_pow (x, x); -} - __vpcs static v_double xy_Z_pow (v_double x) { return _ZGVnN2vv_pow (x, x); } -#endif // __vpcs - -static v_float -xy__v_powf (v_float x) -{ - return __v_powf (x, x); -} -static v_double -xy__v_pow (v_double x) -{ - return __v_pow (x, x); -} -#endif // __aarch64__ - -static float -xy__s_powf (float x) -{ - return __s_powf (x, x); -} - -static double -xy__s_pow (double x) -{ - return __s_pow (x, x); -} -#endif // WANT_VMATH +#endif static double xypow (double x) { return pow (x, x); } static float xypowf (float x) { return powf (x, x); } static double xpow (double x) { return pow (x, 23.4); } static float xpowf (float x) { return powf (x, 23.4f); } static double ypow (double x) { return pow (2.34, x); } static float ypowf (float x) { return powf (2.34f, x); } static float sincosf_wrap (float x) { float s, c; sincosf (x, &s, &c); return s + c; } diff --git a/contrib/arm-optimized-routines/math/test/mathtest.c b/contrib/arm-optimized-routines/math/test/mathtest.c index 3168da43b01d..834233fdde9d 100644 --- a/contrib/arm-optimized-routines/math/test/mathtest.c +++ b/contrib/arm-optimized-routines/math/test/mathtest.c @@ -1,1704 +1,1709 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2022, Arm Limited. + * Copyright (c) 1998-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include #include #include #include #include #include "mathlib.h" #ifndef math_errhandling # define math_errhandling 0 #endif #ifdef __cplusplus #define EXTERN_C extern "C" #else #define EXTERN_C extern #endif #ifndef TRUE #define TRUE 1 #endif #ifndef FALSE #define FALSE 0 #endif #ifdef IMPORT_SYMBOL #define STR2(x) #x #define STR(x) STR2(x) _Pragma(STR(import IMPORT_SYMBOL)) #endif int dmsd, dlsd; int quiet = 0; int doround = 0; unsigned statusmask = FE_ALL_EXCEPT; #define EXTRABITS (12) #define ULPUNIT (1<name, ((test_func*)b)->name); } int is_double_argtype(int argtype) { switch(argtype) { case at_d: case at_d2: case at_dc: case at_dc2: return 1; default: return 0; } } int is_single_argtype(int argtype) { switch(argtype) { case at_s: case at_s2: case at_sc: case at_sc2: return 1; default: return 0; } } int is_double_rettype(int rettype) { switch(rettype) { case rt_d: case rt_dc: case rt_d2: return 1; default: return 0; } } int is_single_rettype(int rettype) { switch(rettype) { case rt_s: case rt_sc: case rt_s2: return 1; default: return 0; } } int is_complex_argtype(int argtype) { switch(argtype) { case at_dc: case at_sc: case at_dc2: case at_sc2: return 1; default: return 0; } } int is_complex_rettype(int rettype) { switch(rettype) { case rt_dc: case rt_sc: return 1; default: return 0; } } /* * Special-case flags indicating that some functions' error * tolerance handling is more complicated than a fixed relative * error bound. 
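 * These flags are OR-ed into the tolerance word next to the ulp bound;
 * e.g. the lgamma entries below use 16*ULPUNIT | ABSLOWERBOUND.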
*/ #define ABSLOWERBOUND 0x4000000000000000LL #define PLUSMINUSPIO2 0x1000000000000000LL #define ARM_PREFIX(x) x #define TFUNC(arg,ret,name,tolerance) { t_func, arg, ret, (void*)&name, m_none, tolerance, #name } #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } #ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } #endif test_func tfuncs[] = { /* trigonometric */ TFUNC(at_d,rt_d, acos, 4*ULPUNIT), TFUNC(at_d,rt_d, asin, 4*ULPUNIT), TFUNC(at_d,rt_d, atan, 4*ULPUNIT), TFUNC(at_d2,rt_d, atan2, 4*ULPUNIT), TFUNC(at_d,rt_d, tan, 2*ULPUNIT), TFUNC(at_d,rt_d, sin, 2*ULPUNIT), TFUNC(at_d,rt_d, cos, 2*ULPUNIT), TFUNC(at_s,rt_s, acosf, 4*ULPUNIT), TFUNC(at_s,rt_s, asinf, 4*ULPUNIT), TFUNC(at_s,rt_s, atanf, 4*ULPUNIT), TFUNC(at_s2,rt_s, atan2f, 4*ULPUNIT), TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), #ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), #endif /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), TFUNC(at_d, rt_d, acosh, 4*ULPUNIT), TFUNC(at_d,rt_d, tanh, 4*ULPUNIT), TFUNC(at_d,rt_d, sinh, 4*ULPUNIT), TFUNC(at_d,rt_d, cosh, 4*ULPUNIT), TFUNC(at_s, rt_s, atanhf, 4*ULPUNIT), TFUNC(at_s, rt_s, asinhf, 4*ULPUNIT), TFUNC(at_s, rt_s, acoshf, 4*ULPUNIT), TFUNC(at_s,rt_s, tanhf, 4*ULPUNIT), TFUNC(at_s,rt_s, sinhf, 4*ULPUNIT), TFUNC(at_s,rt_s, coshf, 4*ULPUNIT), /* exponential and logarithmic */ TFUNC(at_d,rt_d, log, 3*ULPUNIT/4), TFUNC(at_d,rt_d, log10, 3*ULPUNIT), TFUNC(at_d,rt_d, log2, 3*ULPUNIT/4), TFUNC(at_d,rt_d, log1p, 2*ULPUNIT), TFUNC(at_d,rt_d, exp, 3*ULPUNIT/4), TFUNC(at_d,rt_d, exp2, 3*ULPUNIT/4), TFUNC(at_d,rt_d, expm1, ULPUNIT), TFUNCARM(at_s,rt_s, logf, ULPUNIT), TFUNC(at_s,rt_s, log10f, 3*ULPUNIT), TFUNCARM(at_s,rt_s, log2f, ULPUNIT), TFUNC(at_s,rt_s, log1pf, 2*ULPUNIT), TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), + TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), TFUNC(at_d,rt_d, sqrt, ULPUNIT/2), TFUNC(at_d,rt_d, cbrt, 2*ULPUNIT), TFUNC(at_d2, rt_d, hypot, 4*ULPUNIT), TFUNCARM(at_s2,rt_s, powf, ULPUNIT), TFUNC(at_s,rt_s, sqrtf, ULPUNIT/2), TFUNC(at_s,rt_s, cbrtf, 2*ULPUNIT), TFUNC(at_s2, rt_s, hypotf, 4*ULPUNIT), /* error function */ TFUNC(at_d,rt_d, erf, 16*ULPUNIT), TFUNC(at_s,rt_s, erff, 16*ULPUNIT), TFUNC(at_d,rt_d, erfc, 16*ULPUNIT), TFUNC(at_s,rt_s, erfcf, 16*ULPUNIT), /* gamma functions */ TFUNC(at_d,rt_d, tgamma, 16*ULPUNIT), TFUNC(at_s,rt_s, tgammaf, 16*ULPUNIT), TFUNC(at_d,rt_d, lgamma, 16*ULPUNIT | ABSLOWERBOUND), TFUNC(at_s,rt_s, lgammaf, 16*ULPUNIT | ABSLOWERBOUND), TFUNC(at_d,rt_d, ceil, 0), TFUNC(at_s,rt_s, ceilf, 0), TFUNC(at_d2,rt_d, copysign, 0), TFUNC(at_s2,rt_s, copysignf, 0), TFUNC(at_d,rt_d, floor, 0), TFUNC(at_s,rt_s, floorf, 0), TFUNC(at_d2,rt_d, fmax, 0), TFUNC(at_s2,rt_s, fmaxf, 0), TFUNC(at_d2,rt_d, fmin, 0), TFUNC(at_s2,rt_s, fminf, 0), TFUNC(at_d2,rt_d, fmod, 0), TFUNC(at_s2,rt_s, fmodf, 0), MFUNC(at_d, rt_i, fpclassify, 0), MFUNC(at_s, rt_i, fpclassifyf, 0), TFUNC(at_dip,rt_d, frexp, 0), TFUNC(at_sip,rt_s, frexpf, 0), MFUNC(at_d, rt_i, isfinite, 0), MFUNC(at_s, rt_i, 
isfinitef, 0), MFUNC(at_d, rt_i, isgreater, 0), MFUNC(at_d, rt_i, isgreaterequal, 0), MFUNC(at_s, rt_i, isgreaterequalf, 0), MFUNC(at_s, rt_i, isgreaterf, 0), MFUNC(at_d, rt_i, isinf, 0), MFUNC(at_s, rt_i, isinff, 0), MFUNC(at_d, rt_i, isless, 0), MFUNC(at_d, rt_i, islessequal, 0), MFUNC(at_s, rt_i, islessequalf, 0), MFUNC(at_s, rt_i, islessf, 0), MFUNC(at_d, rt_i, islessgreater, 0), MFUNC(at_s, rt_i, islessgreaterf, 0), MFUNC(at_d, rt_i, isnan, 0), MFUNC(at_s, rt_i, isnanf, 0), MFUNC(at_d, rt_i, isnormal, 0), MFUNC(at_s, rt_i, isnormalf, 0), MFUNC(at_d, rt_i, isunordered, 0), MFUNC(at_s, rt_i, isunorderedf, 0), TFUNC(at_di,rt_d, ldexp, 0), TFUNC(at_si,rt_s, ldexpf, 0), TFUNC(at_ddp,rt_d2, modf, 0), TFUNC(at_ssp,rt_s2, modff, 0), #ifndef BIGRANGERED MFUNC(at_d, rt_d, rred, 2*ULPUNIT), #else MFUNC(at_d, rt_d, m_rred, ULPUNIT), #endif MFUNC(at_d, rt_i, signbit, 0), MFUNC(at_s, rt_i, signbitf, 0), }; /* * keywords are: func size op1 op2 result res2 errno op1r op1i op2r op2i resultr resulti * also we ignore: wrongresult wrongres2 wrongerrno * op1 equivalent to op1r, same with op2 and result */ typedef struct { test_func *func; unsigned op1r[2]; /* real part, also used for non-complex numbers */ unsigned op1i[2]; /* imaginary part */ unsigned op2r[2]; unsigned op2i[2]; unsigned resultr[3]; unsigned resulti[3]; enum { rc_none, rc_zero, rc_infinity, rc_nan, rc_finite } resultc; /* special complex results, rc_none means use resultr and resulti as normal */ unsigned res2[2]; unsigned status; /* IEEE status return, if any */ unsigned maybestatus; /* for optional status, or allowance for spurious */ int nresult; /* number of result words */ int in_err, in_err_limit; int err; int maybeerr; int valid; int comment; int random; } testdetail; enum { /* keywords */ k_errno, k_errno_in, k_error, k_func, k_maybeerror, k_maybestatus, k_op1, k_op1i, k_op1r, k_op2, k_op2i, k_op2r, k_random, k_res2, k_result, k_resultc, k_resulti, k_resultr, k_status, k_wrongres2, k_wrongresult, k_wrongstatus, k_wrongerrno }; char *keywords[] = { "errno", "errno_in", "error", "func", "maybeerror", "maybestatus", "op1", "op1i", "op1r", "op2", "op2i", "op2r", "random", "res2", "result", "resultc", "resulti", "resultr", "status", "wrongres2", "wrongresult", "wrongstatus", "wrongerrno" }; enum { e_0, e_EDOM, e_ERANGE, /* * This enum makes sure that we have the right number of errnos in the * errno[] array */ e_number_of_errnos }; char *errnos[] = { "0", "EDOM", "ERANGE" }; enum { e_none, e_divbyzero, e_domain, e_overflow, e_underflow }; char *errors[] = { "0", "divbyzero", "domain", "overflow", "underflow" }; static int verbose, fo, strict; /* state toggled by random=on / random=off */ static int randomstate; /* Canonify a double NaN: SNaNs all become 7FF00000.00000001 and QNaNs * all become 7FF80000.00000001 */ void canon_dNaN(unsigned a[2]) { if ((a[0] & 0x7FF00000) != 0x7FF00000) return; /* not Inf or NaN */ if (!(a[0] & 0xFFFFF) && !a[1]) return; /* Inf */ a[0] &= 0x7FF80000; /* canonify top word */ a[1] = 0x00000001; /* canonify bottom word */ } /* Canonify a single NaN: SNaNs all become 7F800001 and QNaNs * all become 7FC00001. Returns classification of the NaN. */ void canon_sNaN(unsigned a[1]) { if ((a[0] & 0x7F800000) != 0x7F800000) return; /* not Inf or NaN */ if (!(a[0] & 0x7FFFFF)) return; /* Inf */ a[0] &= 0x7FC00000; /* canonify most bits */ a[0] |= 0x00000001; /* canonify bottom bit */ } /* * Detect difficult operands for FO mode. 
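 * That is: infinities, NaNs and denormals; the checks below classify
 * them purely from the exponent and mantissa bit patterns.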
*/ int is_dhard(unsigned a[2]) { if ((a[0] & 0x7FF00000) == 0x7FF00000) return TRUE; /* inf or NaN */ if ((a[0] & 0x7FF00000) == 0 && ((a[0] & 0x7FFFFFFF) | a[1]) != 0) return TRUE; /* denormal */ return FALSE; } int is_shard(unsigned a[1]) { if ((a[0] & 0x7F800000) == 0x7F800000) return TRUE; /* inf or NaN */ if ((a[0] & 0x7F800000) == 0 && (a[0] & 0x7FFFFFFF) != 0) return TRUE; /* denormal */ return FALSE; } /* * Normalise all zeroes into +0, for FO mode. */ void dnormzero(unsigned a[2]) { if (a[0] == 0x80000000 && a[1] == 0) a[0] = 0; } void snormzero(unsigned a[1]) { if (a[0] == 0x80000000) a[0] = 0; } static int find(char *word, char **array, int asize) { int i, j; asize /= sizeof(char *); i = -1; j = asize; /* strictly between i and j */ while (j-i > 1) { int k = (i+j) / 2; int c = strcmp(word, array[k]); if (c > 0) i = k; else if (c < 0) j = k; else /* found it! */ return k; } return -1; /* not found */ } static test_func* find_testfunc(char *word) { int i, j, asize; asize = sizeof(tfuncs)/sizeof(test_func); i = -1; j = asize; /* strictly between i and j */ while (j-i > 1) { int k = (i+j) / 2; int c = strcmp(word, tfuncs[k].name); if (c > 0) i = k; else if (c < 0) j = k; else /* found it! */ return tfuncs + k; } return NULL; /* not found */ } static long long calc_error(unsigned a[2], unsigned b[3], int shift, int rettype) { unsigned r0, r1, r2; int sign, carry; long long result; /* * If either number is infinite, require exact equality. If * either number is NaN, require that both are NaN. If either * of these requirements is broken, return LLONG_MAX. */ if (is_double_rettype(rettype)) { if ((a[0] & 0x7FF00000) == 0x7FF00000 || (b[0] & 0x7FF00000) == 0x7FF00000) { if (((a[0] & 0x800FFFFF) || a[1]) && ((b[0] & 0x800FFFFF) || b[1]) && (a[0] & 0x7FF00000) == 0x7FF00000 && (b[0] & 0x7FF00000) == 0x7FF00000) return 0; /* both NaN - OK */ if (!((a[0] & 0xFFFFF) || a[1]) && !((b[0] & 0xFFFFF) || b[1]) && a[0] == b[0]) return 0; /* both same sign of Inf - OK */ return LLONG_MAX; } } else { if ((a[0] & 0x7F800000) == 0x7F800000 || (b[0] & 0x7F800000) == 0x7F800000) { if ((a[0] & 0x807FFFFF) && (b[0] & 0x807FFFFF) && (a[0] & 0x7F800000) == 0x7F800000 && (b[0] & 0x7F800000) == 0x7F800000) return 0; /* both NaN - OK */ if (!(a[0] & 0x7FFFFF) && !(b[0] & 0x7FFFFF) && a[0] == b[0]) return 0; /* both same sign of Inf - OK */ return LLONG_MAX; } } /* * Both finite. Return LLONG_MAX if the signs differ. */ if ((a[0] ^ b[0]) & 0x80000000) return LLONG_MAX; /* * Now it's just straight multiple-word subtraction. */ if (is_double_rettype(rettype)) { r2 = -b[2]; carry = (r2 == 0); r1 = a[1] + ~b[1] + carry; carry = (r1 < a[1] || (carry && r1 == a[1])); r0 = a[0] + ~b[0] + carry; } else { r2 = -b[1]; carry = (r2 == 0); r1 = a[0] + ~b[0] + carry; carry = (r1 < a[0] || (carry && r1 == a[0])); r0 = ~0 + carry; } /* * Forgive larger errors in specialised cases. */ if (shift > 0) { if (shift > 32*3) return 0; /* all errors are forgiven!
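   (shift is the number of low-order difference bits to ignore; past 3*32
   bits the entire three-word difference computed above would be shifted
   away, hence the early return)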
*/ while (shift >= 32) { r2 = r1; r1 = r0; r0 = -(r0 >> 31); shift -= 32; } if (shift > 0) { r2 = (r2 >> shift) | (r1 << (32-shift)); r1 = (r1 >> shift) | (r0 << (32-shift)); r0 = (r0 >> shift) | ((-(r0 >> 31)) << (32-shift)); } } if (r0 & 0x80000000) { sign = 1; r2 = ~r2; carry = (r2 == 0); r1 = 0 + ~r1 + carry; carry = (carry && (r2 == 0)); r0 = 0 + ~r0 + carry; } else { sign = 0; } if (r0 >= (1LL<<(31-EXTRABITS))) return LLONG_MAX; /* many ulps out */ result = (r2 >> (32-EXTRABITS)) & (ULPUNIT-1); result |= r1 << EXTRABITS; result |= (long long)r0 << (32+EXTRABITS); if (sign) result = -result; return result; } /* special named operands */ typedef struct { unsigned op1, op2; char* name; } special_op; static special_op special_ops_double[] = { {0x00000000,0x00000000,"0"}, {0x3FF00000,0x00000000,"1"}, {0x7FF00000,0x00000000,"inf"}, {0x7FF80000,0x00000001,"qnan"}, {0x7FF00000,0x00000001,"snan"}, {0x3ff921fb,0x54442d18,"pi2"}, {0x400921fb,0x54442d18,"pi"}, {0x3fe921fb,0x54442d18,"pi4"}, {0x4002d97c,0x7f3321d2,"3pi4"}, }; static special_op special_ops_float[] = { {0x00000000,0,"0"}, {0x3f800000,0,"1"}, {0x7f800000,0,"inf"}, {0x7fc00000,0,"qnan"}, {0x7f800001,0,"snan"}, {0x3fc90fdb,0,"pi2"}, {0x40490fdb,0,"pi"}, {0x3f490fdb,0,"pi4"}, {0x4016cbe4,0,"3pi4"}, }; /* This is what is returned by the below functions. We need it to handle the sign of the number */ static special_op tmp_op = {0,0,0}; special_op* find_special_op_from_op(unsigned op1, unsigned op2, int is_double) { int i; special_op* sop; if(is_double) { sop = special_ops_double; } else { sop = special_ops_float; } for(i = 0; i < sizeof(special_ops_double)/sizeof(special_op); i++) { if(sop->op1 == (op1&0x7fffffff) && sop->op2 == op2) { if(tmp_op.name) free(tmp_op.name); tmp_op.name = malloc(strlen(sop->name)+2); if(op1>>31) { sprintf(tmp_op.name,"-%s",sop->name); } else { strcpy(tmp_op.name,sop->name); } return &tmp_op; } sop++; } return NULL; } special_op* find_special_op_from_name(const char* name, int is_double) { int i, neg=0; special_op* sop; if(is_double) { sop = special_ops_double; } else { sop = special_ops_float; } if(*name=='-') { neg=1; name++; } else if(*name=='+') { name++; } for(i = 0; i < sizeof(special_ops_double)/sizeof(special_op); i++) { if(0 == strcmp(name,sop->name)) { tmp_op.op1 = sop->op1; if(neg) { tmp_op.op1 |= 0x80000000; } tmp_op.op2 = sop->op2; return &tmp_op; } sop++; } return NULL; } /* helper function for the below type=0 for single, 1 for double, 2 for no sop */ int do_op(char* q, unsigned* op, const char* name, int num, int sop_type) { int i; int n=num; special_op* sop = NULL; for(i = 0; i < num; i++) { op[i] = 0; } if(sop_type<2) { sop = find_special_op_from_name(q,sop_type); } if(sop != NULL) { op[0] = sop->op1; op[1] = sop->op2; } else { switch(num) { case 1: n = sscanf(q, "%x", &op[0]); break; case 2: n = sscanf(q, "%x.%x", &op[0], &op[1]); break; case 3: n = sscanf(q, "%x.%x.%x", &op[0], &op[1], &op[2]); break; default: return -1; } } if (verbose) { printf("%s=",name); for (i = 0; (i < n); ++i) printf("%x.", op[i]); printf(" (n=%d)\n", n); } return n; } testdetail parsetest(char *testbuf, testdetail oldtest) { char *p; /* Current part of line: Option name */ char *q; /* Current part of line: Option value */ testdetail ret; /* What we return */ int k; /* Function enum from k_* */ int n; /* Used as returns for scanfs */ int argtype=2, rettype=2; /* for do_op */ /* clear ret */ memset(&ret, 0, sizeof(ret)); if (verbose) printf("Parsing line: %s\n", testbuf); while (*testbuf && isspace(*testbuf)) 
testbuf++; if (testbuf[0] == ';' || testbuf[0] == '#' || testbuf[0] == '!' || testbuf[0] == '>' || testbuf[0] == '\0') { ret.comment = 1; if (verbose) printf("Line is a comment\n"); return ret; } ret.comment = 0; if (*testbuf == '+') { if (oldtest.valid) { ret = oldtest; /* structure copy */ } else { fprintf(stderr, "copy from invalid: ignored\n"); } testbuf++; } ret.random = randomstate; ret.in_err = 0; ret.in_err_limit = e_number_of_errnos; p = strtok(testbuf, " \t"); while (p != NULL) { q = strchr(p, '='); if (!q) goto balderdash; *q++ = '\0'; k = find(p, keywords, sizeof(keywords)); switch (k) { case k_random: randomstate = (!strcmp(q, "on")); ret.comment = 1; return ret; /* otherwise ignore this line */ case k_func: if (verbose) printf("func=%s ", q); //ret.func = find(q, funcs, sizeof(funcs)); ret.func = find_testfunc(q); if (ret.func == NULL) { if (verbose) printf("(id=unknown)\n"); goto balderdash; } if(is_single_argtype(ret.func->argtype)) argtype = 0; else if(is_double_argtype(ret.func->argtype)) argtype = 1; if(is_single_rettype(ret.func->rettype)) rettype = 0; else if(is_double_rettype(ret.func->rettype)) rettype = 1; //ret.size = sizes[ret.func]; if (verbose) printf("(name=%s) (size=%d)\n", ret.func->name, ret.func->argtype); break; case k_op1: case k_op1r: n = do_op(q,ret.op1r,"op1r",2,argtype); if (n < 1) goto balderdash; break; case k_op1i: n = do_op(q,ret.op1i,"op1i",2,argtype); if (n < 1) goto balderdash; break; case k_op2: case k_op2r: n = do_op(q,ret.op2r,"op2r",2,argtype); if (n < 1) goto balderdash; break; case k_op2i: n = do_op(q,ret.op2i,"op2i",2,argtype); if (n < 1) goto balderdash; break; case k_resultc: puts(q); if(strncmp(q,"inf",3)==0) { ret.resultc = rc_infinity; } else if(strcmp(q,"zero")==0) { ret.resultc = rc_zero; } else if(strcmp(q,"nan")==0) { ret.resultc = rc_nan; } else if(strcmp(q,"finite")==0) { ret.resultc = rc_finite; } else { goto balderdash; } break; case k_result: case k_resultr: n = (do_op)(q,ret.resultr,"resultr",3,rettype); if (n < 1) goto balderdash; ret.nresult = n; /* assume real and imaginary have same no. 
words */ break; case k_resulti: n = do_op(q,ret.resulti,"resulti",3,rettype); if (n < 1) goto balderdash; break; case k_res2: n = do_op(q,ret.res2,"res2",2,rettype); if (n < 1) goto balderdash; break; case k_status: while (*q) { if (*q == 'i') ret.status |= FE_INVALID; if (*q == 'z') ret.status |= FE_DIVBYZERO; if (*q == 'o') ret.status |= FE_OVERFLOW; if (*q == 'u') ret.status |= FE_UNDERFLOW; q++; } break; case k_maybeerror: n = find(q, errors, sizeof(errors)); if (n < 0) goto balderdash; if(math_errhandling&MATH_ERREXCEPT) { switch(n) { case e_domain: ret.maybestatus |= FE_INVALID; break; case e_divbyzero: ret.maybestatus |= FE_DIVBYZERO; break; case e_overflow: ret.maybestatus |= FE_OVERFLOW; break; case e_underflow: ret.maybestatus |= FE_UNDERFLOW; break; } } { switch(n) { case e_domain: ret.maybeerr = e_EDOM; break; case e_divbyzero: case e_overflow: case e_underflow: ret.maybeerr = e_ERANGE; break; } } case k_maybestatus: while (*q) { if (*q == 'i') ret.maybestatus |= FE_INVALID; if (*q == 'z') ret.maybestatus |= FE_DIVBYZERO; if (*q == 'o') ret.maybestatus |= FE_OVERFLOW; if (*q == 'u') ret.maybestatus |= FE_UNDERFLOW; q++; } break; case k_error: n = find(q, errors, sizeof(errors)); if (n < 0) goto balderdash; if(math_errhandling&MATH_ERREXCEPT) { switch(n) { case e_domain: ret.status |= FE_INVALID; break; case e_divbyzero: ret.status |= FE_DIVBYZERO; break; case e_overflow: ret.status |= FE_OVERFLOW; break; case e_underflow: ret.status |= FE_UNDERFLOW; break; } } if(math_errhandling&MATH_ERRNO) { switch(n) { case e_domain: ret.err = e_EDOM; break; case e_divbyzero: case e_overflow: case e_underflow: ret.err = e_ERANGE; break; } } if(!(math_errhandling&MATH_ERRNO)) { switch(n) { case e_domain: ret.maybeerr = e_EDOM; break; case e_divbyzero: case e_overflow: case e_underflow: ret.maybeerr = e_ERANGE; break; } } break; case k_errno: ret.err = find(q, errnos, sizeof(errnos)); if (ret.err < 0) goto balderdash; break; case k_errno_in: ret.in_err = find(q, errnos, sizeof(errnos)); if (ret.err < 0) goto balderdash; ret.in_err_limit = ret.in_err + 1; break; case k_wrongresult: case k_wrongstatus: case k_wrongres2: case k_wrongerrno: /* quietly ignore these keys */ break; default: goto balderdash; } p = strtok(NULL, " \t"); } ret.valid = 1; return ret; /* come here from almost any error */ balderdash: ret.valid = 0; return ret; } typedef enum { test_comment, /* deliberately not a test */ test_invalid, /* accidentally not a test */ test_decline, /* was a test, and wasn't run */ test_fail, /* was a test, and failed */ test_pass /* was a test, and passed */ } testresult; char failtext[512]; typedef union { unsigned i[2]; double f; double da[2]; } dbl; typedef union { unsigned i; float f; float da[2]; } sgl; /* helper function for runtest */ void print_error(int rettype, unsigned *result, char* text, char** failp) { special_op *sop; char *str; if(result) { *failp += sprintf(*failp," %s=",text); sop = find_special_op_from_op(result[0],result[1],is_double_rettype(rettype)); if(sop) { *failp += sprintf(*failp,"%s",sop->name); } else { if(is_double_rettype(rettype)) { str="%08x.%08x"; } else { str="%08x"; } *failp += sprintf(*failp,str,result[0],result[1]); } } } void print_ulps_helper(const char *name, long long ulps, char** failp) { if(ulps == LLONG_MAX) { *failp += sprintf(*failp, " %s=HUGE", name); } else { *failp += sprintf(*failp, " %s=%.3f", name, (double)ulps / ULPUNIT); } } /* for complex args make ulpsr or ulpsri = 0 to not print */ void print_ulps(int rettype, long long ulpsr, long long 
ulpsi, char** failp) { if(is_complex_rettype(rettype)) { if (ulpsr) print_ulps_helper("ulpsr",ulpsr,failp); if (ulpsi) print_ulps_helper("ulpsi",ulpsi,failp); } else { if (ulpsr) print_ulps_helper("ulps",ulpsr,failp); } } int runtest(testdetail t) { int err, status; dbl d_arg1, d_arg2, d_res, d_res2; sgl s_arg1, s_arg2, s_res, s_res2; int deferred_decline = FALSE; char *failp = failtext; unsigned int intres=0; int res2_adjust = 0; if (t.comment) return test_comment; if (!t.valid) return test_invalid; /* Set IEEE status to mathlib-normal */ feclearexcept(FE_ALL_EXCEPT); /* Deal with operands */ #define DO_DOP(arg,op) arg.i[dmsd] = t.op[0]; arg.i[dlsd] = t.op[1] DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; + s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a * deferred decline flag if we're in FO mode. * * (We defer the decline rather than doing it immediately * because even in FO mode the operation is not permitted to * crash or tight-loop; so we _run_ the test, and then ignore * all the results.) */ if (fo) { if (is_double_argtype(t.func->argtype) && is_dhard(t.op1r)) deferred_decline = TRUE; if (t.func->argtype==at_d2 && is_dhard(t.op2r)) deferred_decline = TRUE; if (is_single_argtype(t.func->argtype) && is_shard(t.op1r)) deferred_decline = TRUE; if (t.func->argtype==at_s2 && is_shard(t.op2r)) deferred_decline = TRUE; if (is_double_rettype(t.func->rettype) && is_dhard(t.resultr)) deferred_decline = TRUE; if (t.func->rettype==rt_d2 && is_dhard(t.res2)) deferred_decline = TRUE; if (is_single_argtype(t.func->rettype) && is_shard(t.resultr)) deferred_decline = TRUE; if (t.func->rettype==rt_s2 && is_shard(t.res2)) deferred_decline = TRUE; if (t.err == e_ERANGE) deferred_decline = TRUE; } /* * Perform the operation */ errno = t.in_err == e_EDOM ? EDOM : t.in_err == e_ERANGE ? 
ERANGE : 0; if (t.err == e_0) t.err = t.in_err; if (t.maybeerr == e_0) t.maybeerr = t.in_err; if(t.func->type == t_func) { switch(t.func->argtype) { case at_d: d_res.f = t.func->func.d_d_ptr(d_arg1.f); break; case at_s: s_res.f = t.func->func.s_s_ptr(s_arg1.f); break; case at_d2: d_res.f = t.func->func.d2_d_ptr(d_arg1.f, d_arg2.f); break; case at_s2: s_res.f = t.func->func.s2_s_ptr(s_arg1.f, s_arg2.f); break; case at_di: d_res.f = t.func->func.di_d_ptr(d_arg1.f, d_arg2.i[dmsd]); break; case at_si: s_res.f = t.func->func.si_s_ptr(s_arg1.f, s_arg2.i); break; case at_dip: d_res.f = t.func->func.dip_d_ptr(d_arg1.f, (int*)&intres); break; case at_sip: s_res.f = t.func->func.sip_s_ptr(s_arg1.f, (int*)&intres); break; case at_ddp: d_res.f = t.func->func.ddp_d_ptr(d_arg1.f, &d_res2.f); break; case at_ssp: s_res.f = t.func->func.ssp_s_ptr(s_arg1.f, &s_res2.f); break; default: printf("unhandled function: %s\n",t.func->name); return test_fail; } } else { /* printf("macro: name=%s, num=%i, s1.i=0x%08x s1.f=%f\n",t.func->name, t.func->macro_name, s_arg1.i, (double)s_arg1.f); */ switch(t.func->macro_name) { case m_isfinite: intres = isfinite(d_arg1.f); break; case m_isinf: intres = isinf(d_arg1.f); break; case m_isnan: intres = isnan(d_arg1.f); break; case m_isnormal: intres = isnormal(d_arg1.f); break; case m_signbit: intres = signbit(d_arg1.f); break; case m_fpclassify: intres = fpclassify(d_arg1.f); break; case m_isgreater: intres = isgreater(d_arg1.f, d_arg2.f); break; case m_isgreaterequal: intres = isgreaterequal(d_arg1.f, d_arg2.f); break; case m_isless: intres = isless(d_arg1.f, d_arg2.f); break; case m_islessequal: intres = islessequal(d_arg1.f, d_arg2.f); break; case m_islessgreater: intres = islessgreater(d_arg1.f, d_arg2.f); break; case m_isunordered: intres = isunordered(d_arg1.f, d_arg2.f); break; case m_isfinitef: intres = isfinite(s_arg1.f); break; case m_isinff: intres = isinf(s_arg1.f); break; case m_isnanf: intres = isnan(s_arg1.f); break; case m_isnormalf: intres = isnormal(s_arg1.f); break; case m_signbitf: intres = signbit(s_arg1.f); break; case m_fpclassifyf: intres = fpclassify(s_arg1.f); break; case m_isgreaterf: intres = isgreater(s_arg1.f, s_arg2.f); break; case m_isgreaterequalf: intres = isgreaterequal(s_arg1.f, s_arg2.f); break; case m_islessf: intres = isless(s_arg1.f, s_arg2.f); break; case m_islessequalf: intres = islessequal(s_arg1.f, s_arg2.f); break; case m_islessgreaterf: intres = islessgreater(s_arg1.f, s_arg2.f); break; case m_isunorderedf: intres = isunordered(s_arg1.f, s_arg2.f); break; default: printf("unhandled macro: %s\n",t.func->name); return test_fail; } } /* * Decline the test if the deferred decline flag was set above. */ if (deferred_decline) return test_decline; /* printf("intres=%i\n",intres); */ /* Clear the fail text (indicating a pass unless we change it) */ failp[0] = '\0'; /* Check the IEEE status bits (except INX, which we disregard). * We don't bother with this for complex numbers, because the * complex functions are hard to get exactly right and we don't * have to anyway (C99 annex G is only informative). */ if (!(is_complex_argtype(t.func->argtype) || is_complex_rettype(t.func->rettype))) { status = fetestexcept(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW); if ((status|t.maybestatus|~statusmask) != (t.status|t.maybestatus|~statusmask)) { if (quiet) failtext[0]='x'; else { failp += sprintf(failp, " wrongstatus=%s%s%s%s%s", (status & FE_INVALID ? "i" : ""), (status & FE_DIVBYZERO ? "z" : ""), (status & FE_OVERFLOW ? 
"o" : ""), (status & FE_UNDERFLOW ? "u" : ""), (status ? "" : "OK")); } } } /* Check the result */ { unsigned resultr[2], resulti[2]; unsigned tresultr[3], tresulti[3], wres; switch(t.func->rettype) { case rt_d: case rt_d2: tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; + resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; + resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; + resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - wres = 0; + abort (); } if(t.resultc != rc_none) { int err = 0; switch(t.resultc) { case rc_zero: if(resultr[0] != 0 || resulti[0] != 0 || (wres==2 && (resultr[1] != 0 || resulti[1] != 0))) { err = 1; } break; case rc_infinity: if(wres==1) { if(!((resultr[0]&0x7fffffff)==0x7f800000 || (resulti[0]&0x7fffffff)==0x7f800000)) { err = 1; } } else { if(!(((resultr[0]&0x7fffffff)==0x7ff00000 && resultr[1]==0) || ((resulti[0]&0x7fffffff)==0x7ff00000 && resulti[1]==0))) { err = 1; } } break; case rc_nan: if(wres==1) { if(!((resultr[0]&0x7fffffff)>0x7f800000 || (resulti[0]&0x7fffffff)>0x7f800000)) { err = 1; } } else { canon_dNaN(resultr); canon_dNaN(resulti); if(!(((resultr[0]&0x7fffffff)>0x7ff00000 && resultr[1]==1) || ((resulti[0]&0x7fffffff)>0x7ff00000 && resulti[1]==1))) { err = 1; } } break; case rc_finite: if(wres==1) { if(!((resultr[0]&0x7fffffff)<0x7f800000 || (resulti[0]&0x7fffffff)<0x7f800000)) { err = 1; } } else { if(!((resultr[0]&0x7fffffff)<0x7ff00000 || (resulti[0]&0x7fffffff)<0x7ff00000)) { err = 1; } } break; default: break; } if(err) { print_error(t.func->rettype,resultr,"wrongresultr",&failp); print_error(t.func->rettype,resulti,"wrongresulti",&failp); } } else if (t.nresult > wres) { /* * The test case data has provided the result to more * than double precision. Instead of testing exact * equality, we test against our maximum error * tolerance. */ int rshift, ishift; long long ulpsr, ulpsi, ulptolerance; tresultr[wres] = t.resultr[wres] << (32-EXTRABITS); tresulti[wres] = t.resulti[wres] << (32-EXTRABITS); if(strict) { ulptolerance = 4096; /* one ulp */ } else { ulptolerance = t.func->tolerance; } rshift = ishift = 0; if (ulptolerance & ABSLOWERBOUND) { /* * Hack for the lgamma functions, which have an * error behaviour that can't conveniently be * characterised in pure ULPs. Really, we want to * say that the error in lgamma is "at most N ULPs, * or at most an absolute error of X, whichever is * larger", for appropriately chosen N,X. But since * these two functions are the only cases where it * arises, I haven't bothered to do it in a nice way * in the function table above. * * (The difficult cases arise with negative input * values such that |gamma(x)| is very near to 1; in * this situation implementations tend to separately * compute lgamma(|x|) and the log of the correction * term from the Euler reflection formula, and * subtract - which catastrophically loses * significance.) * * As far as I can tell, nobody cares about this: * GNU libm doesn't get those cases right either, * and OpenCL explicitly doesn't state a ULP error * limit for lgamma. So my guess is that this is * simply considered acceptable error behaviour for * this particular function, and hence I feel free * to allow for it here. 
*/ ulptolerance &= ~ABSLOWERBOUND; if (t.op1r[0] & 0x80000000) { if (t.func->rettype == rt_d) rshift = 0x400 - ((tresultr[0] >> 20) & 0x7ff); else if (t.func->rettype == rt_s) rshift = 0x80 - ((tresultr[0] >> 23) & 0xff); if (rshift < 0) rshift = 0; } } if (ulptolerance & PLUSMINUSPIO2) { ulptolerance &= ~PLUSMINUSPIO2; /* * Hack for range reduction, which can reduce * borderline cases in the wrong direction, i.e. * return a value just outside one end of the interval * [-pi/4,+pi/4] when it could have returned a value * just inside the other end by subtracting an * adjacent multiple of pi/2. * * We tolerate this, up to a point, because the * trigonometric functions making use of the output of * rred can cope and because making the range reducer * do the exactly right thing in every case would be * more expensive. */ if (wres == 1) { /* Upper bound of overshoot derived in rredf.h */ if ((resultr[0]&0x7FFFFFFF) <= 0x3f494b02 && (resultr[0]&0x7FFFFFFF) > 0x3f490fda && (resultr[0]&0x80000000) != (tresultr[0]&0x80000000)) { unsigned long long val; val = tresultr[0]; val = (val << 32) | tresultr[1]; /* * Compute the alternative permitted result by * subtracting from the sum of the extended * single-precision bit patterns of +pi/4 and * -pi/4. This is a horrible hack which only * works because we can be confident that * numbers in this range all have the same * exponent! */ val = 0xfe921fb54442d184ULL - val; tresultr[0] = val >> 32; tresultr[1] = (val >> (32-EXTRABITS)) << (32-EXTRABITS); /* * Also, expect a correspondingly different * value of res2 as a result of this change. * The adjustment depends on whether we just * flipped the result from + to - or vice * versa. */ if (resultr[0] & 0x80000000) { res2_adjust = +1; } else { res2_adjust = -1; } } } } ulpsr = calc_error(resultr, tresultr, rshift, t.func->rettype); if(is_complex_rettype(t.func->rettype)) { ulpsi = calc_error(resulti, tresulti, ishift, t.func->rettype); } else { ulpsi = 0; } unsigned *rr = (ulpsr > ulptolerance || ulpsr < -ulptolerance) ? resultr : NULL; unsigned *ri = (ulpsi > ulptolerance || ulpsi < -ulptolerance) ? resulti : NULL; /* printf("tolerance=%i, ulpsr=%i, ulpsi=%i, rr=%p, ri=%p\n",ulptolerance,ulpsr,ulpsi,rr,ri); */ if (rr || ri) { if (quiet) failtext[0]='x'; else { print_error(t.func->rettype,rr,"wrongresultr",&failp); print_error(t.func->rettype,ri,"wrongresulti",&failp); print_ulps(t.func->rettype,rr ? ulpsr : 0, ri ? ulpsi : 0,&failp); } } } else { if(is_complex_rettype(t.func->rettype)) /* * Complex functions are not fully supported, * this is unreachable, but prevents warnings. */ abort(); /* * The test case data has provided the result in * exactly the output precision. Therefore we must * complain about _any_ violation. 
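* ('Violation' here means bit-level inequality: the canon_*NaN calls below first canonicalise NaN encodings, and in FO mode the *normzero calls fold every zero to +0, so only meaningful bit differences are reported.)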
*/ switch(t.func->rettype) { case rt_dc: canon_dNaN(tresulti); canon_dNaN(resulti); if (fo) { dnormzero(tresulti); dnormzero(resulti); } /* deliberate fall-through */ case rt_d: canon_dNaN(tresultr); canon_dNaN(resultr); if (fo) { dnormzero(tresultr); dnormzero(resultr); } break; case rt_sc: canon_sNaN(tresulti); canon_sNaN(resulti); if (fo) { snormzero(tresulti); snormzero(resulti); } /* deliberate fall-through */ case rt_s: canon_sNaN(tresultr); canon_sNaN(resultr); if (fo) { snormzero(tresultr); snormzero(resultr); } break; default: break; } if(is_complex_rettype(t.func->rettype)) { unsigned *rr, *ri; if(resultr[0] != tresultr[0] || (wres > 1 && resultr[1] != tresultr[1])) { rr = resultr; } else { rr = NULL; } if(resulti[0] != tresulti[0] || (wres > 1 && resulti[1] != tresulti[1])) { ri = resulti; } else { ri = NULL; } if(rr || ri) { if (quiet) failtext[0]='x'; print_error(t.func->rettype,rr,"wrongresultr",&failp); print_error(t.func->rettype,ri,"wrongresulti",&failp); } } else if (resultr[0] != tresultr[0] || (wres > 1 && resultr[1] != tresultr[1])) { if (quiet) failtext[0]='x'; print_error(t.func->rettype,resultr,"wrongresult",&failp); } } /* * Now test res2, for those functions (frexp, modf, rred) * which use it. */ if (t.func->func.ptr == &frexp || t.func->func.ptr == &frexpf || t.func->macro_name == m_rred || t.func->macro_name == m_rredf) { unsigned tres2 = t.res2[0]; if (res2_adjust) { /* Fix for range reduction, propagated from further up */ tres2 = (tres2 + res2_adjust) & 3; } if (tres2 != intres) { if (quiet) failtext[0]='x'; else { failp += sprintf(failp, " wrongres2=%08x", intres); } } } else if (t.func->func.ptr == &modf || t.func->func.ptr == &modff) { tresultr[0] = t.res2[0]; tresultr[1] = t.res2[1]; if (is_double_rettype(t.func->rettype)) { canon_dNaN(tresultr); resultr[0] = d_res2.i[dmsd]; resultr[1] = d_res2.i[dlsd]; canon_dNaN(resultr); if (fo) { dnormzero(tresultr); dnormzero(resultr); } } else { canon_sNaN(tresultr); resultr[0] = s_res2.i; resultr[1] = s_res2.i; canon_sNaN(resultr); if (fo) { snormzero(tresultr); snormzero(resultr); } } if (resultr[0] != tresultr[0] || (wres > 1 && resultr[1] != tresultr[1])) { if (quiet) failtext[0]='x'; else { if (is_double_rettype(t.func->rettype)) failp += sprintf(failp, " wrongres2=%08x.%08x", resultr[0], resultr[1]); else failp += sprintf(failp, " wrongres2=%08x", resultr[0]); } } } } /* Check errno */ err = (errno == EDOM ? e_EDOM : errno == ERANGE ? e_ERANGE : e_0); if (err != t.err && err != t.maybeerr) { if (quiet) failtext[0]='x'; else { failp += sprintf(failp, " wrongerrno=%s expecterrno=%s ", errnos[err], errnos[t.err]); } } return *failtext ? test_fail : test_pass; } int passed, failed, declined; void runtests(char *name, FILE *fp) { char testbuf[512], linebuf[512]; int lineno = 1; testdetail test; test.valid = 0; if (verbose) printf("runtests: %s\n", name); while (fgets(testbuf, sizeof(testbuf), fp)) { int res, print_errno; testbuf[strcspn(testbuf, "\r\n")] = '\0'; strcpy(linebuf, testbuf); test = parsetest(testbuf, test); print_errno = 0; while (test.in_err < test.in_err_limit) { res = runtest(test); if (res == test_pass) { if (verbose) printf("%s:%d: pass\n", name, lineno); ++passed; } else if (res == test_decline) { if (verbose) printf("%s:%d: declined\n", name, lineno); ++declined; } else if (res == test_fail) { if (!quiet) printf("%s:%d: FAIL%s: %s%s%s%s\n", name, lineno, test.random ? " (random)" : "", linebuf, print_errno ? " errno_in=" : "", print_errno ? 
errnos[test.in_err] : "", failtext); ++failed; } else if (res == test_invalid) { printf("%s:%d: malformed: %s\n", name, lineno, linebuf); ++failed; } test.in_err++; print_errno = 1; } lineno++; } } int main(int ac, char **av) { char **files; int i, nfiles = 0; dbl d; #ifdef MICROLIB /* * Invent argc and argv ourselves. */ char *argv[256]; char args[256]; { int sargs[2]; char *p; ac = 0; sargs[0]=(int)args; sargs[1]=(int)sizeof(args); if (!__semihost(0x15, sargs)) { args[sizeof(args)-1] = '\0'; /* just in case */ p = args; while (1) { while (*p == ' ' || *p == '\t') p++; if (!*p) break; argv[ac++] = p; while (*p && *p != ' ' && *p != '\t') p++; if (*p) *p++ = '\0'; } } av = argv; } #endif /* Sort tfuncs */ qsort(tfuncs, sizeof(tfuncs)/sizeof(test_func), sizeof(test_func), &compare_tfuncs); /* * Autodetect the `double' endianness. */ dmsd = 0; d.f = 1.0; /* 0x3ff00000 / 0x00000000 */ if (d.i[dmsd] == 0) { dmsd = 1; } /* * Now dmsd denotes what the compiler thinks we're at. Let's * check that it agrees with what the runtime thinks. */ d.i[0] = d.i[1] = 0x11111111;/* a random +ve number */ d.f /= d.f; /* must now be one */ if (d.i[dmsd] == 0) { fprintf(stderr, "YIKES! Compiler and runtime disagree on endianness" " of `double'. Bailing out\n"); return 1; } dlsd = !dmsd; /* default is terse */ verbose = 0; fo = 0; strict = 0; files = (char **)malloc((ac+1) * sizeof(char *)); if (!files) { fprintf(stderr, "initial malloc failed!\n"); return 1; } #ifdef NOCMDLINE files[nfiles++] = "testfile"; #endif while (--ac) { char *p = *++av; if (*p == '-') { static char *options[] = { "-fo", #if 0 "-noinexact", "-noround", #endif "-nostatus", "-quiet", "-strict", "-v", "-verbose", }; enum { op_fo, #if 0 op_noinexact, op_noround, #endif op_nostatus, op_quiet, op_strict, op_v, op_verbose, }; switch (find(p, options, sizeof(options))) { case op_quiet: quiet = 1; break; #if 0 case op_noinexact: statusmask &= 0x0F; /* remove bit 4 */ break; case op_noround: doround = 0; break; #endif case op_nostatus: /* no status word => noinx,noround */ statusmask = 0; doround = 0; break; case op_v: case op_verbose: verbose = 1; break; case op_fo: fo = 1; break; case op_strict: /* tolerance is 1 ulp */ strict = 1; break; default: fprintf(stderr, "unrecognised option: %s\n", p); break; } } else { files[nfiles++] = p; } } passed = failed = declined = 0; if (nfiles) { for (i = 0; i < nfiles; i++) { FILE *fp = fopen(files[i], "r"); if (!fp) { fprintf(stderr, "Couldn't open %s\n", files[i]); } else runtests(files[i], fp); } } else runtests("(stdin)", stdin); printf("Completed. Passed %d, failed %d (total %d", passed, failed, passed+failed); if (declined) printf(" plus %d declined", declined); printf(")\n"); if (failed || passed == 0) return 1; printf("** TEST PASSED OK **\n"); return 0; } void undef_func() { failed++; puts("ERROR: undefined function called"); } diff --git a/contrib/arm-optimized-routines/math/test/runulp.sh b/contrib/arm-optimized-routines/math/test/runulp.sh index b4000f6ea01b..e2e03e3ae761 100755 --- a/contrib/arm-optimized-routines/math/test/runulp.sh +++ b/contrib/arm-optimized-routines/math/test/runulp.sh @@ -1,324 +1,282 @@ #!/bin/bash # ULP error check script. # -# Copyright (c) 2019-2022, Arm Limited. +# Copyright (c) 2019-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu # cd to bin directory. 
cd "${0%/*}" rmodes='n u d z' #rmodes=n flags="${ULPFLAGS:--q}" emu="$@" FAIL=0 PASS=0 t() { [ $r = "n" ] && Lt=$L || Lt=$Ldir $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { $emu ./ulp -f -q "$@" >/dev/null } Ldir=0.5 for r in $rmodes do L=0.01 t exp 0 0xffff000000000000 10000 t exp 0x1p-6 0x1p6 40000 t exp -0x1p-6 -0x1p6 40000 t exp 633.3 733.3 10000 t exp -633.3 -777.3 10000 L=0.01 t exp2 0 0xffff000000000000 10000 t exp2 0x1p-6 0x1p6 40000 t exp2 -0x1p-6 -0x1p6 40000 t exp2 633.3 733.3 10000 t exp2 -633.3 -777.3 10000 L=0.02 t log 0 0xffff000000000000 10000 t log 0x1p-4 0x1p4 40000 t log 0 inf 40000 L=0.05 t log2 0 0xffff000000000000 10000 t log2 0x1p-4 0x1p4 40000 t log2 0 inf 40000 L=0.05 t pow 0.5 2.0 x 0 inf 20000 t pow -0.5 -2.0 x 0 inf 20000 t pow 0.5 2.0 x -0 -inf 20000 t pow -0.5 -2.0 x -0 -inf 20000 t pow 0.5 2.0 x 0x1p-10 0x1p10 40000 t pow 0.5 2.0 x -0x1p-10 -0x1p10 40000 t pow 0 inf x 0.5 2.0 80000 t pow 0 inf x -0.5 -2.0 80000 t pow 0x1.fp-1 0x1.08p0 x 0x1p8 0x1p17 80000 t pow 0x1.fp-1 0x1.08p0 x -0x1p8 -0x1p17 80000 t pow 0 0x1p-1000 x 0 1.0 50000 t pow 0x1p1000 inf x 0 1.0 50000 t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=0.02 +t exp10 0 0x1p-47 5000 +t exp10 -0 -0x1p-47 5000 +t exp10 0x1p-47 1 50000 +t exp10 -0x1p-47 -1 50000 +t exp10 1 0x1.34413509f79ffp8 50000 +t exp10 -1 -0x1.434e6420f4374p8 50000 +t exp10 0x1.34413509f79ffp8 inf 5000 +t exp10 -0x1.434e6420f4374p8 -inf 5000 + L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 t erf 0x1p-1022 0x1p-26 40000 t erf -0x1p-1022 -0x1p-26 40000 t erf 0x1p-26 0x1p3 40000 t erf -0x1p-26 -0x1p3 40000 t erf 0 inf 40000 Ldir=0.5 L=0.01 t expf 0 0xffff0000 10000 t expf 0x1p-14 0x1p8 50000 t expf -0x1p-14 -0x1p8 50000 L=0.01 t exp2f 0 0xffff0000 10000 t exp2f 0x1p-14 0x1p8 50000 t exp2f -0x1p-14 -0x1p8 50000 L=0.32 t logf 0 0xffff0000 10000 t logf 0x1p-4 0x1p4 50000 t logf 0 inf 50000 L=0.26 t log2f 0 0xffff0000 10000 t log2f 0x1p-4 0x1p4 50000 t log2f 0 inf 50000 L=0.06 t sinf 0 0xffff0000 10000 t sinf 0x1p-14 0x1p54 50000 t sinf -0x1p-14 -0x1p54 50000 L=0.06 t cosf 0 0xffff0000 10000 t cosf 0x1p-14 0x1p54 50000 t cosf -0x1p-14 -0x1p54 50000 L=0.06 t sincosf_sinf 0 0xffff0000 10000 t sincosf_sinf 0x1p-14 0x1p54 50000 t sincosf_sinf -0x1p-14 -0x1p54 50000 L=0.06 t sincosf_cosf 0 0xffff0000 10000 t sincosf_cosf 0x1p-14 0x1p54 50000 t sincosf_cosf -0x1p-14 -0x1p54 50000 L=0.4 t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 L=0.6 Ldir=0.9 t erff 0 0xffff0000 10000 t erff 0x1p-127 0x1p-26 40000 t erff -0x1p-127 -0x1p-26 40000 t erff 0x1p-26 0x1p3 40000 t erff -0x1p-26 -0x1p3 40000 t erff 0 inf 40000 Ldir=0.5 done # vector functions + Ldir=0.5 r='n' flags="${ULPFLAGS:--q}" -runs= -check __s_exp 1 && runs=1 -runv= -check __v_exp 1 && runv=1 -runvn= -check __vn_exp 1 && runvn=1 range_exp=' 0 0xffff000000000000 10000 0x1p-6 0x1p6 400000 -0x1p-6 -0x1p6 400000 633.3 733.3 10000 -633.3 -777.3 10000 ' range_log=' 0 0xffff000000000000 10000 0x1p-4 0x1p4 400000 0 inf 400000 ' range_pow=' 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000 0x1.ep-1 0x1.1p0 x 
0x1p8 0x1p16 50000 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000 ' range_sin=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - -0x1p-23 0x1p23 400000 + 0 0x1p23 500000 + -0 -0x1p23 500000 + 0x1p23 inf 10000 + -0x1p23 -inf 10000 ' range_cos="$range_sin" range_expf=' 0 0xffff0000 10000 0x1p-14 0x1p8 500000 -0x1p-14 -0x1p8 500000 ' range_expf_1u="$range_expf" range_exp2f="$range_expf" range_exp2f_1u="$range_expf" range_logf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 ' range_sinf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 300000 --0x1p-9 -0x1p9 300000 + 0 0x1p20 500000 + -0 -0x1p20 500000 + 0x1p20 inf 10000 + -0x1p20 -inf 10000 ' range_cosf="$range_sinf" range_powf=' 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 ' # error limits L_exp=1.9 L_log=1.2 L_pow=0.05 L_sin=3.0 L_cos=3.0 L_expf=1.49 L_expf_1u=0.4 L_exp2f=1.49 L_exp2f_1u=0.4 L_logf=2.9 L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R D +while read G F D do - [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" while read X do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac disable_fenv="" if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then # If library was built with SIMD exceptions # disabled, disable fenv checking in ulp # tool. Otherwise, fenv checking may still be # disabled by adding -f to the end of the run # line. disable_fenv="-f" fi t $D $disable_fenv $F $X done << EOF $range + EOF done << EOF # group symbol run -exp __s_exp $runs -exp __v_exp $runv -exp __vn_exp $runvn -exp _ZGVnN2v_exp $runvn - -log __s_log $runs -log __v_log $runv -log __vn_log $runvn -log _ZGVnN2v_log $runvn - -pow __s_pow $runs -f -pow __v_pow $runv -f -pow __vn_pow $runvn -f -pow _ZGVnN2vv_pow $runvn -f - -sin __s_sin $runs -sin __v_sin $runv -sin __vn_sin $runvn -sin _ZGVnN2v_sin $runvn - -cos __s_cos $runs -cos __v_cos $runv -cos __vn_cos $runvn -cos _ZGVnN2v_cos $runvn - -expf __s_expf $runs -expf __v_expf $runv -expf __vn_expf $runvn -expf _ZGVnN4v_expf $runvn - -expf_1u __s_expf_1u $runs -f -expf_1u __v_expf_1u $runv -f -expf_1u __vn_expf_1u $runvn -f - -exp2f __s_exp2f $runs -exp2f __v_exp2f $runv -exp2f __vn_exp2f $runvn -exp2f _ZGVnN4v_exp2f $runvn - -exp2f_1u __s_exp2f_1u $runs -f -exp2f_1u __v_exp2f_1u $runv -f -exp2f_1u __vn_exp2f_1u $runvn -f - -logf __s_logf $runs -logf __v_logf $runv -logf __vn_logf $runvn -logf _ZGVnN4v_logf $runvn - -sinf __s_sinf $runs -sinf __v_sinf $runv -sinf __vn_sinf $runvn -sinf _ZGVnN4v_sinf $runvn - -cosf __s_cosf $runs -cosf __v_cosf $runv -cosf __vn_cosf $runvn -cosf _ZGVnN4v_cosf $runvn - -powf __s_powf $runs -f -powf __v_powf $runv -f -powf __vn_powf $runvn -f -powf _ZGVnN4vv_powf $runvn -f +exp _ZGVnN2v_exp +log _ZGVnN2v_log +pow _ZGVnN2vv_pow -f +sin _ZGVnN2v_sin -z +cos _ZGVnN2v_cos +expf _ZGVnN4v_expf +expf_1u _ZGVnN4v_expf_1u -f +exp2f _ZGVnN4v_exp2f +exp2f_1u _ZGVnN4v_exp2f_1u -f +logf _ZGVnN4v_logf +sinf _ZGVnN4v_sinf -z +cosf _ZGVnN4v_cosf +powf _ZGVnN4vv_powf -f EOF [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" exit 1 } diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst new file mode 100644 index 000000000000..2cf4273bd1d7 --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst @@ -0,0 +1,15 @@ +; Directed test 
cases for exp10 +; +; Copyright (c) 2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 +func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/ulp.c b/contrib/arm-optimized-routines/math/test/ulp.c index bb8c3ad69900..5ff29972e50e 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.c +++ b/contrib/arm-optimized-routines/math/test/ulp.c @@ -1,855 +1,828 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include <ctype.h> #include <fenv.h> #include <float.h> #include <math.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "mathlib.h" /* Don't depend on mpfr by default. */ #ifndef USE_MPFR # define USE_MPFR 0 #endif #if USE_MPFR # include <mpfr.h> #endif -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - static inline uint64_t asuint64 (double f) { union { double f; uint64_t i; } u = {f}; return u.i; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } static inline uint32_t asuint (float f) { union { float f; uint32_t i; } u = {f}; return u.i; } static inline float asfloat (uint32_t i) { union { uint32_t i; float f; } u = {i}; return u.f; } static uint64_t seed = 0x0123456789abcdef; static uint64_t rand64 (void) { seed = 6364136223846793005ull * seed + 1; return seed ^ (seed >> 32); } /* Uniform random in [0,n]. */ static uint64_t randn (uint64_t n) { uint64_t r, m; if (n == 0) return 0; n++; if (n == 0) return rand64 (); for (;;) { r = rand64 (); m = r % n; if (r - m <= -n) return m; } } struct gen { uint64_t start; uint64_t len; uint64_t start2; uint64_t len2; uint64_t off; uint64_t step; uint64_t cnt; }; struct args_f1 { float x; }; struct args_f2 { float x; float x2; }; struct args_d1 { double x; }; struct args_d2 { double x; double x2; }; /* result = y + tail*2^ulpexp. */ struct ret_f { float y; double tail; int ulpexp; int ex; int ex_may; }; struct ret_d { double y; double tail; int ulpexp; int ex; int ex_may; }; static inline uint64_t next1 (struct gen *g) { /* For single argument use randomized incremental steps, that produce dense sampling without collisions and allow testing all inputs in a range. */ uint64_t r = g->start + g->off; g->off += g->step + randn (g->step / 2); if (g->off > g->len) g->off -= g->len; /* hack. */ return r; } static inline uint64_t next2 (uint64_t *x2, struct gen *g) { /* For two arguments use uniform random sampling.
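Both coordinates are drawn independently with randn above, which stays unbiased: its r - m <= -n test (evaluated in unsigned arithmetic) accepts r only when r's whole block of equal residues fits below 2^64, and retries otherwise.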
*/ uint64_t r = g->start + randn (g->len); *x2 = g->start2 + randn (g->len2); return r; } static struct args_f1 next_f1 (void *g) { return (struct args_f1){asfloat (next1 (g))}; } static struct args_f2 next_f2 (void *g) { uint64_t x2; uint64_t x = next2 (&x2, g); return (struct args_f2){asfloat (x), asfloat (x2)}; } static struct args_d1 next_d1 (void *g) { return (struct args_d1){asdouble (next1 (g))}; } static struct args_d2 next_d2 (void *g) { uint64_t x2; uint64_t x = next2 (&x2, g); return (struct args_d2){asdouble (x), asdouble (x2)}; } struct conf { int r; int rc; int quiet; int mpfr; int fenv; unsigned long long n; double softlim; double errlim; + int ignore_zero_sign; }; /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH +#ifdef __vpcs typedef __f32x4_t v_float; typedef __f64x2_t v_double; /* First element of fv and dv may be changed by -c argument. */ static float fv[2] = {1.0f, -INFINITY}; static double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } #if WANT_SVE_MATH #include typedef __SVFloat32_t sv_float; typedef __SVFloat64_t sv_double; static inline sv_float svargf(float x) { int n = svcntw(); float base[n]; for (int i=0; i> 23 & 0xff; if (!e) e++; return e - 0x7f - 23; } static inline int ulpscale_d (double x) { int e = asuint64 (x) >> 52 & 0x7ff; if (!e) e++; return e - 0x3ff - 52; } static inline float call_f1 (const struct fun *f, struct args_f1 a) { return f->fun.f1 (a.x); } static inline float call_f2 (const struct fun *f, struct args_f2 a) { return f->fun.f2 (a.x, a.x2); } static inline double call_d1 (const struct fun *f, struct args_d1 a) { return f->fun.d1 (a.x); } static inline double call_d2 (const struct fun *f, struct args_d2 a) { return f->fun.d2 (a.x, a.x2); } static inline double call_long_f1 (const struct fun *f, struct args_f1 a) { return f->fun_long.f1 (a.x); } static inline double call_long_f2 (const struct fun *f, struct args_f2 a) { return f->fun_long.f2 (a.x, a.x2); } static inline long double call_long_d1 (const struct fun *f, struct args_d1 a) { return f->fun_long.d1 (a.x); } static inline long double call_long_d2 (const struct fun *f, struct args_d2 a) { return f->fun_long.d2 (a.x, a.x2); } static inline void printcall_f1 (const struct fun *f, struct args_f1 a) { printf ("%s(%a)", f->name, a.x); } static inline void printcall_f2 (const struct fun *f, struct args_f2 a) { printf ("%s(%a, %a)", f->name, a.x, a.x2); } static inline void printcall_d1 (const struct fun *f, struct args_d1 a) { printf ("%s(%a)", f->name, a.x); } static inline void printcall_d2 (const struct fun *f, struct args_d2 a) { printf ("%s(%a, %a)", f->name, a.x, a.x2); } static inline void printgen_f1 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a]", f->name, asfloat (gen->start), asfloat (gen->start + gen->len)); } static inline void printgen_f2 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a] x [%a;%a]", f->name, asfloat (gen->start), asfloat (gen->start + gen->len), asfloat (gen->start2), asfloat (gen->start2 + gen->len2)); } static inline void printgen_d1 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a]", f->name, asdouble (gen->start), asdouble (gen->start + gen->len)); } 
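/* The printgen_* helpers label the final summary line, which comes out roughly as "PASS exp in [0x1p-6;0x1p6] round n ..." - an illustrative rendering using printf's %a hex-float format, not verbatim tool output. */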
static inline void printgen_d2 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a] x [%a;%a]", f->name, asdouble (gen->start), asdouble (gen->start + gen->len), asdouble (gen->start2), asdouble (gen->start2 + gen->len2)); } #define reduce_f1(a, f, op) (f (a.x)) #define reduce_f2(a, f, op) (f (a.x) op f (a.x2)) #define reduce_d1(a, f, op) (f (a.x)) #define reduce_d2(a, f, op) (f (a.x) op f (a.x2)) #ifndef IEEE_754_2008_SNAN # define IEEE_754_2008_SNAN 1 #endif static inline int issignaling_f (float x) { uint32_t ix = asuint (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7fc00000) == 0x7fc00000; return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; } static inline int issignaling_d (double x) { uint64_t ix = asuint64 (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } #if USE_MPFR static mpfr_rnd_t rmap (int r) { switch (r) { case FE_TONEAREST: return MPFR_RNDN; case FE_TOWARDZERO: return MPFR_RNDZ; case FE_UPWARD: return MPFR_RNDU; case FE_DOWNWARD: return MPFR_RNDD; } return -1; } #define prec_mpfr_f 50 #define prec_mpfr_d 80 #define prec_f 24 #define prec_d 53 #define emin_f -148 #define emin_d -1073 #define emax_f 128 #define emax_d 1024 static inline int call_mpfr_f1 (mpfr_t y, const struct fun *f, struct args_f1 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_f); mpfr_set_flt (x, a.x, MPFR_RNDN); return f->fun_mpfr.f1 (y, x, r); } static inline int call_mpfr_f2 (mpfr_t y, const struct fun *f, struct args_f2 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_f); MPFR_DECL_INIT (x2, prec_f); mpfr_set_flt (x, a.x, MPFR_RNDN); mpfr_set_flt (x2, a.x2, MPFR_RNDN); return f->fun_mpfr.f2 (y, x, x2, r); } static inline int call_mpfr_d1 (mpfr_t y, const struct fun *f, struct args_d1 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_d); mpfr_set_d (x, a.x, MPFR_RNDN); return f->fun_mpfr.d1 (y, x, r); } static inline int call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_d); MPFR_DECL_INIT (x2, prec_d); mpfr_set_d (x, a.x, MPFR_RNDN); mpfr_set_d (x2, a.x2, MPFR_RNDN); return f->fun_mpfr.d2 (y, x, x2, r); } #endif #define float_f float #define double_f double #define copysign_f copysignf #define nextafter_f nextafterf #define fabs_f fabsf #define asuint_f asuint #define asfloat_f asfloat #define scalbn_f scalbnf #define lscalbn_f scalbn #define halfinf_f 0x1p127f #define min_normal_f 0x1p-126f #define float_d double #define double_d long double #define copysign_d copysign #define nextafter_d nextafter #define fabs_d fabs #define asuint_d asuint64 #define asfloat_d asdouble #define scalbn_d scalbn #define lscalbn_d scalbnl #define halfinf_d 0x1p1023 #define min_normal_d 0x1p-1022 #define NEW_RT #define RT(x) x##_f #define T(x) x##_f1 #include "ulp.h" #undef T #define T(x) x##_f2 #include "ulp.h" #undef T #undef RT #define NEW_RT #define RT(x) x##_d #define T(x) x##_d1 #include "ulp.h" #undef T #define T(x) x##_d2 #include "ulp.h" #undef T #undef RT static void usage (void) { - puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func " + puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func " "lo [hi [x lo2 hi2] [count]]"); puts ("Compares func against a higher precision implementation in [lo; hi]."); puts ("-q: quiet."); puts ("-m: use mpfr even if faster method is available."); - puts ("-f: disable fenv testing (rounding modes and exceptions)."); -#if __aarch64__ && WANT_VMATH + puts ("-f: disable fenv exceptions 
testing."); +#ifdef ___vpcs puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n" " This should be different from tested input in other lanes, and non-special \n" " (i.e. should not trigger fenv exceptions). Default is 1."); #endif + puts ("-z: ignore sign of 0."); puts ("Supported func:"); for (const struct fun *f = fun; f->name; f++) printf ("\t%s\n", f->name); exit (1); } static int cmp (const struct fun *f, struct gen *gen, const struct conf *conf) { int r = 1; if (f->arity == 1 && f->singleprec) r = cmp_f1 (f, gen, conf); else if (f->arity == 2 && f->singleprec) r = cmp_f2 (f, gen, conf); else if (f->arity == 1 && !f->singleprec) r = cmp_d1 (f, gen, conf); else if (f->arity == 2 && !f->singleprec) r = cmp_d2 (f, gen, conf); else usage (); return r; } static uint64_t getnum (const char *s, int singleprec) { // int i; uint64_t sign = 0; // char buf[12]; if (s[0] == '+') s++; else if (s[0] == '-') { sign = singleprec ? 1ULL << 31 : 1ULL << 63; s++; } /* 0xXXXX is treated as bit representation, '-' flips the sign bit. */ if (s[0] == '0' && tolower (s[1]) == 'x' && strchr (s, 'p') == 0) return sign ^ strtoull (s, 0, 0); // /* SNaN, QNaN, NaN, Inf. */ // for (i=0; s[i] && i < sizeof buf; i++) // buf[i] = tolower(s[i]); // buf[i] = 0; // if (strcmp(buf, "snan") == 0) // return sign | (singleprec ? 0x7fa00000 : 0x7ff4000000000000); // if (strcmp(buf, "qnan") == 0 || strcmp(buf, "nan") == 0) // return sign | (singleprec ? 0x7fc00000 : 0x7ff8000000000000); // if (strcmp(buf, "inf") == 0 || strcmp(buf, "infinity") == 0) // return sign | (singleprec ? 0x7f800000 : 0x7ff0000000000000); /* Otherwise assume it's a floating-point literal. */ return sign | (singleprec ? asuint (strtof (s, 0)) : asuint64 (strtod (s, 0))); } static void parsegen (struct gen *g, int argc, char *argv[], const struct fun *f) { int singleprec = f->singleprec; int arity = f->arity; uint64_t a, b, a2, b2, n; if (argc < 1) usage (); b = a = getnum (argv[0], singleprec); n = 0; if (argc > 1 && strcmp (argv[1], "x") == 0) { argc -= 2; argv += 2; } else if (argc > 1) { b = getnum (argv[1], singleprec); if (argc > 2 && strcmp (argv[2], "x") == 0) { argc -= 3; argv += 3; } } b2 = a2 = getnum (argv[0], singleprec); if (argc > 1) b2 = getnum (argv[1], singleprec); if (argc > 2) n = strtoull (argv[2], 0, 0); if (argc > 3) usage (); //printf("ab %lx %lx ab2 %lx %lx n %lu\n", a, b, a2, b2, n); if (arity == 1) { g->start = a; g->len = b - a; if (n - 1 > b - a) n = b - a + 1; g->off = 0; g->step = n ? 
(g->len + 1) / n : 1; g->start2 = g->len2 = 0; g->cnt = n; } else if (arity == 2) { g->start = a; g->len = b - a; g->off = g->step = 0; g->start2 = a2; g->len2 = b2 - a2; g->cnt = n; } else usage (); } int main (int argc, char *argv[]) { const struct fun *f; struct gen gen; struct conf conf; conf.rc = 'n'; conf.quiet = 0; conf.mpfr = 0; conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; + conf.ignore_zero_sign = 0; for (;;) { argc--; argv++; if (argc < 1) usage (); if (argv[0][0] != '-') break; switch (argv[0][1]) { case 'e': argc--; argv++; if (argc < 1) usage (); conf.errlim = strtod (argv[0], 0); break; case 'f': conf.fenv = 0; break; case 'l': argc--; argv++; if (argc < 1) usage (); conf.softlim = strtod (argv[0], 0); break; case 'm': conf.mpfr = 1; break; case 'q': conf.quiet = 1; break; case 'r': conf.rc = argv[0][2]; if (!conf.rc) { argc--; argv++; - if (argc < 1) + if (argc < 1 || argv[0][1] != '\0') usage (); conf.rc = argv[0][0]; } break; -#if __aarch64__ && WANT_VMATH + case 'z': + conf.ignore_zero_sign = 1; + break; +#ifdef __vpcs case 'c': argc--; argv++; fv[0] = strtof(argv[0], 0); dv[0] = strtod(argv[0], 0); break; #endif default: usage (); } } switch (conf.rc) { case 'n': conf.r = FE_TONEAREST; break; case 'u': conf.r = FE_UPWARD; break; case 'd': conf.r = FE_DOWNWARD; break; case 'z': conf.r = FE_TOWARDZERO; break; default: usage (); } for (f = fun; f->name; f++) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) - usage (); + { +#ifndef __vpcs + /* Ignore vector math functions if vector math is not supported. */ + if (strncmp (argv[0], "_ZGVnN", 6) == 0) + exit (0); +#endif +#if !WANT_SVE_MATH + if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) + exit (0); +#endif + printf ("math function %s not supported\n", argv[0]); + exit (1); + } if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) { puts ("mpfr is not available."); return 0; } argc--; argv++; parsegen (&gen, argc, argv, f); conf.n = gen.cnt; return cmp (f, &gen, &conf); } diff --git a/contrib/arm-optimized-routines/math/test/ulp.h b/contrib/arm-optimized-routines/math/test/ulp.h index 327b4bd0fd06..b0bc59aeef8d 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.h +++ b/contrib/arm-optimized-routines/math/test/ulp.h @@ -1,362 +1,379 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* For each different math function type, T(x) should add a different suffix to x. RT(x) should add a return type specific suffix to x. */ #ifdef NEW_RT #undef NEW_RT # if USE_MPFR static int RT(ulpscale_mpfr) (mpfr_t x, int t) { /* TODO: pow of 2 cases. */ if (mpfr_regular_p (x)) { mpfr_exp_t e = mpfr_get_exp (x) - RT(prec); if (e < RT(emin)) e = RT(emin) - 1; if (e > RT(emax) - RT(prec)) e = RT(emax) - RT(prec); return e; } if (mpfr_zero_p (x)) return RT(emin) - 1; if (mpfr_inf_p (x)) return RT(emax) - RT(prec); /* NaN. */ return 0; } # endif /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. 
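Concretely, with the exact result represented as want = y + tail*2^ulpexp, the FE_TONEAREST branch below reports scalbn (got - y, -ulpexp) - tail, minus a further 0.5 when got is high and plus 0.5 when it is low; a result exactly one ulp above the true value therefore scores +0.5.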
*/ -static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) +static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, + int ignore_zero_sign) { RT(float) want = p->y; RT(float) d; double e; if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; + if (isnan (got) && isnan (want)) + /* Ignore sign of NaN. */ + return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY; if (signbit (got) != signbit (want)) - /* May have false positives with NaN. */ - //return isnan(got) && isnan(want) ? 0 : INFINITY; - return INFINITY; + { + /* Fall through to ULP calculation if ignoring sign of zero and + exactly one of want and got is non-zero. */ + if (ignore_zero_sign && want == got) + return 0.0; + if (!ignore_zero_sign || (want != 0 && got != 0)) + return INFINITY; + } if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) return INFINITY; if (isnan (want)) return 0; if (isinf (got)) { got = RT(copysign) (RT(halfinf), got); want *= 0.5f; } if (isinf (want)) { want = RT(copysign) (RT(halfinf), want); got *= 0.5f; } } if (r == FE_TONEAREST) { // TODO: incorrect when got vs want cross a powof2 boundary /* error = got > want ? got - want - tail ulp - 0.5 ulp : got - want - tail ulp + 0.5 ulp; */ d = got - want; e = d > 0 ? -p->tail - 0.5 : -p->tail + 0.5; } else { if ((r == FE_DOWNWARD && got < want) || (r == FE_UPWARD && got > want) || (r == FE_TOWARDZERO && fabs (got) < fabs (want))) got = RT(nextafter) (got, want); d = got - want; e = -p->tail; } return RT(scalbn) (d, -p->ulpexp) + e; } static int RT(isok) (RT(float) ygot, int exgot, RT(float) ywant, int exwant, int exmay) { return RT(asuint) (ygot) == RT(asuint) (ywant) && ((exgot ^ exwant) & ~exmay) == 0; } static int RT(isok_nofenv) (RT(float) ygot, RT(float) ywant) { return RT(asuint) (ygot) == RT(asuint) (ywant); } #endif static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { if (r != FE_TONEAREST) fesetround (r); feclearexcept (FE_ALL_EXCEPT); *y = T(call) (f, a); *ex = fetestexcept (FE_ALL_EXCEPT); if (r != FE_TONEAREST) fesetround (FE_TONEAREST); } static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { + if (r != FE_TONEAREST) + fesetround (r); *y = T(call) (f, a); *ex = 0; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { if (r != FE_TONEAREST) fesetround (r); feclearexcept (FE_ALL_EXCEPT); volatile struct T(args) va = a; // TODO: barrier a = va; RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; volatile RT(float) vy = p->y; // TODO: barrier (void) vy; p->ex = fetestexcept (FE_ALL_EXCEPT); if (r != FE_TONEAREST) fesetround (FE_TONEAREST); p->ex_may = FE_INEXACT; if (RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may)) return 1; p->ulpexp = RT(ulpscale) (p->y); if (isinf (p->y)) p->tail = RT(lscalbn) (yl - (RT(double)) 2 * RT(halfinf), -p->ulpexp); else p->tail = RT(lscalbn) (yl - p->y, -p->ulpexp); if (RT(fabs) (p->y) < RT(min_normal)) { /* TODO: subnormal result is treated as underflow even if it's exact since call_long may not raise inexact correctly.
*/ if (p->y != 0 || (p->ex & FE_INEXACT)) p->ex |= FE_UNDERFLOW | FE_INEXACT; } return 0; } static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { + if (r != FE_TONEAREST) + fesetround (r); RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); if (isinf (p->y)) p->tail = RT(lscalbn) (yl - (RT(double)) 2 * RT(halfinf), -p->ulpexp); else p->tail = RT(lscalbn) (yl - p->y, -p->ulpexp); return 0; } /* There are nan input args and all quiet. */ static inline int T(qnanpropagation) (struct T(args) a) { return T(reduce) (a, isnan, ||) && !T(reduce) (a, RT(issignaling), ||); } static inline RT(float) T(sum) (struct T(args) a) { return T(reduce) (a, , +); } /* returns 1 if the got result is ok. */ static inline int T(call_mpfr_fix) (const struct fun *f, struct T(args) a, int r_fenv, struct RT(ret) * p, RT(float) ygot, int exgot) { #if USE_MPFR int t, t2; mpfr_rnd_t r = rmap (r_fenv); MPFR_DECL_INIT(my, RT(prec_mpfr)); MPFR_DECL_INIT(mr, RT(prec)); MPFR_DECL_INIT(me, RT(prec_mpfr)); mpfr_clear_flags (); t = T(call_mpfr) (my, f, a, r); /* Double rounding. */ t2 = mpfr_set (mr, my, r); if (t2) t = t2; mpfr_set_emin (RT(emin)); mpfr_set_emax (RT(emax)); t = mpfr_check_range (mr, t, r); t = mpfr_subnormalize (mr, t, r); mpfr_set_emax (MPFR_EMAX_DEFAULT); mpfr_set_emin (MPFR_EMIN_DEFAULT); p->y = mpfr_get_d (mr, r); p->ex = t ? FE_INEXACT : 0; p->ex_may = FE_INEXACT; if (mpfr_underflow_p () && (p->ex & FE_INEXACT)) /* TODO: handle before and after rounding uflow cases. */ p->ex |= FE_UNDERFLOW; if (mpfr_overflow_p ()) p->ex |= FE_OVERFLOW | FE_INEXACT; if (mpfr_divby0_p ()) p->ex |= FE_DIVBYZERO; //if (mpfr_erangeflag_p ()) // p->ex |= FE_INVALID; if (!mpfr_nanflag_p () && RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may)) return 1; if (mpfr_nanflag_p () && !T(qnanpropagation) (a)) p->ex |= FE_INVALID; p->ulpexp = RT(ulpscale_mpfr) (my, t); if (!isfinite (p->y)) { p->tail = 0; if (isnan (p->y)) { /* If an input was nan keep its sign. */ p->y = T(sum) (a); if (!isnan (p->y)) p->y = (p->y - p->y) / (p->y - p->y); return RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may); } mpfr_set_si_2exp (mr, signbit (p->y) ? -1 : 1, 1024, MPFR_RNDN); if (mpfr_cmpabs (my, mr) >= 0) return RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may); } mpfr_sub (me, my, mr, MPFR_RNDN); mpfr_mul_2si (me, me, -p->ulpexp, MPFR_RNDN); p->tail = mpfr_get_d (me, MPFR_RNDN); return 0; #else abort (); #endif } static int T(cmp) (const struct fun *f, struct gen *gen, const struct conf *conf) { double maxerr = 0; uint64_t cnt = 0; uint64_t cnt1 = 0; uint64_t cnt2 = 0; uint64_t cntfail = 0; int r = conf->r; int use_mpfr = conf->mpfr; int fenv = conf->fenv; for (;;) { struct RT(ret) want; struct T(args) a = T(next) (gen); int exgot; int exgot2; RT(float) ygot; RT(float) ygot2; int fail = 0; if (fenv) T(call_fenv) (f, a, r, &ygot, &exgot); else T(call_nofenv) (f, a, r, &ygot, &exgot); if (f->twice) { secondcall = 1; if (fenv) T(call_fenv) (f, a, r, &ygot2, &exgot2); else T(call_nofenv) (f, a, r, &ygot2, &exgot2); secondcall = 0; if (RT(asuint) (ygot) != RT(asuint) (ygot2)) { fail = 1; cntfail++; T(printcall) (f, a); printf (" got %a then %a for same input\n", ygot, ygot2); } } cnt++; int ok = use_mpfr ? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot) : (fenv ? 
T(call_long_fenv) (f, a, r, &want, ygot, exgot) : T(call_long_nofenv) (f, a, r, &want, ygot, exgot)); if (!ok) { int print = 0; - double err = RT(ulperr) (ygot, &want, r); + double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) cnt1++; if (abserr > 1) cnt2++; if (abserr > conf->errlim) { print = 1; if (!fail) { fail = 1; cntfail++; } } if (abserr > maxerr) { maxerr = abserr; if (!conf->quiet && abserr > conf->softlim) print = 1; } if (print) { T(printcall) (f, a); // TODO: inf ulp handling printf (" got %a want %a %+g ulp err %g\n", ygot, want.y, want.tail, err); } int diff = fenv ? exgot ^ want.ex : 0; if (fenv && (diff & ~want.ex_may)) { if (!fail) { fail = 1; cntfail++; } T(printcall) (f, a); printf (" is %a %+g ulp, got except 0x%0x", want.y, want.tail, exgot); if (diff & exgot) printf (" wrongly set: 0x%x", diff & exgot); if (diff & ~exgot) printf (" wrongly clear: 0x%x", diff & ~exgot); putchar ('\n'); } } if (cnt >= conf->n) break; if (!conf->quiet && cnt % 0x100000 == 0) printf ("progress: %6.3f%% cnt %llu cnt1 %llu cnt2 %llu cntfail %llu " "maxerr %g\n", 100.0 * cnt / conf->n, (unsigned long long) cnt, (unsigned long long) cnt1, (unsigned long long) cnt2, (unsigned long long) cntfail, maxerr); } double cc = cnt; if (cntfail) printf ("FAIL "); else printf ("PASS "); T(printgen) (f, gen); printf (" round %c errlim %g maxerr %g %s cnt %llu cnt1 %llu %g%% cnt2 %llu " "%g%% cntfail %llu %g%%\n", conf->rc, conf->errlim, maxerr, conf->r == FE_TONEAREST ? "+0.5" : "+1.0", (unsigned long long) cnt, (unsigned long long) cnt1, 100.0 * cnt1 / cc, (unsigned long long) cnt2, 100.0 * cnt2 / cc, (unsigned long long) cntfail, 100.0 * cntfail / cc); return !!cntfail; } diff --git a/contrib/arm-optimized-routines/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/math/test/ulp_funcs.h index f5cea4d6d14c..84f7927d3935 100644 --- a/contrib/arm-optimized-routines/math/test/ulp_funcs.h +++ b/contrib/arm-optimized-routines/math/test/ulp_funcs.h @@ -1,78 +1,40 @@ /* * Function entries for ulp. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ F1 (sin) F1 (cos) F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) F1 (exp) F1 (exp2) F1 (log) F1 (log2) F2 (pow) F1 (erf) D1 (exp) + D1 (exp10) D1 (exp2) D1 (log) D1 (log2) D2 (pow) D1 (erf) -#if WANT_VMATH - F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) - F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) - F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0) - F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) - F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0) - F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) - F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) - F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) - F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) -#if __aarch64__ - F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) - F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) - F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) #ifdef __vpcs - F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) - F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) - F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) #endif -#endif -#endif +/* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h index fd9e00c0310f..60dc3d6dd652 100644 --- a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h +++ 
b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h @@ -1,59 +1,37 @@ /* * Function wrappers for ulp. * - * Copyright (c) 2022, Arm Limited. + * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ + /* Wrappers for sincos. */ static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} static double sincos_sin(double x) {(void)cos(x); return sin(x);} static double sincos_cos(double x) {(void)sin(x); return cos(x);} #if USE_MPFR static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } #endif /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH -static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } -static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } -static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } -static float v_expf(float x) { return __v_expf(argf(x))[0]; } -static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } -static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } -static float v_logf(float x) { return __v_logf(argf(x))[0]; } -static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } -static double v_sin(double x) { return __v_sin(argd(x))[0]; } -static double v_cos(double x) { return __v_cos(argd(x))[0]; } -static double v_exp(double x) { return __v_exp(argd(x))[0]; } -static double v_log(double x) { return __v_log(argd(x))[0]; } -static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } #ifdef __vpcs -static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } -static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } -static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } -static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } -static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } -static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } -static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } -static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } -static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } -static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } -static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } -static double vn_log(double x) { return __vn_log(argd(x))[0]; } -static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } static double Z_pow(double x, double y) { 
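  /* Descriptive note on the wrapper pattern above: each Z_* wrapper
     broadcasts its scalar argument into a vector with argf/argd,
     calls the _ZGVnN* vector routine, and returns lane 0, so the
     scalar ulp harness can exercise the vector code one lane at a
     time. */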
return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
 #endif
-#endif
+
+/* clang-format on */
diff --git a/contrib/arm-optimized-routines/math/tgamma128.c b/contrib/arm-optimized-routines/math/tgamma128.c
new file mode 100644
index 000000000000..65deacc49d99
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/tgamma128.c
@@ -0,0 +1,356 @@
+/*
+ * Implementation of the true gamma function (as opposed to lgamma)
+ * for 128-bit long double.
+ *
+ * Copyright (c) 2006-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/*
+ * This module implements the float128 gamma function under the name
+ * tgamma128. It's expected to be suitable for integration into system
+ * maths libraries under the standard name tgammal, if long double is
+ * 128-bit. Such a library will probably want to check the error
+ * handling and optimize the initial process of extracting the
+ * exponent, which is done here by simple and portable (but
+ * potentially slower) methods.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+/* Only binary128 format is supported. */
+#if LDBL_MANT_DIG == 113
+
+#include "tgamma128.h"
+
+#define lenof(x) (sizeof(x)/sizeof(*(x)))
+
+/*
+ * Helper routine to evaluate a polynomial via Horner's rule
+ */
+static long double poly(const long double *coeffs, size_t n, long double x)
+{
+    long double result = coeffs[--n];
+
+    while (n > 0)
+        result = (result * x) + coeffs[--n];
+
+    return result;
+}
+
+/*
+ * Compute sin(pi*x) / pi, for use in the reflection formula that
+ * relates gamma(-x) and gamma(x).
+ */
+static long double sin_pi_x_over_pi(long double x)
+{
+    int quo;
+    long double fracpart = remquol(x, 0.5L, &quo);
+
+    long double sign = 1.0L;
+    if (quo & 2)
+        sign = -sign;
+    quo &= 1;
+
+    if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
+        /* For numbers this size, sin(pi*x) is so close to pi*x that
+         * sin(pi*x)/pi is indistinguishable from x in float128 */
+        return sign * fracpart;
+    }
+
+    if (quo == 0) {
+        return sign * sinl(pi*fracpart) / pi;
+    } else {
+        return sign * cosl(pi*fracpart) / pi;
+    }
+}
+
+/* Return tgamma(x) on the assumption that x >= 8. */
+static long double tgamma_large(long double x,
+                                bool negative, long double negadjust)
+{
+    /*
+     * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
+     * where K is a correction factor computed as a polynomial in 1/x.
+     *
+     * (Vaguely inspired by the form of the Lanczos approximation, but
+     * I tried the Lanczos approximation itself and it suffers badly
+     * from big cancellation leading to loss of significance.)
+     */
+    long double t = 1/x;
+    long double p = poly(coeffs_large, lenof(coeffs_large), t);
+
+    /*
+     * To avoid overflow in cases where x^(x-0.5) does overflow
+     * but gamma(x) does not, we split x^(x-0.5) in half and
+     * multiply back up _after_ multiplying the shrinking factor
+     * of exp(-(x-0.5)).
+     *
+     * Note that computing x-0.5 and (x-0.5)/2 is exact for the
+     * relevant range of x, so the only sources of error are pow
+     * and exp themselves, plus the multiplications.
+     */
+    long double powhalf = powl(x, (x-0.5L)/2.0L);
+    long double expret = expl(-(x-0.5L));
+
+    if (!negative) {
+        return (expret * powhalf) * powhalf * p;
+    } else {
+        /*
+         * Apply the reflection formula as commented below, but
+         * carefully: negadjust has magnitude less than 1, so it can
+         * turn a case where gamma(+x) would overflow into a case
+         * where gamma(-x) doesn't underflow. Not only that, but the
+         * FP format has greater range in the tiny domain due to
+         * denormals. For both reasons, it's not good enough to
+         * compute the positive result and then adjust it.
+         */
+        long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
+        return ret / powhalf;
+    }
+}
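/* Illustrative sketch (hypothetical, not part of the imported file):
   why the half-power split above matters.  Near the top of the range,
   x^(x-1/2) alone exceeds binary128 range even though gamma(x) does
   not, so the code interleaves the shrinking factor exp(-(x-1/2))
   between the two half-powers.  The function name and the decimal
   magnitudes in the comments are this sketch's own, and approximate. */
static long double tgamma_large_split_demo(void)
{
    long double x = 1755.0L;                          /* near max_x (~1755.5) */
    long double powhalf = powl(x, (x - 0.5L) / 2.0L); /* ~1e2846: finite */
    long double expret = expl(-(x - 0.5L));           /* ~1e-762: normal */
    /* (expret * powhalf) is ~1e2084, and one more * powhalf is ~1e4930,
       still below the binary128 maximum (~1.19e4932); computing
       powl(x, x - 0.5L) in one go would be ~1e5692, i.e. +inf. */
    return (expret * powhalf) * powhalf;
}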
+
+/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
+static long double tgamma_tiny(long double x,
+                               bool negative, long double negadjust)
+{
+    /*
+     * For x near zero, we use a polynomial approximation to
+     * g = 1/(x*gamma(x)), and then return 1/(g*x).
+     */
+    long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
+    if (!negative)
+        return 1.0L / (g*x);
+    else
+        return g / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
+static long double tgamma_ultratiny(long double x, bool negative,
+                                    long double negadjust)
+{
+    /* On this interval, gamma can't even be distinguished from 1/x,
+     * so we skip the polynomial evaluation in tgamma_tiny, partly to
+     * save time and partly to avoid the tiny intermediate values
+     * setting the underflow exception flag. */
+    if (!negative)
+        return 1.0L / x;
+    else
+        return 1.0L / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
+static long double tgamma_central(long double x)
+{
+    /*
+     * In this central interval, our strategy is to find the
+     * difference between x and the point where gamma has a minimum,
+     * and approximate based on that.
+     */
+
+    /* The difference between the input x and the minimum x. The first
+     * subtraction is expected to be exact, since x and min_x_hi have
+     * the same exponent (unless x=2, in which case it will still be
+     * exact). */
+    long double t = (x - min_x_hi) - min_x_lo;
+
+    /*
+     * Now use two different polynomials for the intervals [1,m] and
+     * [m,2].
+     */
+    long double p;
+    if (t < 0)
+        p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
+    else
+        p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
+
+    return (min_y_lo + p * (t*t)) + min_y_hi;
+}
+
+long double tgamma128(long double x)
+{
+    /*
+     * Start by extracting the number's sign and exponent, and ruling
+     * out cases of non-normalized numbers.
+     *
+     * For an implementation integrated into a system libm, it would
+     * almost certainly be quicker to do this by direct bitwise access
+     * to the input float128 value, using whatever is the local idiom
+     * for knowing its endianness.
+     *
+     * Integration into a system libc may also need to worry about
+     * setting errno, if that's the locally preferred way to report
+     * math.h errors.
+     */
+    int sign = signbit(x);
+    int exponent;
+    switch (fpclassify(x)) {
+      case FP_NAN:
+        return x+x; /* propagate QNaN, make SNaN throw an exception */
+      case FP_ZERO:
+        return 1/x; /* divide by zero on purpose to indicate a pole */
+      case FP_INFINITE:
+        if (sign) {
+            return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
+                         * IEEE invalid operation exception to indicate that */
+        }
+        return x; /* but gamma(+inf) is just +inf with no error */
+      case FP_SUBNORMAL:
+        exponent = -16384;
+        break;
+      default:
+        frexpl(x, &exponent);
+        exponent--;
+        break;
+    }
+
+    bool negative = false;
+    long double negadjust = 0.0L;
+
+    if (sign) {
+        /*
+         * Euler's reflection formula is
+         *
+         *    gamma(1-x) gamma(x) = pi/sin(pi*x)
+         *
+         *                             pi
+         *    => gamma(x) = --------------------
+         *                  gamma(1-x) sin(pi*x)
+         *
+         * But computing 1-x is going to lose a lot of accuracy when x
+         * is very small, so instead we transform using the recurrence
+         * gamma(t+1)=t gamma(t).
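+         * (Concretely, t = 1/4 gives gamma(5/4) = gamma(1/4)/4.)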
+         * Setting t=-x, this gives us
+         * gamma(1-x) = -x gamma(-x), so we now have
+         *
+         *                               pi
+         *    gamma(x) = ----------------------
+         *               -x gamma(-x) sin(pi*x)
+         *
+         * which relates gamma(x) to gamma(-x), which is much nicer,
+         * since x can be turned into -x without rounding.
+         */
+        negadjust = sin_pi_x_over_pi(x);
+        negative = true;
+        x = -x;
+
+        /*
+         * Now the ultimate answer we want is
+         *
+         *    1 / (gamma(x) * x * negadjust)
+         *
+         * where x is the positive value we've just turned it into.
+         *
+         * For some of the cases below, we'll compute gamma(x)
+         * normally and then compute this adjusted value afterwards.
+         * But for others, we can implement the reciprocal operation
+         * in this formula by _avoiding_ an inversion that the
+         * sub-case was going to do anyway.
+         */
+
+        if (negadjust == 0) {
+            /*
+             * Special case for negative integers. Applying the
+             * reflection formula would cause division by zero, but
+             * standards would prefer we treat this error case as an
+             * invalid operation and return NaN instead. (Possibly
+             * because otherwise you'd have to decide which sign of
+             * infinity to return, and unlike the x=0 case, there's no
+             * sign of zero available to disambiguate.)
+             */
+            return negadjust / negadjust;
+        }
+    }
+
+    /*
+     * Split the positive domain into various cases. For cases where
+     * we do the negative-number adjustment the usual way, we'll leave
+     * the answer in 'g' and drop out of the if statement.
+     */
+    long double g;
+
+    if (exponent >= 11) {
+        /*
+         * gamma of any positive value this large overflows, and gamma
+         * of any negative value underflows.
+         */
+        if (!negative) {
+            long double huge = 0x1p+12288L;
+            return huge * huge; /* provoke an overflow */
+        } else {
+            long double tiny = 0x1p-12288L;
+            return tiny * tiny * negadjust; /* underflow, of the right sign */
+        }
+    } else if (exponent >= 3) {
+        /* Negative-number adjustment happens inside here */
+        return tgamma_large(x, negative, negadjust);
+    } else if (exponent < -113) {
+        /* Negative-number adjustment happens inside here */
+        return tgamma_ultratiny(x, negative, negadjust);
+    } else if (exponent < -5) {
+        /* Negative-number adjustment happens inside here */
+        return tgamma_tiny(x, negative, negadjust);
+    } else if (exponent == 0) {
+        g = tgamma_central(x);
+    } else if (exponent < 0) {
+        /*
+         * For x in [1/32,1) we range-reduce upwards to the interval
+         * [1,2), using the inverse of the normal recurrence formula:
+         * gamma(x) = gamma(x+1)/x.
+         */
+        g = tgamma_central(1+x) / x;
+    } else {
+        /*
+         * For x in [2,8) we range-reduce downwards to the interval
+         * [1,2) by repeated application of the recurrence formula.
+         *
+         * Actually multiplying (x-1) by (x-2) by (x-3) and so on
+         * would introduce multiple ULPs of rounding error. We can get
+         * better accuracy by writing x = (k+1/2) + t, where k is an
+         * integer and |t|<1/2, and expanding out the obvious factor
+         * (x-1)(x-2)...(x-k+1) as a polynomial in t.
+         */
+        long double mult;
+        int i = x;
+        if (i == 2) { /* x in [2,3) */
+            mult = (x-1);
+        } else {
+            long double t = x - (i + 0.5L);
+            switch (i) {
+              /* E.g.
for x=3.5+t, we want + * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ + case 3: + mult = 3.75L+t*(4.0L+t); + break; + case 4: + mult = 13.125L+t*(17.75L+t*(7.5L+t)); + break; + case 5: + mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); + break; + case 6: + mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( + 117.5L+t*(17.5L+t)))); + break; + case 7: + mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( + 1140.0L+t*(231.25L+t*(24.0L+t))))); + break; + } + } + + g = tgamma_central(x - (i-1)) * mult; + } + + if (!negative) { + /* Positive domain: return g unmodified */ + return g; + } else { + /* Negative domain: apply the reflection formula as commented above */ + return 1.0L / (g * x * negadjust); + } +} + +#endif diff --git a/contrib/arm-optimized-routines/math/tgamma128.h b/contrib/arm-optimized-routines/math/tgamma128.h new file mode 100644 index 000000000000..90875a22dce4 --- /dev/null +++ b/contrib/arm-optimized-routines/math/tgamma128.h @@ -0,0 +1,141 @@ +/* + * Polynomial coefficients and other constants for tgamma128.c. + * + * Copyright (c) 2006-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* The largest positive value for which 128-bit tgamma does not overflow. */ +static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; + +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +static const long double coeffs_large[] = { + 0x1.8535745aa79569579b9eec0f3bbcp+0L, + 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, + 0x1.59f6a05094f69686c3380f4e2783p-8L, + -0x1.0b291dee952a82764a4859b081a6p-8L, + -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, + 0x1.387a8b5f38dd77e7f139b1021e86p-10L, + 0x1.bca46637f65b13750c728cc29e40p-14L, + -0x1.d80401c00aef998c9e303151a51cp-11L, + -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, + 0x1.4e950204437dcaf2be77f73a6f45p-10L, + 0x1.cb711a2d65f188bf60110934d6bep-14L, + -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, + -0x1.0305ab9760cddb0d833e73766836p-12L, + 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, + 0x1.bb4144740ad9290123fdcea684aap-11L, + -0x1.72ab4e88272a229bfafd192450f0p-5L, + 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, + 0x1.e222791c6743ce3e3cae220fb236p-3L, + 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, + -0x1.9d204fa235a42cd901b123d2ad47p+1L, + 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, + 0x1.37f900a11dbd892abd7dde533e2dp+5L, + -0x1.2da49f4188dd89cb958369ef2401p+7L, + 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, + -0x1.61433cebe649098c9611c4c7774ap+7L, +}; + +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +static const long double coeffs_tiny[] = { + 0x1.0000000000000000000000000000p+0L, + 0x1.2788cfc6fb618f49a37c7f0201fep-1L, + -0x1.4fcf4026afa2dceb8490ade22796p-1L, + -0x1.5815e8fa27047c8f42b5d9217244p-5L, + 0x1.5512320b43fbe5dfa771333518f7p-3L, + -0x1.59af103c340927bffdd44f954bfcp-5L, + -0x1.3b4af28483e210479657e5543366p-7L, + 0x1.d919c527f6070bfce9b29c2ace9cp-8L, + -0x1.317112ce35337def3556a18aa178p-10L, + -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, + 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, + -0x1.51cf9f090b5dc398ba86305e3634p-16L, + -0x1.4e80f64c04a339740de06ca9fa4ap-20L, + 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, +}; + +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; +static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; + +/* The actual minimum value that gamma takes at that location. 
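+ * (This is the well-known minimum of gamma on the positive axis,
+ * near x = 1.46163, where gamma(x) is about 0.885603.)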
+ * Again specified as the sum of two 128-bit values. */ +static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; +static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +static const long double coeffs_central_neg[] = { + 0x1.b6c53f7377b83839c8a292e43b69p-2L, + 0x1.0bae9f40c7d09ed76e732045850ap-3L, + 0x1.4981175e14d04c3530e51d01c5fep-3L, + 0x1.79f77aaf032c948af3a9edbd2061p-4L, + 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, + 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, + 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, + 0x1.6df1de1e178ef72ca7bd63d40870p-6L, + 0x1.f63f502bde27e81c0f5e13479b43p-7L, + 0x1.57fd67d901f40ea011353ad89a0ap-7L, + 0x1.d7151376eed187eb753e2273cafcp-8L, + 0x1.427162b5c6ff1d904c71ef53e37cp-8L, + 0x1.b954b8c3a56cf93e49ef6538928ap-9L, + 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, + 0x1.9d35250d9b9378d9b59df734537ap-10L, + 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, + 0x1.7e0db39bb99cdb52b028d9359380p-11L, + 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, + 0x1.27521cf5fd24dcdf43524e6add11p-13L, + 0x1.06461d62243bf9a826b42349672fp-10L, + -0x1.2b852abead28209b4e0c756dc46ep-9L, + 0x1.be673c11a72c826115ec6d286c14p-8L, + -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, + 0x1.fa362bd2dc68f41abef2d8600acdp-6L, + -0x1.a21585b2f52f8b23855de8e452edp-5L, + 0x1.1f234431ed032052fc92e64e0493p-4L, + -0x1.40d332476ca0199c60cdae3f9132p-4L, + 0x1.1d45dc665d86012eba2eea199cefp-4L, + -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, + 0x1.7e7e2fbc6d49ad484300d6add324p-6L, + -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, + 0x1.30a2a73944f8c84998314d69c23fp-10L, +}; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +static const long double coeffs_central_pos[] = { + 0x1.b6c53f7377b83839c8a292e22aa2p-2L, + -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, + 0x1.4981175e14d04c3530ee5e1ecebcp-3L, + -0x1.79f77aaf032c948ac983d77f3e07p-4L, + 0x1.1e97bd10821095ab7dc94936cc11p-4L, + -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, + 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, + -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, + 0x1.f63f502be57d11aed2cfe90843ffp-7L, + -0x1.57fd67d852f230015b9f64770273p-7L, + 0x1.d715138adc07e5fce81077070357p-8L, + -0x1.4271618e9fda8992a667adb15f4fp-8L, + 0x1.b954d15d9eb772e80fdd760672d7p-9L, + -0x1.2dfe391241d3cb79c8c15182843dp-9L, + 0x1.9d44396fcd48451c3ba924cee814p-10L, + -0x1.1ac195fb99739e341589e39803e6p-10L, + 0x1.82e46127b68f002770826e25f146p-11L, + -0x1.089dacd90d9f41493119ac178359p-11L, + 0x1.6993c007b20394a057d21f3d37f8p-12L, + -0x1.ec43a709f4446560c099dec8e31bp-13L, + 0x1.4ba36322f4074e9add9450f003cap-13L, + -0x1.b3f83a977965ca1b7937bf5b34cap-14L, + 0x1.10af346abc09cb25a6d9fe810b6ep-14L, + -0x1.38d8ea1188f242f50203edc395bdp-15L, + 0x1.39add987a948ec56f62b721a4475p-16L, + -0x1.02a4e141f286c8a967e2df9bc9adp-17L, + 0x1.433b50af22425f546e87113062d7p-19L, + -0x1.0c7b73cb0013f00aafc103e8e382p-21L, + 0x1.b852de313ec38da2297f6deaa6b4p-25L, +}; + +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl new file mode 100644 index 000000000000..ecec174110ea --- /dev/null +++ b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl @@ -0,0 +1,212 @@ +# -*- julia -*- +# +# Generate tgamma128.h, containing 
polynomials and constants used by
+# tgamma128.c.
+#
+# Copyright (c) 2006-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception

+# This Julia program depends on the 'Remez' and 'SpecialFunctions'
+# library packages. To install them, run this at the interactive Julia
+# prompt:
+#
+#     import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
+#
+# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
+
+import Printf
+import Remez
+import SpecialFunctions
+
+# Round a BigFloat to 128-bit long double and format it as a C99 hex
+# float literal.
+function quadhex(x)
+    sign = " "
+    if x < 0
+        sign = "-"
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    mantissa = BigInt(round(x))
+
+    mantstr = string(mantissa, base=16, pad=29)
+    return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
+                           exponent)
+end
+
+# Round a BigFloat to 128-bit long double and return it still as a
+# BigFloat.
+function quadval(x, round=0)
+    sign = +1
+    if x.sign < 0
+        sign = -1
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    if round < 0
+        mantissa = floor(x)
+    elseif round > 0
+        mantissa = ceil(x)
+    else
+        mantissa = Base.round(x)
+    end
+
+    return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Output an array of BigFloats as a C array declaration.
+function dumparray(a, name)
+    println("static const long double ", name, "[] = {")
+    for x in a
+        println(" ", quadhex(x), ",")
+    end
+    println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+")
+lo = BigFloat("1000")
+hi = BigFloat("2000")
+while true
+    global lo
+    global hi
+    global max_x
+
+    mid = (lo + hi) / 2
+    if mid == lo || mid == hi
+        max_x = mid
+        break
+    end
+    if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
+        lo = mid
+    else
+        hi = mid
+    end
+end
+max_x = quadval(max_x, -1)
+println("static const long double max_x = ", quadhex(max_x), ";")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+    x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
+        exp(SpecialFunctions.logabsgamma(1/x)[1] +
+            (1/x-0.5)*(1+log(x))),
+    (0, 1/BigFloat(8)),
+    24, 0,
+    (x, y) -> 1/y
+)
+dumparray(N, "coeffs_large")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+    x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
+    (0, 1/BigFloat(32)),
+    13, 0,
+)
+dumparray(N, "coeffs_tiny")
+
+print("
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision.
*/ +") +lo = BigFloat("1.4") +hi = BigFloat("1.5") +while true + global lo + global hi + global min_x + + mid = (lo + hi) / 2 + if mid == lo || mid == hi + min_x = mid + break + end + if SpecialFunctions.digamma(mid) < 0 + lo = mid + else + hi = mid + end +end +min_x_hi = quadval(min_x, -1) +println("static const long double min_x_hi = ", quadhex(min_x_hi), ";") +println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";") + +print(" +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. */ +") +min_y = SpecialFunctions.gamma(min_x) +min_y_hi = quadval(min_y, -1) +println("static const long double min_y_hi = ", quadhex(min_y_hi), ";") +println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";") + +function taylor_bodge(x) + # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2. + # Used in the Remez calls below for x values very near the origin, to avoid + # significance loss problems when trying to compute it directly via that + # formula (even in MPFR's extra precision). + return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) +end + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : + (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), + (0, min_x - 1), + 31, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_neg") + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) : + (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), + (0, 2 - min_x), + 28, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_pos") + +print(" +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +") +println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/contrib/arm-optimized-routines/math/v_cos.c b/contrib/arm-optimized-routines/math/v_cos.c deleted file mode 100644 index 4c8787e66c41..000000000000 --- a/contrib/arm-optimized-routines/math/v_cos.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. 
*/ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define HalfPi v_f64 (0x1.921fb54442d18p+0) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (cos, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(cos) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - -#if WANT_SIMD_EXCEPT - if (unlikely (v_any_u64 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - specialcase later. */ - r = v_sel_f64 (cmp, v_f64 (1.0), r); -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = v_fma_f64 (InvPi, r + HalfPi, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - n -= v_f64 (0.5); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_cosf.c b/contrib/arm-optimized-routines/math/v_cosf.c deleted file mode 100644 index bd677c3ae173..000000000000 --- a/contrib/arm-optimized-routines/math/v_cosf.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) -#define HalfPi v_f32 (0x1.921fb6p0f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. 
*/ - return v_call_f32 (cosf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(cosf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - -#if WANT_SIMD_EXCEPT - if (unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - specialcase later. */ - r = v_sel_f32 (cmp, v_f32 (1.0f), r); -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5 */ - n = v_fma_f32 (InvPi, r + HalfPi, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - n -= v_f32 (0.5f); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp.c b/contrib/arm-optimized-routines/math/v_exp.c deleted file mode 100644 index da23fd1c5f46..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED -#include "v_exp.h" - -#if V_EXP_TABLE_BITS == 7 -/* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ -#define C1 v_f64 (0x1.ffffffffffd43p-2) -#define C2 v_f64 (0x1.55555c75adbb2p-3) -#define C3 v_f64 (0x1.55555da646206p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ -#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) -#elif V_EXP_TABLE_BITS == 8 -/* maxerr: 0.54 +0.5 ulp - rel error: 1.4318*2^-58 - abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */ -#define C1 v_f64 (0x1.fffffffffffd4p-2) -#define C2 v_f64 (0x1.5555571d6b68cp-3) -#define C3 v_f64 (0x1.5555576a59599p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep8) -#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) -#endif - -#define N (1 << V_EXP_TABLE_BITS) -#define Tab __v_exp_data -#define IndexMask v_u64 (N - 1) -#define Shift v_f64 (0x1.8p+52) - -#if WANT_SIMD_EXCEPT - -#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */ -#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */ - -VPCS_ATTR static NOINLINE v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f64 (exp, x, y, cmp); -} - -#else - -#define Thres v_f64 (704.0) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. 
*/ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -#endif - -VPCS_ATTR -v_f64_t -V_NAME(exp) (v_f64_t x) -{ - v_f64_t n, r, r2, s, y, z; - v_u64_t cmp, u, e, i; - -#if WANT_SIMD_EXCEPT - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - specialcase to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - v_f64_t xm = x; - cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound - >= BigBound - TinyBound); - if (unlikely (v_any_u64 (cmp))) - x = v_sel_f64 (cmp, v_f64 (1), x); -#else - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); -#endif - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TABLE_BITS); - i = u & IndexMask; - - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - r2 = r * r; - y = v_fma_f64 (C2, r, C1); - y = v_fma_f64 (C3, r2, y); - y = v_fma_f64 (y, r2, r); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) -#if WANT_SIMD_EXCEPT - return specialcase (xm, v_fma_f64 (y, s, s), cmp); -#else - return specialcase (s, y, n); -#endif - - return v_fma_f64 (y, s, s); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp.h b/contrib/arm-optimized-routines/math/v_exp.h deleted file mode 100644 index 1e7f7f3b927d..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Declarations for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_EXP_TABLE_BITS 7 - -extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f.c b/contrib/arm-optimized-routines/math/v_exp2f.c deleted file mode 100644 index 7f40dbaa6679..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp2f.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.962 ulp. */ - 0x1.59977ap-10f, - 0x1.3ce9e4p-7f, - 0x1.c6bd32p-5f, - 0x1.ebf9bcp-3f, - 0x1.62e422p-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) - -#if WANT_SIMD_EXCEPT - -#define TinyBound 0x20000000 /* asuint (0x1p-63). */ -#define BigBound 0x42800000 /* asuint (0x1p6). */ - -VPCS_ATTR -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. 
*/ - return v_call_f32 (exp2f, x, y, cmp); -} - -#else - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -#endif - -VPCS_ATTR -v_f32_t -V_NAME(exp2f) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly; - v_u32_t cmp, e; - -#if WANT_SIMD_EXCEPT - cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound - >= BigBound - TinyBound); - v_f32_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - specialcase to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = v_sel_f32 (cmp, v_f32 (1), x); -#endif - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - -#if !WANT_SIMD_EXCEPT - v_f32_t absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); -#endif - - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); -#else - return specialcase (poly, n, e, absn, cmp, scale); -#endif - - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/v_exp2f_1u.c deleted file mode 100644 index de1a32d54139..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp2f_1u.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 0.878 ulp. */ - 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) -#define C5 v_f32 (Poly[5]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. 
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); -} - -VPCS_ATTR -v_f32_t -V_NAME(exp2f_1u) (v_f32_t x) -{ - v_f32_t n, r, scale, poly, absn; - v_u32_t cmp, e; - - /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, C5); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} -#endif diff --git a/contrib/arm-optimized-routines/math/v_expf.c b/contrib/arm-optimized-routines/math/v_expf.c deleted file mode 100644 index ade23b2416aa..000000000000 --- a/contrib/arm-optimized-routines/math/v_expf.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -#if WANT_SIMD_EXCEPT - -#define TinyBound 0x20000000 /* asuint (0x1p-63). */ -#define BigBound 0x42800000 /* asuint (0x1p6). */ - -VPCS_ATTR -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f32 (expf, x, y, cmp); -} - -#else - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -#endif - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, z; - v_u32_t cmp, e; - -#if WANT_SIMD_EXCEPT - cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound - >= BigBound - TinyBound); - v_f32_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - specialcase to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = v_sel_f32 (cmp, v_f32 (1), x); -#endif - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - -#if !WANT_SIMD_EXCEPT - v_f32_t absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); -#endif - - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); -#else - return specialcase (poly, n, e, absn, cmp, scale); -#endif - - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_expf_1u.c b/contrib/arm-optimized-routines/math/v_expf_1u.c deleted file mode 100644 index 8f0ae91c582a..000000000000 --- a/contrib/arm-optimized-routines/math/v_expf_1u.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 0.36565 +0.5 ulp. */ - 0x1.6a6000p-10f, - 0x1.12718ep-7f, - 0x1.555af0p-5f, - 0x1.555430p-3f, - 0x1.fffff4p-2f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf_1u) (v_f32_t x) -{ - v_f32_t n, r, scale, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} -#endif diff --git a/contrib/arm-optimized-routines/math/v_log.c b/contrib/arm-optimized-routines/math/v_log.c deleted file mode 100644 index 47a829119b3c..000000000000 --- a/contrib/arm-optimized-routines/math/v_log.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#include "v_log.h" -#if V_SUPPORTED - -/* Worst-case error: 1.17 + 0.5 ulp. */ - -static const f64_t Poly[] = { - /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - -0x1.ffffffffffff7p-2, - 0x1.55555555170d4p-2, - -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, - -0x1.554e550bd501ep-3, -}; - -#define A0 v_f64 (Poly[0]) -#define A1 v_f64 (Poly[1]) -#define A2 v_f64 (Poly[2]) -#define A3 v_f64 (Poly[3]) -#define A4 v_f64 (Poly[4]) -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define N (1 << V_LOG_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) - -struct entry -{ - v_f64_t invc; - v_f64_t logc; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = __v_log_data[i].invc; - e.logc = __v_log_data[i].logc; -#else - e.invc[0] = __v_log_data[i[0]].invc; - e.logc[0] = __v_log_data[i[0]].logc; - e.invc[1] = __v_log_data[i[1]].invc; - e.logc[1] = __v_log_data[i[1]].logc; -#endif - return e; -} - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (log, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(log) (v_f64_t x) -{ - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = v_fma_f64 (kd, Ln2, e.logc + r); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - r2 = r * r; - y = v_fma_f64 (A3, r, A2); - p = v_fma_f64 (A1, r, A0); - y = v_fma_f64 (A4, r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_log.h b/contrib/arm-optimized-routines/math/v_log.h deleted file mode 100644 index a37bbc2bd6b6..000000000000 --- a/contrib/arm-optimized-routines/math/v_log.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Declarations for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_LOG_TABLE_BITS 7 - -extern const struct v_log_data -{ - f64_t invc; - f64_t logc; -} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; -#endif diff --git a/contrib/arm-optimized-routines/math/v_log_data.c b/contrib/arm-optimized-routines/math/v_log_data.c deleted file mode 100644 index ec1c8e5e16b2..000000000000 --- a/contrib/arm-optimized-routines/math/v_log_data.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_log.h" -#if WANT_VMATH - -#define N (1 << V_LOG_TABLE_BITS) - -/* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: - - tab[i].invc = 1/c - tab[i].logc = (double)log(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ -const struct v_log_data __v_log_data[N] = { -{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, -{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, -{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, -{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, -{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, -{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, -{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, -{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, -{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, -{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, -{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, -{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, -{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, -{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, -{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, -{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, -{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, -{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, -{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, -{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, -{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, -{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, -{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, -{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, -{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, -{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, -{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, -{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, -{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, -{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, -{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, -{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, -{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, -{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, -{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, -{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, -{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, -{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, -{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, -{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, -{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, -{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, -{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, -{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, -{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, -{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, -{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, -{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, -{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, -{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, -{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, -{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, -{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, -{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, -{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, -{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, -{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, -{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, -{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, -{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, -{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, -{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, -{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, -{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, -{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, -{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, -{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, -{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, -{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, -{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, -{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, -{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, -{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, 
-{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, -{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, -{1.0, 0.0}, -{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, -{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, -{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, -{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, -{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, -{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, -{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, -{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, -{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, -{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, -{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, -{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, -{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, -{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, -{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, -{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, -{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, -{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, -{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, -{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, -{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, -{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, -{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, -{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, -{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, -{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, -{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, -{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, -{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, -{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, -{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, -{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, -{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, -{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, -{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, -{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, -{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, -{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, -{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, -{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, -{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, -{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, -{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, -{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, -{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, -{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, -{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, -{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, -{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, -{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, -{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, -{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, -}; -#endif diff --git a/contrib/arm-optimized-routines/math/v_logf.c b/contrib/arm-optimized-routines/math/v_logf.c deleted file mode 100644 index 93a53758bff7..000000000000 --- a/contrib/arm-optimized-routines/math/v_logf.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 3.34 ulp error */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; -#define P7 v_f32 (Poly[0]) -#define P6 v_f32 (Poly[1]) -#define P5 v_f32 (Poly[2]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[4]) -#define P2 v_f32 (Poly[5]) -#define P1 v_f32 (Poly[6]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(logf) (v_f32_t x) -{ - v_f32_t n, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); - - /* y = log(1+r) + n*ln2. */ - r2 = r * r; - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); - p = v_fma_f32 (P7, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_math.h b/contrib/arm-optimized-routines/math/v_math.h deleted file mode 100644 index 3289916187d2..000000000000 --- a/contrib/arm-optimized-routines/math/v_math.h +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif -#if WANT_VMATH - -/* The goal of this header is to allow vector and scalar - build of the same algorithm, the provided intrinsic - wrappers are also vector length agnostic so they can - be implemented for SVE too (or other simd architectures) - and then the code should work on those targets too. */ - -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif - -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif - -#include -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -/* reinterpret as type1 from type2. 
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} - -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 1; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return x; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return x; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return x; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - *x = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - *x = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - *x = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return p ? x : y; -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return p ? x : y; -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return vabsq_f32 (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return vbslq_f32 (p, x, y); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return ( - v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return (v_f64_t){x, x}; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return (v_u64_t){x, x}; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return (v_s64_t){x, x}; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x[i]; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - (*x)[i] = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (x) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return vabsq_f64 (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return vfmaq_f64 (z, x, y); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return vrndaq_f64 (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return vcvtaq_s64_f64 (x); -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return vbslq_f64 (p, x, y); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return (v_f64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return (v_u64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? 
f (x[1]) : y[1]}; -} -#endif - -#endif -#endif diff --git a/contrib/arm-optimized-routines/math/v_pow.c b/contrib/arm-optimized-routines/math/v_pow.c deleted file mode 100644 index 05a83aaa8c0a..000000000000 --- a/contrib/arm-optimized-routines/math/v_pow.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Double-precision vector pow function. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -VPCS_ATTR -v_f64_t -V_NAME(pow) (v_f64_t x, v_f64_t y) -{ - v_f64_t z; - for (int lane = 0; lane < v_lanes64 (); lane++) - { - f64_t sx = v_get_f64 (x, lane); - f64_t sy = v_get_f64 (y, lane); - f64_t sz = pow (sx, sy); - v_set_f64 (&z, lane, sz); - } - return z; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_powf.c b/contrib/arm-optimized-routines/math/v_powf.c deleted file mode 100644 index ad8ab8d4f00d..000000000000 --- a/contrib/arm-optimized-routines/math/v_powf.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define SBITS 5 -#define Tlog v__powf_log2_data.tab -#define Texp v__exp2f_data.tab -#define A v__powf_log2_data.poly -#define C v__exp2f_data.poly -#define LOGDEG 4 - -#if LOGDEG == 5 -/* 1.01 ulp */ -#define OFF v_u32 (0x3f330000) -#define TBITS 4 -#elif LOGDEG == 4 -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ -#define OFF v_u32 (0x3f35d000) -#define TBITS 5 -#endif - -#define V_EXP2F_TABLE_BITS SBITS -#define V_EXP2F_POLY_ORDER 3 -struct v_exp2f_data -{ - uint64_t tab[1 << V_EXP2F_TABLE_BITS]; - double poly[V_EXP2F_POLY_ORDER]; -}; - -#define V_POWF_LOG2_TABLE_BITS TBITS -#define V_POWF_LOG2_POLY_ORDER LOGDEG -#define SCALE ((double) (1 << SBITS)) -struct v_powf_log2_data -{ - struct - { - double invc, logc; - } tab[1 << V_POWF_LOG2_TABLE_BITS]; - double poly[V_POWF_LOG2_POLY_ORDER]; -}; - -static const struct v_powf_log2_data v__powf_log2_data = { -#if LOGDEG == 5 - .tab = { -{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, -{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, -{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, -{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, -{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, -{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, -{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, -{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, -{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, -{ 0x1p+0, 0x0p+0 * SCALE }, -{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, -{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, -{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, -{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, -{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, -{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, - }, -/* rel err: 1.46 * 2^-32 */ - .poly = { -0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, -0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, -0x1.71547652ab82bp0 * SCALE, - } -#elif LOGDEG == 4 - .tab = { -{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, -{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, -{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, 
-{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, -{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, -{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, -{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, -{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, -{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, -{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, -{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, -{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, -{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, -{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, -{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, -{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, -{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, -{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, -{0x1p+0, 0x0p+0 * SCALE}, -{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, -{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, -{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, -{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, -{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, -{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE}, -{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, -{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, -{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, -{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, -{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, -{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, -{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, - }, -/* rel err: 1.5 * 2^-30 */ - .poly = { - -0x1.6ff5daa3b3d7cp-2 * SCALE, - 0x1.ec81d03c01aebp-2 * SCALE, - -0x1.71547bb43f101p-1 * SCALE, - 0x1.7154764a815cbp0 * SCALE, - } -#endif -}; - -static const struct v_exp2f_data v__exp2f_data = { - .tab = { -0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, -0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, -0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, -0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, -0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, -0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, -0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, -0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, - }, -/* rel err: 1.69 * 2^-34 */ - .poly = { -0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE - }, -}; - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(powf) (v_f32_t x, v_f32_t y) -{ - v_u32_t u, tmp, cmp, i, top, iz; - v_s32_t k; - v_f32_t ret; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - tmp = u - OFF; - i = (tmp >> (23 - TBITS)) % (1 << TBITS); - top = tmp & 0xff800000; - iz = u - top; - k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ - - for (int lane = 0; lane < v_lanes32 (); lane++) - { - uint32_t si, siz; - int32_t sk; - float sy; - - /* Use double precision for each lane. 
*/ - double invc, logc, z, r, p, y0, logx, ylogx, kd, s; - uint64_t ki, t; - - si = v_get_u32 (i, lane); - siz = v_get_u32 (iz, lane); - sk = v_get_s32 (k, lane); - sy = v_get_f32 (y, lane); - - invc = Tlog[si].invc; - logc = Tlog[si].logc; - z = (double) as_f32_u32 (siz); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ - r = __builtin_fma (z, invc, -1.0); - y0 = logc + (double) sk; - - /* Polynomial to approximate log1p(r)/ln2. */ -#if LOGDEG == 5 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + A[4]; - logx = r * logx + y0; -#elif LOGDEG == 4 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; -#endif - ylogx = sy * logx; - v_set_u32 (&cmp, lane, - (as_u64_f64 (ylogx) >> 47 & 0xffff) - >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 - ? 1 - : v_get_u32 (cmp, lane)); - - /* N*x = k + r with r in [-1/2, 1/2] */ -#if TOINT_INTRINSICS - kd = roundtoint (ylogx); /* k */ - ki = converttoint (ylogx); -#else -# define SHIFT 0x1.8p52 - kd = eval_as_double (ylogx + SHIFT); - ki = asuint64 (kd); - kd -= SHIFT; -#endif - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ - t = Texp[ki % (1 << SBITS)]; - t += ki << (52 - SBITS); - s = as_f64_u64 (t); - p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - v_set_f32 (&ret, lane, p); - } - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, ret, cmp); - return ret; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_sin.c b/contrib/arm-optimized-routines/math/v_sin.c deleted file mode 100644 index 9dbb9dec04de..000000000000 --- a/contrib/arm-optimized-routines/math/v_sin.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Double-precision vector sin function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define AbsMask v_u64 (0x7fffffffffffffff) - -#if WANT_SIMD_EXCEPT -#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */ -#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */ -#else -#define RangeVal v_f64 (0x1p23) -#endif - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (sin, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(sin) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t sign, odd, cmp, ir; - - ir = v_as_u64_f64 (x) & AbsMask; - r = v_as_f64_u64 (ir); - sign = v_as_u64_f64 (x) & ~AbsMask; - -#if WANT_SIMD_EXCEPT - /* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be - triggered correctly, set any special lanes to 1 (which is neutral w.r.t. - fenv). 
These lanes will be fixed by specialcase later. */ - cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh); - if (unlikely (v_any_u64 (cmp))) - r = v_sel_f64 (cmp, v_f64 (1), r); -#else - cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal)); -#endif - - /* n = rint(|x|/pi). */ - n = v_fma_f64 (InvPi, r, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_sinf.c b/contrib/arm-optimized-routines/math/v_sinf.c deleted file mode 100644 index ce35dacc65cf..000000000000 --- a/contrib/arm-optimized-routines/math/v_sinf.c +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Single-precision vector sin function. - * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define TinyBound v_f32 (0x1p-61f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (sinf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(sinf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t sign, odd, cmp, ir; - - ir = v_as_u32_f32 (x) & AbsMask; - r = v_as_f32_u32 (ir); - sign = v_as_u32_f32 (x) & ~AbsMask; - -#if WANT_SIMD_EXCEPT - cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound) - >= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound))); - if (unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - specialcase later. */ - r = v_sel_f32 (cmp, v_f32 (1), r); -#else - cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal)); -#endif - - /* n = rint(|x|/pi) */ - n = v_fma_f32 (InvPi, r, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. 
pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/vn_cos.c b/contrib/arm-optimized-routines/math/vn_cos.c deleted file mode 100644 index 4b5b23718a8b..000000000000 --- a/contrib/arm-optimized-routines/math/vn_cos.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cos. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) -#include "v_cos.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_cosf.c b/contrib/arm-optimized-routines/math/vn_cosf.c deleted file mode 100644 index 86dd26ecb3e7..000000000000 --- a/contrib/arm-optimized-routines/math/vn_cosf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cosf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) -#include "v_cosf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_exp.c b/contrib/arm-optimized-routines/math/vn_exp.c deleted file mode 100644 index 0d85b17de05a..000000000000 --- a/contrib/arm-optimized-routines/math/vn_exp.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) -#include "v_exp.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_exp2f.c b/contrib/arm-optimized-routines/math/vn_exp2f.c deleted file mode 100644 index da3bb40ae93f..000000000000 --- a/contrib/arm-optimized-routines/math/vn_exp2f.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) -#include "v_exp2f.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c b/contrib/arm-optimized-routines/math/vn_exp2f_1u.c deleted file mode 100644 index 3e3a24705614..000000000000 --- a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_exp2f_1u.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_expf.c b/contrib/arm-optimized-routines/math/vn_expf.c deleted file mode 100644 index 6e91a940bbf4..000000000000 --- a/contrib/arm-optimized-routines/math/vn_expf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) -#include "v_expf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_expf_1u.c b/contrib/arm-optimized-routines/math/vn_expf_1u.c deleted file mode 100644 index 57ae6a315b9b..000000000000 --- a/contrib/arm-optimized-routines/math/vn_expf_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_expf_1u.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_log.c b/contrib/arm-optimized-routines/math/vn_log.c deleted file mode 100644 index 902bff1fcd4e..000000000000 --- a/contrib/arm-optimized-routines/math/vn_log.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) -#include "v_log.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_logf.c b/contrib/arm-optimized-routines/math/vn_logf.c deleted file mode 100644 index 07e493685b4d..000000000000 --- a/contrib/arm-optimized-routines/math/vn_logf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_logf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) -#include "v_logf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_pow.c b/contrib/arm-optimized-routines/math/vn_pow.c deleted file mode 100644 index 1a980ff6bf2f..000000000000 --- a/contrib/arm-optimized-routines/math/vn_pow.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_pow. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) -#include "v_pow.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_powf.c b/contrib/arm-optimized-routines/math/vn_powf.c deleted file mode 100644 index a42ade371adc..000000000000 --- a/contrib/arm-optimized-routines/math/vn_powf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_powf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) -#include "v_powf.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_sin.c b/contrib/arm-optimized-routines/math/vn_sin.c deleted file mode 100644 index 64b05c8ca0eb..000000000000 --- a/contrib/arm-optimized-routines/math/vn_sin.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sin. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) -#include "v_sin.c" -#endif diff --git a/contrib/arm-optimized-routines/math/vn_sinf.c b/contrib/arm-optimized-routines/math/vn_sinf.c deleted file mode 100644 index 6e880c60dc39..000000000000 --- a/contrib/arm-optimized-routines/math/vn_sinf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) -#include "v_sinf.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/Dir.mk b/contrib/arm-optimized-routines/pl/math/Dir.mk index be65344572a8..94b26cf3309c 100644 --- a/contrib/arm-optimized-routines/pl/math/Dir.mk +++ b/contrib/arm-optimized-routines/pl/math/Dir.mk @@ -1,229 +1,216 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2023, Arm Limited. +# Copyright (c) 2019-2024, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception PLM := $(srcdir)/pl/math AOR := $(srcdir)/math B := build/pl/math -math-lib-srcs := $(wildcard $(PLM)/*.[cS]) +pl-lib-srcs := $(wildcard $(PLM)/*.[cS]) + +ifeq ($(WANT_SVE_MATH), 0) +pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs)) +endif + math-test-srcs := \ $(AOR)/test/mathtest.c \ $(AOR)/test/mathbench.c \ $(AOR)/test/ulp.c \ math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS]) -math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) -math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) +pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) +pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) -math-libs := \ +pl-libs := \ build/pl/lib/libmathlib.so \ build/pl/lib/libmathlib.a \ math-tools := \ build/pl/bin/mathtest \ build/pl/bin/mathbench \ build/pl/bin/mathbench_libc \ build/pl/bin/runulp.sh \ build/pl/bin/ulp \ math-host-tools := \ build/pl/bin/rtest \ -math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs))) +pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs))) math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs))) math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) -math-target-objs := $(math-lib-objs) $(math-test-objs) -math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) +pl-target-objs := $(pl-lib-objs) $(math-test-objs) +pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs) pl/math-files := \ - $(math-objs) \ - $(math-libs) \ + $(pl-objs) \ + $(pl-libs) \ $(math-tools) \ $(math-host-tools) \ - $(math-includes) \ - $(math-test-includes) \ + $(pl-includes) \ + $(pl-test-includes) \ -all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) +all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) $(pl-test-includes) -$(math-objs): $(math-includes) $(math-test-includes) -$(math-objs): CFLAGS_PL += $(math-cflags) +$(pl-objs): $(pl-includes) $(pl-test-includes) +$(pl-objs): CFLAGS_PL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) $(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) -build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs) +$(B)/sv_%: 
CFLAGS_PL += $(math-sve-cflags) + +build/pl/include/test/ulp_funcs_gen.h: $(pl-lib-srcs) # Replace PL_SIG cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@ -build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs) +build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs) # Replace PL_SIG macros with mathbench func entries cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@ -build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs) +build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs) # Replace PL_SIG macros with ULP wrapper declarations cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@ $(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h $(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test $(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h $(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test -build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) +build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os) $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^ -build/pl/lib/libmathlib.a: $(math-lib-objs) +build/pl/lib/libmathlib.a: $(pl-lib-objs) rm -f $@ $(AR) rc $@ $^ $(RANLIB) $@ $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc $(math-tools): LDLIBS += $(math-ldlibs) -lm +# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled +$(math-tools): CFLAGS_PL += $(math-sve-cflags) # Some targets to build pl/math/test from math/test sources build/pl/math/test/%.o: $(srcdir)/math/test/%.S $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/math/test/%.o: $(srcdir)/math/test/%.c $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/math/test/%.os: $(srcdir)/math/test/%.S $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/math/test/%.os: $(srcdir)/math/test/%.c $(CC) $(CFLAGS_PL) -c -o $@ $< # Some targets to build pl/ sources using appropriate flags build/pl/%.o: $(srcdir)/pl/%.S $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/%.o: $(srcdir)/pl/%.c $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/%.os: $(srcdir)/pl/%.S $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/%.os: $(srcdir)/pl/%.c $(CC) $(CFLAGS_PL) -c -o $@ $< build/pl/bin/rtest: $(math-host-objs) $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) # This is not ideal, but allows custom symbols in mathbench to get resolved. 
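# (Presumably this works because a static link resolves libraries left to
# right: -lc satisfies the standard math names first, i.e. the libc versions
# being benchmarked, while the libmathlib.a placed after it still resolves
# the remaining mathlib-internal symbols that mathbench references.)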
build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/pl/include/%.h: $(PLM)/include/%.h cp $< $@ build/pl/include/test/%.h: $(PLM)/test/%.h cp $< $@ build/pl/bin/%.sh: $(PLM)/test/%.sh cp $< $@ pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) check-pl/math-test: $(math-tools) cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) check-pl/math-rtest: $(math-host-tools) $(math-tools) cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) ulp-input-dir=$(B)/test/inputs -math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) -math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) -math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs))) -math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs))) +math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs))) +math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs))) +math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs))) -ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs) +ulp-inputs = $(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs) $(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) $(ulp-input-dir)/%.ulp: $(PLM)/%.c mkdir -p $(@D) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@ -$(ulp-input-dir)/%.alias: $(PLM)/%.c - mkdir -p $(@D) - $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@ - $(ulp-input-dir)/%.fenv: $(PLM)/%.c mkdir -p $(@D) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ $(ulp-input-dir)/%.itv: $(PLM)/%.c mkdir -p $(dir $@) $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@ ulp-lims := $(ulp-input-dir)/limits $(ulp-lims): $(math-lib-lims) cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ -ulp-aliases := $(ulp-input-dir)/aliases -$(ulp-aliases): $(math-lib-aliases) - cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@ - fenv-exps := $(ulp-input-dir)/fenv $(fenv-exps): $(math-lib-fenvs) cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ -ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias -$(ulp-itvs-noalias): $(math-lib-itvs) - cat $^ > $@ - -rename-aliases := $(ulp-input-dir)/rename_alias.sed -$(rename-aliases): $(ulp-aliases) - # Build sed script for replacing aliases from generated alias file - cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@ - -ulp-itvs-alias := $(ulp-input-dir)/itvs_alias -$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases) - cat $< | sed -f $(rename-aliases) > $@ - ulp-itvs := $(ulp-input-dir)/intervals -$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias) +$(ulp-itvs): $(math-lib-itvs) cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ -check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs) +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs) 
	WANT_SVE_MATH=$(WANT_SVE_MATH) \
	ULPFLAGS="$(math-ulpflags)" \
	LIMITS=../../../$(ulp-lims) \
-	ALIASES=../../../$(ulp-aliases) \
	INTERVALS=../../../$(ulp-itvs) \
	FENV=../../../$(fenv-exps) \
+	FUNC=$(func) \
	build/pl/bin/runulp.sh $(EMULATOR)

check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp

$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so
	$(INSTALL) -D $< $@

$(DESTDIR)$(libdir)/pl/%: build/pl/lib/%
	$(INSTALL) -m 644 -D $< $@

$(DESTDIR)$(includedir)/pl/%: build/pl/include/%
	$(INSTALL) -m 644 -D $< $@

install-pl/math: \
-	$(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
-	$(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
+	$(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
+	$(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)

clean-pl/math:
	rm -f $(pl/math-files)

.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math
diff --git a/contrib/arm-optimized-routines/pl/math/acos_2u.c b/contrib/arm-optimized-routines/pl/math/acos_2u.c
new file mode 100644
index 000000000000..9ec6894f1d81
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/acos_2u.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "poly_scalar_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffffffffffff)
+#define Half (0x3fe0000000000000)
+#define One (0x3ff0000000000000)
+#define PiOver2 (0x1.921fb54442d18p+0)
+#define Pi (0x1.921fb54442d18p+1)
+#define Small (0x3c90000000000000) /* 2^-53.  */
+#define Small16 (0x3c90)
+#define QNaN (0x7ff8)
+
+/* Fast implementation of double-precision acos(x) based on polynomial
+   approximation of double-precision asin(x).
+
+   For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct
+   rounding.
+
+   For |x| in [Small, 0.5], use the trigonometric identity
+
+     acos(x) = pi/2 - asin(x)
+
+   and use an order 11 polynomial P such that the final approximation of asin
+   is an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
+
+   The largest observed error in this region is 1.18 ulps,
+     acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
+			       want 0x1.0d54d1985c069p+0.
+
+   For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1
+
+     acos(x) ~ 2 * sqrt(z) (1 + z * P(z))
+
+   where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to
+   the approximation of asin near 0.
+
+   The largest observed error in this region is 1.52 ulps,
+     acos(0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
+			       want 0x1.edbbedf8a7d6cp-1.
+
+   For x in [-1.0, -0.5], use this other identity to deduce the negative
+   inputs from their absolute value: acos(x) = pi - acos(-x).  */
+double
+acos (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & AbsMask;
+  uint64_t ia16 = ia >> 48;
+  double ax = asdouble (ia);
+  uint64_t sign = ix & ~AbsMask;
+
+  /* Special values and invalid range.  */
+  if (unlikely (ia16 == QNaN))
+    return x;
+  if (ia > One)
+    return __math_invalid (x);
+  if (ia16 < Small16)
+    return PiOver2 - x;
+
+  /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+     z2 = x ^ 2         and z = |x|     , if |x| < 0.5
+     z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.  */
+  double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
+  double z = ax < 0.5 ? ax : sqrt (z2);
+
+  /* Use a single polynomial approximation P for both intervals.  */
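+  /* (Assuming estrin_11_f64 from poly_scalar_f64.h implements the standard
+     Estrin scheme, the evaluation below amounts to
+       P(z2) = (C0 + C1 z2) + z4 (C2 + C3 z2)
+	      + z8 ((C4 + C5 z2) + z4 (C6 + C7 z2))
+	      + z16 ((C8 + C9 z2) + z4 (C10 + C11 z2)),
+     which is why z4, z8 and z16 are precomputed first: the sub-terms are
+     independent of one another and can evaluate in parallel.)  */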
+  double z4 = z2 * z2;
+  double z8 = z4 * z4;
+  double z16 = z8 * z8;
+  double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
+
+  /* Finalize polynomial: z + z * z2 * P(z2).  */
+  p = fma (z * z2, p, z);
+
+  /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+	       = pi - 2 Q(|x|)	       , for -1.0 <= x <= -0.5
+	       = 2 Q(|x|)	       , for 0.5 <= x <= 1.0.  */
+  if (ax < 0.5)
+    return PiOver2 - asdouble (asuint64 (p) | sign);
+
+  return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
+}
+
+PL_SIG (S, D, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (acos, 1.02)
+PL_TEST_INTERVAL (acos, 0, Small, 5000)
+PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
+PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (acos, -0, -inf, 20000)
diff --git a/contrib/arm-optimized-routines/pl/math/acosf_1u4.c b/contrib/arm-optimized-routines/pl/math/acosf_1u4.c
new file mode 100644
index 000000000000..6dde422ef85a
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/acosf_1u4.c
@@ -0,0 +1,99 @@
+/*
+ * Single-precision acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffff)
+#define Half (0x3f000000)
+#define One (0x3f800000)
+#define PiOver2f (0x1.921fb6p+0f)
+#define Pif (0x1.921fb6p+1f)
+#define Small (0x32800000) /* 2^-26.  */
+#define Small12 (0x328)
+#define QNaN (0x7fc)
+
+/* Fast implementation of single-precision acos(x) based on polynomial
+   approximation of single-precision asin(x).
+
+   For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+   rounding.
+
+   For |x| in [Small, 0.5], use the trigonometric identity
+
+     acos(x) = pi/2 - asin(x)
+
+   and use an order 4 polynomial P such that the final approximation of asin
+   is an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
+
+   The largest observed error in this region is 1.16 ulps,
+     acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 want 0x1.0c27f6p+0.
+
+   For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1
+
+     acos(x) ~ 2 * sqrt(z) (1 + z * P(z))
+
+   where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to
+   the approximation of asin near 0.
+
+   The largest observed error in this region is 1.32 ulps,
+     acosf(0x1.15ba56p-1) got 0x1.feb33p-1 want 0x1.feb32ep-1.
+
+   For x in [-1.0, -0.5], use this other identity to deduce the negative
+   inputs from their absolute value.
+
+     acos(x) = pi - acos(-x)
+
+   The largest observed error in this region is 1.28 ulps,
+     acosf(-0x1.002072p-1) got 0x1.0c1e84p+1 want 0x1.0c1e82p+1.  */
+float
+acosf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 20;
+  float ax = asfloat (ia);
+  uint32_t sign = ix & ~AbsMask;
+
+  /* Special values and invalid range.  */
+  if (unlikely (ia12 == QNaN))
+    return x;
+  if (ia > One)
+    return __math_invalidf (x);
+  if (ia12 < Small12)
+    return PiOver2f - x;
+
+  /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+     z2 = x ^ 2         and z = |x|     , if |x| < 0.5
+     z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.  */
+  float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
+  float z = ax < 0.5 ? ax : sqrtf (z2);
+
+  /* Use a single polynomial approximation P for both intervals.  */
+  float p = horner_4_f32 (z2, __asinf_poly);
+  /* Finalize polynomial: z + z * z2 * P(z2).  */
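+  /* (The fused multiply-add below evaluates z + (z * z2) * P(z2), i.e. the
+     odd approximation z + z^3 * P(z^2); the only rounding beyond the fused
+     operation itself is the z * z2 product.)  */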
+  p = fmaf (z * z2, p, z);
+
+  /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+	       = pi - 2 Q(|x|)	       , for -1.0 <= x <= -0.5
+	       = 2 Q(|x|)	       , for 0.5 <= x <= 1.0.  */
+  if (ax < 0.5)
+    return PiOver2f - asfloat (asuint (p) | sign);
+
+  return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
+}
+
+PL_SIG (S, F, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (acosf, 0.82)
+PL_TEST_INTERVAL (acosf, 0, Small, 5000)
+PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
+PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
diff --git a/contrib/arm-optimized-routines/pl/math/asin_3u.c b/contrib/arm-optimized-routines/pl/math/asin_3u.c
new file mode 100644
index 000000000000..0b50995449ce
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/asin_3u.c
@@ -0,0 +1,106 @@
+/*
+ * Double-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f64.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffffffffffff)
+#define Half (0x3fe0000000000000)
+#define One (0x3ff0000000000000)
+#define PiOver2 (0x1.921fb54442d18p+0)
+#define Small (0x3e50000000000000) /* 2^-26.  */
+#define Small16 (0x3e50)
+#define QNaN (0x7ff8)
+
+/* Fast implementation of double-precision asin(x) based on polynomial
+   approximation.
+
+   For x < Small, approximate asin(x) by x. Small = 2^-26 for correct
+   rounding.
+
+   For x in [Small, 0.5], use an order 11 polynomial P such that the final
+   approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+   The largest observed error in this region is 1.01 ulps,
+     asin(0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
+			       want 0x1.ed78525a927eep-2.
+
+   No cheap approximation can be obtained near x = 1, since the function is
+   not continuously differentiable at 1.
+
+   For x in [0.5, 1.0], we use a method based on a trigonometric identity
+
+     asin(x) = pi/2 - acos(x)
+
+   and a generalized power series expansion of acos(y) near y=1, that reads
+   as
+
+     acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ...	(1)
+
+   The Taylor series of asin(z) near z = 0 reads as
+
+     asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
+
+   Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
+
+     acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2))
+
+   Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
+
+     asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
+
+   The largest observed error in this region is 2.69 ulps,
+     asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
+			       want 0x1.110d7e85fdd53p-1.  */
+double
+asin (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & AbsMask;
+  uint64_t ia16 = ia >> 48;
+  double ax = asdouble (ia);
+  uint64_t sign = ix & ~AbsMask;
+
+  /* Special values and invalid range.  */
+  if (unlikely (ia16 == QNaN))
+    return x;
+  if (ia > One)
+    return __math_invalid (x);
+  if (ia16 < Small16)
+    return x;
+
+  /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+     z2 = x ^ 2         and z = |x|     , if |x| < 0.5
+     z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.  */
+  double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
+  double z = ax < 0.5 ? ax : sqrt (z2);
+
+  /* Use a single polynomial approximation P for both intervals.  */
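+  /* (Estrin is presumably preferred to Horner here for instruction-level
+     parallelism: the sub-polynomials built from z2, z4, z8 and z16 carry no
+     serial dependency on one another, which shortens the critical path at
+     the cost of the extra squarings below.)  */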
+  double z4 = z2 * z2;
+  double z8 = z4 * z4;
+  double z16 = z8 * z8;
+  double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
+
+  /* Finalize polynomial: z + z * z2 * P(z2).  */
+  p = fma (z * z2, p, z);
+
+  /* asin(|x|) = Q(|x|)	       , for |x| < 0.5
+	       = pi/2 - 2 Q(|x|), for |x| >= 0.5.  */
+  double y = ax < 0.5 ? p : fma (-2.0, p, PiOver2);
+
+  /* Copy sign.  */
+  return asdouble (asuint64 (y) | sign);
+}
+
+PL_SIG (S, D, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (asin, 2.19)
+PL_TEST_INTERVAL (asin, 0, Small, 5000)
+PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
+PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (asin, -0, -inf, 20000)
diff --git a/contrib/arm-optimized-routines/pl/math/asin_data.c b/contrib/arm-optimized-routines/pl/math/asin_data.c
new file mode 100644
index 000000000000..b5517731c7f4
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/asin_data.c
@@ -0,0 +1,19 @@
+/*
+ * Coefficients for double-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asin(x) directly in [0x1p-106, 0.25]. See tools/asin.sollya
+   for how these coefficients were generated.  */
+const double __asin_poly[] = {
+  /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+     on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
+  0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
+  0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+  0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
+  0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6,
+};
diff --git a/contrib/arm-optimized-routines/pl/math/asinf_2u5.c b/contrib/arm-optimized-routines/pl/math/asinf_2u5.c
new file mode 100644
index 000000000000..ec608146ff66
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/asinf_2u5.c
@@ -0,0 +1,100 @@
+/*
+ * Single-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffff)
+#define Half (0x3f000000)
+#define One (0x3f800000)
+#define PiOver2f (0x1.921fb6p+0f)
+#define Small (0x39800000) /* 2^-12.  */
+#define Small12 (0x398)
+#define QNaN (0x7fc)
+
+/* Fast implementation of single-precision asin(x) based on polynomial
+   approximation.
+
+   For x < Small, approximate asin(x) by x. Small = 2^-12 for correct
+   rounding.
+
+   For x in [Small, 0.5], use an order 4 polynomial P such that the final
+   approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+   The largest observed error in this region is 0.83 ulps,
+     asinf(0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+   No cheap approximation can be obtained near x = 1, since the function is
+   not continuously differentiable at 1.
+
+   For x in [0.5, 1.0], we use a method based on a trigonometric identity
+
+     asin(x) = pi/2 - acos(x)
+
+   and a generalized power series expansion of acos(y) near y=1, that reads
+   as
+
+     acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ...	(1)
+
+   The Taylor series of asin(z) near z = 0 reads as
+
+     asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
+ + Therefore, (1) can be written in terms of P(y/2) or even asin(y/2) + + acos(1-y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2)) + + Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and + + asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)). + + The largest observed error in this region is 2.41 ulps, + asinf(0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ +float +asinf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + float ax = asfloat (ia); + uint32_t sign = ix & ~AbsMask; + + /* Special values and invalid range. */ + if (unlikely (ia12 == QNaN)) + return x; + if (ia > One) + return __math_invalidf (x); + if (ia12 < Small12) + return x; + + /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f); + float z = ax < 0.5 ? ax : sqrtf (z2); + + /* Use a single polynomial approximation P for both intervals. */ + float p = horner_4_f32 (z2, __asinf_poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fmaf (z * z2, p, z); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float y = ax < 0.5 ? p : fmaf (-2.0f, p, PiOver2f); + + /* Copy sign. */ + return asfloat (asuint (y) | sign); +} + +PL_SIG (S, F, 1, asin, -1.0, 1.0) +PL_TEST_ULP (asinf, 1.91) +PL_TEST_INTERVAL (asinf, 0, Small, 5000) +PL_TEST_INTERVAL (asinf, Small, 0.5, 50000) +PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000) +PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000) +PL_TEST_INTERVAL (asinf, -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/asinf_data.c b/contrib/arm-optimized-routines/pl/math/asinf_data.c new file mode 100644 index 000000000000..1652025e2920 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinf_data.c @@ -0,0 +1,16 @@ +/* + * Coefficients for single-precision asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Approximate asinf(x) directly in [0x1p-24, 0.25]. See tools/asinf.sollya + for how these coefficients were generated. */ +const float __asinf_poly[] = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24, 0x1p-2 ], order = 4, rel error: 0x1.00a23bbp-29. */ + 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, 0x1.3af7d8p-5, +}; diff --git a/contrib/arm-optimized-routines/pl/math/asinh_2u5.c b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c index f1679556d5f8..b7fc81a2b94f 100644 --- a/contrib/arm-optimized-routines/pl/math/asinh_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c @@ -1,86 +1,85 @@ /* * Double-precision asinh(x) function * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrin.h" +#include "poly_scalar_f64.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ #define One 0x3ff0000000000000 /* asuint64(1.0). */ #define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ #define Ln2 0x1.62e42fefa39efp-1 double optr_aor_log_f64 (double); /* Scalar double-precision asinh implementation. This routine uses different approaches on different intervals: |x| < 2^-26: Return x. Function is exact in this region.
|x| < 1: Use custom order-17 polynomial. This is least accurate close to 1. The largest observed error in this region is 1.47 ULPs: asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 want 0x1.c1d6bf874019cp-1. |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)). The largest observed error in this region is 2.03 ULPs: asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1 want -0x1.c3508eb6a682p-1. |x| >= 2^511: We cannot square x without overflow at a low cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot even double x without overflow, so calculate this as ln(x) + ln(2). The largest observed error in this region is 0.98 ULPs at many values, for instance: asinh(0x1.5255a4cf10319p+975) got 0x1.52652f4cb26cbp+9 want 0x1.52652f4cb26ccp+9. */ double asinh (double x) { uint64_t ix = asuint64 (x); uint64_t ia = ix & AbsMask; double ax = asdouble (ia); uint64_t sign = ix & ~AbsMask; if (ia < ExpM26) { return x; } if (ia < One) { double x2 = x * x; double z2 = x2 * x2; double z4 = z2 * z2; double z8 = z4 * z4; -#define C(i) __asinh_data.poly[i] - double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + double p = estrin_17_f64 (x2, z2, z4, z8, z8 * z8, __asinh_data.poly); double y = fma (p, x2 * ax, ax); return asdouble (asuint64 (y) | sign); } if (unlikely (ia >= Exp511)) { return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign); } return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1))) | sign); } PL_SIG (S, D, 1, asinh, -10.0, 10.0) PL_TEST_ULP (asinh, 1.54) PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000) PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000) PL_TEST_INTERVAL (asinh, 100.0, inf, 50000) PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c index 2b2c55db56dc..ec26b80ec2ec 100644 --- a/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c @@ -1,78 +1,76 @@ /* * Single-precision asinh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrinf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask (0x7fffffff) #define SqrtFltMax (0x1.749e96p+10f) #define Ln2 (0x1.62e4p-1f) #define One (0x3f8) #define ExpM12 (0x398) -#define C(i) __asinhf_data.coeffs[i] - float optr_aor_log_f32 (float); /* asinhf approximation using a variety of approaches on different intervals: |x| < 2^-12: Return x. Function is exactly rounded in this region. |x| < 1.0: Use custom order-8 polynomial. The largest observed error in this region is 1.3ulps: asinhf(0x1.f0f74cp-1) got 0x1.b88de4p-1 want 0x1.b88de2p-1. |x| <= SqrtFltMax: Calculate the result directly using the definition of asinh(x) = ln(x + sqrt(x*x + 1)). The largest observed error in this region is 1.99ulps. asinhf(0x1.00e358p+0) got 0x1.c4849ep-1 want 0x1.c484a2p-1. |x| > SqrtFltMax: We cannot square x without overflow at a low cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot even double x without overflow, so calculate this as ln(x) + ln(2). The largest observed error in this region is 3.39ulps. asinhf(0x1.749e9ep+10) got 0x1.fffff8p+2 want 0x1.fffffep+2.
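Two hedged worked examples for the log-based branches (editorial, rounded values): asinhf(1) follows the direct definition, ln(1 + sqrt(2)) ~ 0.88137, while for x = 0x1p100 (above SqrtFltMax) the large-x branch gives ln(x) + ln(2) = 101 * ln(2) ~ 70.007.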
*/ float asinhf (float x) { uint32_t ix = asuint (x); uint32_t ia = ix & AbsMask; uint32_t ia12 = ia >> 20; float ax = asfloat (ia); uint32_t sign = ix & ~AbsMask; if (unlikely (ia12 < ExpM12 || ia == 0x7f800000)) return x; if (unlikely (ia12 >= 0x7f8)) return __math_invalidf (x); if (ia12 < One) { float x2 = ax * ax; - float p = ESTRIN_7 (ax, x2, x2 * x2, C); + float p = estrin_7_f32 (ax, x2, x2 * x2, __asinhf_data.coeffs); float y = fmaf (x2, p, ax); return asfloat (asuint (y) | sign); } if (unlikely (ax > SqrtFltMax)) { return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign); } return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign); } PL_SIG (S, F, 1, asinh, -10.0, 10.0) PL_TEST_ULP (asinhf, 2.9) PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000) PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000) PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000) PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/atan_common.h b/contrib/arm-optimized-routines/pl/math/atan_common.h index da0da6436854..798cc22cc40a 100644 --- a/contrib/arm-optimized-routines/pl/math/atan_common.h +++ b/contrib/arm-optimized-routines/pl/math/atan_common.h @@ -1,49 +1,33 @@ /* - * Double-precision polynomial evaluation function for scalar and vector atan(x) - * and atan2(y,x). + * Double-precision polynomial evaluation function for scalar + * atan(x) and atan2(y,x). * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "estrin.h" - -#if V_SUPPORTED - -#include "v_math.h" - -#define DBL_T v_f64_t -#define P(i) v_f64 (__atan_poly_data.poly[i]) - -#else - -#define DBL_T double -#define P(i) __atan_poly_data.poly[i] - -#endif +#include "poly_scalar_f64.h" /* Polynomial used in fast atan(x) and atan2(y,x) implementations The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline DBL_T -eval_poly (DBL_T z, DBL_T az, DBL_T shift) +static inline double +eval_poly (double z, double az, double shift) { /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of full scheme to avoid underflow in x^16. */ - DBL_T z2 = z * z; - DBL_T x2 = z2 * z2; - DBL_T x4 = x2 * x2; - DBL_T x8 = x4 * x4; - DBL_T y - = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P)); + double z2 = z * z; + double x2 = z2 * z2; + double x4 = x2 * x2; + double x8 = x4 * x4; + double y = fma (estrin_11_f64 (z2, x2, x4, x8, __atan_poly_data.poly + 8), + x8, estrin_7_f64 (z2, x2, x4, __atan_poly_data.poly)); /* Finalize. y = shift + z + z^3 * P(z^2). */ - y = FMA (y, z2 * az, az); + y = fma (y, z2 * az, az); y = y + shift; return y; } -#undef DBL_T -#undef FMA #undef P diff --git a/contrib/arm-optimized-routines/pl/math/atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c index 9d17f252b8b9..ba6f68089de1 100644 --- a/contrib/arm-optimized-routines/pl/math/atanf_2u9.c +++ b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c @@ -1,76 +1,72 @@ /* * Single-precision atan(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "atanf_common.h" #include "pl_sig.h" #include "pl_test.h" #define PiOver2 0x1.921fb6p+0f #define AbsMask 0x7fffffff #define TinyBound 0x30800000 /* asuint(0x1p-30). */ #define BigBound 0x4e800000 /* asuint(0x1p30). 
*/ #define One 0x3f800000 /* Approximation of single-precision atan(x) based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Maximum error is 2.88 ulps: atanf(0x1.0565ccp+0) got 0x1.97771p-1 want 0x1.97770ap-1. */ float atanf (float x) { uint32_t ix = asuint (x); uint32_t sign = ix & ~AbsMask; uint32_t ia = ix & AbsMask; if (unlikely (ia < TinyBound)) /* Avoid underflow by returning x. */ return x; if (unlikely (ia > BigBound)) { if (ia > 0x7f800000) /* Propagate NaN. */ return __math_invalidf (x); /* atan(x) rounds to PiOver2 for large x. */ return asfloat (asuint (PiOver2) ^ sign); } float z, az, shift; if (ia > One) { /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ z = -1.0f / x; shift = PiOver2; /* Use absolute value only when needed (odd powers of z). */ az = -fabsf (z); } else { /* For x < 1, approximate atan(x) directly. */ z = x; az = asfloat (ia); shift = 0; } /* Calculate polynomial, shift + z + z^3 * P(z^2). */ float y = eval_poly (z, az, shift); /* Copy sign. */ return asfloat (asuint (y) ^ sign); } PL_SIG (S, F, 1, atan, -10.0, 10.0) PL_TEST_ULP (atanf, 2.38) -PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000) -PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000) -PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000) -PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000) -PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000) -PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000) -PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000) -PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000) +PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000) +PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000) +PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000) +PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/atanf_common.h b/contrib/arm-optimized-routines/pl/math/atanf_common.h index 37ca76dee2f7..8952e7e0078b 100644 --- a/contrib/arm-optimized-routines/pl/math/atanf_common.h +++ b/contrib/arm-optimized-routines/pl/math/atanf_common.h @@ -1,51 +1,38 @@ /* - * Single-precision polynomial evaluation function for scalar and vector + * Single-precision polynomial evaluation function for scalar * atan(x) and atan2(y,x). * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef PL_MATH_ATANF_COMMON_H #define PL_MATH_ATANF_COMMON_H #include "math_config.h" -#include "estrinf.h" - -#if V_SUPPORTED - -#include "v_math.h" - -#define FLT_T v_f32_t -#define P(i) v_f32 (__atanf_poly_data.poly[i]) - -#else - -#define FLT_T float -#define P(i) __atanf_poly_data.poly[i] - -#endif +#include "poly_scalar_f32.h" /* Polynomial used in fast atanf(x) and atan2f(y,x) implementations The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline FLT_T -eval_poly (FLT_T z, FLT_T az, FLT_T shift) +static inline float +eval_poly (float z, float az, float shift) { /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, a standard implementation using z8 creates spurious underflow in the very last fma (when z^8 is small enough). Therefore, we split the last fma into a mul and an fma. Horner and single-level Estrin have higher errors that exceed threshold. */ - FLT_T z2 = z * z; - FLT_T z4 = z2 * z2; + float z2 = z * z; + float z4 = z2 * z2; /* Then assemble polynomial.
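(Editorial note, not from the source: for z = 0x1p-20, forming z^8 = 0x1p-160 explicitly would be flushed to zero and raise spurious underflow in single precision, whereas the exact intermediate product inside the final fma avoids this.)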
*/ - FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P)); - + float y = fmaf ( + z4, z4 * pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly + 4), + pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly)); /* Finalize: y = shift + z + z^3 * P(z^2). */ - return FMA (y, z2 * az, az) + shift; + return fmaf (y, z2 * az, az) + shift; } #endif // PL_MATH_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/atanh_3u.c b/contrib/arm-optimized-routines/pl/math/atanh_3u.c index a168cd555ff6..dcfbe8192a22 100644 --- a/contrib/arm-optimized-routines/pl/math/atanh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/atanh_3u.c @@ -1,86 +1,83 @@ /* * Double-precision atanh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "estrin.h" +#include "poly_scalar_f64.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 #define One 0x3ff0000000000000 #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 #define OneMHfRt2Top \ 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ #define OneTop12 0x3ff #define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ #define BottomMask 0xffffffff -#define C(i) __log1p_data.coeffs[i] static inline double log1p_inline (double x) { /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced interval. Copied from log1p_2u.c, with no special-case handling. See that file for details of the algorithm. */ double m = x + 1; uint64_t mi = asuint64 (m); /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in [sqrt(2)/2, sqrt(2)]. */ uint32_t u = (mi >> 32) + OneMHfRt2Top; int32_t k = (int32_t) (u >> 20) - OneTop12; uint32_t utop = (u & 0x000fffff) + HfRt2Top; uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); double f = asdouble (u_red) - 1; /* Correction term for round-off in f. */ double cm = (x - (m - 1)) / m; /* Approximate log1p(f) with polynomial. */ double f2 = f * f; double f4 = f2 * f2; double f8 = f4 * f4; - double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f); + double p = fma ( + f, estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs) * f, f); /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */ double kd = k; double y = fma (Ln2Lo, kd, cm); return y + fma (Ln2Hi, kd, p); } /* Approximation for double-precision inverse tanh(x), using a simplified version of log1p. Greatest observed error is 3.00 ULP: atanh(0x1.e58f3c108d714p-4) got 0x1.e7da77672a647p-4 want 0x1.e7da77672a64ap-4.
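A hedged sanity check of the reduction (editorial, rounded values): atanh(0.5) = 0.5 * log1p(2 * 0.5 / (1 - 0.5)) = 0.5 * ln(3) ~ 0.54931.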
*/ double atanh (double x) { uint64_t ix = asuint64 (x); uint64_t sign = ix & ~AbsMask; uint64_t ia = ix & AbsMask; if (unlikely (ia == One)) return __math_divzero (sign >> 32); if (unlikely (ia > One)) return __math_invalid (x); double halfsign = asdouble (Half | sign); double ax = asdouble (ia); return halfsign * log1p_inline ((2 * ax) / (1 - ax)); } PL_SIG (S, D, 1, atanh, -1.0, 1.0) PL_TEST_ULP (atanh, 3.00) -PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000) -PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000) -PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000) -PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000) -PL_TEST_INTERVAL (atanh, 1, inf, 100) -PL_TEST_INTERVAL (atanh, -1, -inf, 100) +PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000) +PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000) +PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c index fb90aa29c7a3..e99d5a9900a9 100644 --- a/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c +++ b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c @@ -1,88 +1,86 @@ /* * Single-precision atanh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 #define One 0x3f800000 #define Four 0x40800000 #define Ln2 0x1.62e43p-1f -#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ +/* asuint(0x1p-12), below which atanhf(x) rounds to x. */ +#define TinyBound 0x39800000 #define C(i) __log1pf_data.coeffs[i] static inline float eval_poly (float m) { /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ float p_12 = fmaf (m, C (1), C (0)); float p_34 = fmaf (m, C (3), C (2)); float p_56 = fmaf (m, C (5), C (4)); float p_78 = fmaf (m, C (7), C (6)); float m2 = m * m; float p_02 = fmaf (m2, p_12, m); float p_36 = fmaf (m2, p_56, p_34); float p_79 = fmaf (m2, C (8), p_78); float m4 = m2 * m2; float p_06 = fmaf (m4, p_36, p_02); return fmaf (m4 * p_79, m4, p_06); } static inline float log1pf_inline (float x) { /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no special-case handling. See that file for details of the algorithm. */ float m = x + 1.0f; int k = (asuint (m) - 0x3f400000) & 0xff800000; float s = asfloat (Four - k); float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f); float p = eval_poly (m_scale); float scale_back = (float) k * 0x1.0p-23f; return fmaf (scale_back, Ln2, p); } /* Approximation for single-precision inverse tanh(x), using a simplified version of log1p. Maximum error is 3.08 ULP: atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5 want 0x1.ffb76ep-5. 
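A hedged worked example of the log1pf_inline reduction (editorial, assuming round-to-nearest): x = 3 gives m = 4 and k encoding a scale of 2^2, so m_scale evaluates to 0, the polynomial contributes nothing, and the result collapses to 2 * Ln2 = ln(4) = log1p(3).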
*/ float atanhf (float x) { uint32_t ix = asuint (x); uint32_t iax = ix & AbsMask; uint32_t sign = ix & ~AbsMask; if (unlikely (iax < TinyBound)) return x; if (iax == One) return __math_divzero (sign); if (unlikely (iax > One)) return __math_invalidf (x); float halfsign = asfloat (Half | sign); float ax = asfloat (iax); return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); } PL_SIG (S, F, 1, atanh, -1.0, 1.0) PL_TEST_ULP (atanhf, 2.59) -PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500) -PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000) -PL_TEST_INTERVAL (atanhf, 1, inf, 1000) -PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500) -PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000) -PL_TEST_INTERVAL (atanhf, -1, -inf, 1000) +PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500) +PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000) +PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c index 83715dd18a3e..80be83c4470c 100644 --- a/contrib/arm-optimized-routines/pl/math/cbrt_2u.c +++ b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c @@ -1,70 +1,69 @@ /* * Double-precision cbrt(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" PL_SIG (S, D, 1, cbrt, -10.0, 10.0) #define AbsMask 0x7fffffffffffffff #define TwoThirds 0x1.5555555555555p-1 #define C(i) __cbrt_data.poly[i] #define T(i) __cbrt_data.table[i] /* Approximation for double-precision cbrt(x), using low-order polynomial and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat according to the exponent, for instance an error observed for double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an integer. cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 want 0x1.965fe72821e99p+0. */ double cbrt (double x) { uint64_t ix = asuint64 (x); uint64_t iax = ix & AbsMask; uint64_t sign = ix & ~AbsMask; - if (unlikely (iax == 0 || iax == 0x7f80000000000000)) + if (unlikely (iax == 0 || iax == 0x7ff0000000000000)) return x; /* |x| = m * 2^e, where m is in [0.5, 1.0]. We can easily decompose x into m and e using frexp. */ int e; double m = frexp (asdouble (iax), &e); /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for Newton iterations. */ double p_01 = fma (C (1), m, C (0)); double p_23 = fma (C (3), m, C (2)); double p = fma (p_23, m * m, p_01); /* Two iterations of Newton's method for iteratively approximating cbrt. */ double m_by_3 = m / 3; double a = fma (TwoThirds, p, m_by_3 / (p * p)); a = fma (TwoThirds, a, m_by_3 / (a * a)); /* Assemble the result by the following: cbrt(x) = cbrt(m) * 2 ^ (e / 3). Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. i is an integer in [-2, 2], so t can be looked up in the table T. Hence the result is assembled as: cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. Which can be done easily using ldexp. 
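(Editorial example with exact values: x = 8 gives m = 0.5 and e = 4, so e / 3 = 1, e % 3 = 1 and t = T(3) = 2^(1/3); with a ~ cbrt(0.5) ~ 0.79370, ldexp (a * t, 1) = 2 = cbrt(8).)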
*/ return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); } PL_TEST_ULP (cbrt, 1.30) -PL_TEST_INTERVAL (cbrt, 0, inf, 1000000) -PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000) +PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c index adc591786a6a..88fcb7162ef6 100644 --- a/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c +++ b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c @@ -1,67 +1,66 @@ /* * Single-precision cbrt(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrinf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffff #define SignMask 0x80000000 #define TwoThirds 0x1.555556p-1f -#define C(i) __cbrtf_data.poly[i] #define T(i) __cbrtf_data.table[i] /* Approximation for single-precision cbrt(x), using low-order polynomial and one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This is observed for every value where the mantissa is 0x1.81410e and the exponent is a multiple of 3, for example: cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 want 0x1.255d92p+10. */ float cbrtf (float x) { uint32_t ix = asuint (x); uint32_t iax = ix & AbsMask; uint32_t sign = ix & SignMask; if (unlikely (iax == 0 || iax == 0x7f800000)) return x; /* |x| = m * 2^e, where m is in [0.5, 1.0]. We can easily decompose x into m and e using frexpf. */ int e; float m = frexpf (asfloat (iax), &e); /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, the less accurate the next stage of the algorithm needs to be. An order-4 polynomial is enough for one Newton iteration. */ - float p = ESTRIN_3 (m, m * m, C); + float p = pairwise_poly_3_f32 (m, m * m, __cbrtf_data.poly); + /* One iteration of Newton's method for iteratively approximating cbrt. */ float m_by_3 = m / 3; float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); /* Assemble the result by the following: cbrt(x) = cbrt(m) * 2 ^ (e / 3). Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. i is an integer in [-2, 2], so t can be looked up in the table T. Hence the result is assembled as: cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. Which can be done easily using ldexpf. */ return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); } PL_SIG (S, F, 1, cbrt, -10.0, 10.0) PL_TEST_ULP (cbrtf, 1.03) -PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000) -PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000) +PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cosh_2u.c b/contrib/arm-optimized-routines/pl/math/cosh_2u.c index 5d1df0717453..2240a9c56f15 100644 --- a/contrib/arm-optimized-routines/pl/math/cosh_2u.c +++ b/contrib/arm-optimized-routines/pl/math/cosh_2u.c @@ -1,66 +1,63 @@ /* * Double-precision cosh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define SpecialBound \ 0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */ double __exp_dd (double, double); static double specialcase (double x, uint64_t iax) { if (iax == 0x7ff0000000000000) return INFINITY; if (iax > 0x7ff0000000000000) return __math_invalid (x); /* exp overflows above SpecialBound. 
At this magnitude cosh(x) is dominated by exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ double t = __exp_dd (asdouble (iax) / 2, 0); return (0.5 * t) * t; } /* Approximation for double-precision cosh(x). cosh(x) = (exp(x) + exp(-x)) / 2. The greatest observed error is in the special region, 1.93 ULP: cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 want 0x1.fdf28623ef923p+1021. The greatest observed error in the non-special region is 1.03 ULP: cosh(0x1.502cd8e56ab3bp+0) got 0x1.fe54962842d0ep+0 want 0x1.fe54962842d0fp+0. */ double cosh (double x) { uint64_t ix = asuint64 (x); uint64_t iax = ix & AbsMask; /* exp overflows a little bit before cosh, so use special-case handler for the gap, as well as special values. */ if (unlikely (iax >= SpecialBound)) return specialcase (x, iax); double ax = asdouble (iax); /* Use double-precision exp helper to calculate exp(x), then: cosh(x) = exp(|x|) / 2 + 1 / (2 exp(|x|)). */ double t = __exp_dd (ax, 0); return 0.5 * t + 0.5 / t; } PL_SIG (S, D, 1, cosh, -10.0, 10.0) PL_TEST_ULP (cosh, 1.43) -PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) -PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000) -PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) -PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000) -PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100) -PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100) +PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/coshf_1u9.c b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c index c125c929aa77..cf737840e0d6 100644 --- a/contrib/arm-optimized-routines/pl/math/coshf_1u9.c +++ b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c @@ -1,71 +1,68 @@ /* * Single-precision cosh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffff #define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ #define SpecialBound \ 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ special case. */ float optr_aor_exp_f32 (float); static NOINLINE float specialcase (float x, uint32_t iax) { if (iax == 0x7f800000) return INFINITY; if (iax > 0x7f800000) return __math_invalidf (x); if (iax <= TinyBound) /* For tiny x, avoid underflow by just returning 1. */ return 1; /* Otherwise SpecialBound <= |x| < Inf. x is too large to calculate exp(x) without overflow, so use exp(|x|/2) instead. For large x cosh(x) is dominated by exp(x), so return: cosh(x) ~= (exp(|x|/2))^2 / 2. */ float t = optr_aor_exp_f32 (asfloat (iax) / 2); return (0.5 * t) * t; } /* Approximation for single-precision cosh(x) using exp. cosh(x) = (exp(x) + exp(-x)) / 2. The maximum error is 1.89 ULP, observed for |x| > SpecialBound: coshf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. The maximum error observed for TinyBound < |x| < SpecialBound is 1.02 ULP: coshf(0x1.50a3cp+0) got 0x1.ff21dcp+0 want 0x1.ff21dap+0. */ float coshf (float x) { uint32_t ix = asuint (x); uint32_t iax = ix & AbsMask; float ax = asfloat (iax); if (unlikely (iax <= TinyBound || iax >= SpecialBound)) { /* x is tiny, large or special. */ return specialcase (x, iax); } /* Compute cosh using the definition: coshf(x) = exp(x) / 2 + exp(-x) / 2.
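(Hedged numeric check, not from the source: for x = 1, t = exp(1) ~ 2.71828 and 0.5f * t + 0.5f / t ~ 1.35914 + 0.18394 = 1.54308 ~ cosh(1).)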
*/ float t = optr_aor_exp_f32 (ax); return 0.5f * t + 0.5f / t; } PL_SIG (S, F, 1, cosh, -10.0, 10.0) PL_TEST_ULP (coshf, 1.89) -PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100) -PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) -PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100) -PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000) +PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100) +PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/cospi_3u1.c b/contrib/arm-optimized-routines/pl/math/cospi_3u1.c new file mode 100644 index 000000000000..4a688a076829 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cospi_3u1.c @@ -0,0 +1,89 @@ +/* + * Double-precision scalar cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_scalar_f64.h" + +/* Taylor series coefficients for sin(pi * x). + C2 coefficient (originally ~=5.16771278) has been split into two parts: + C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278) + This change in magnitude reduces floating point rounding errors. + C2_hi is then reintroduced after the polynomial approximation. */ +static const double poly[] + = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21, + -0x1.012a9870eeb7dp-25 }; + +#define Shift 0x1.8p+52 + +/* Approximation for scalar double-precision cospi(x). + Maximum error: 3.13 ULP: + cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1 + want 0x1.fffffffffd16ep-1. */ +double +cospi (double x) +{ + if (isinf (x)) + return __math_invalid (x); + + double ax = asdouble (asuint64 (x) & ~0x8000000000000000); + + /* Edge cases for when cospi should be exactly 1. (Integers) + 0x1p53 is the limit for double precision to store any decimal places. */ + if (ax >= 0x1p53) + return 1; + + /* If x is an integer, return +- 1, based upon whether x is odd. */ + uint64_t m = (uint64_t) ax; + if (m == ax) + return (m & 1) ? -1 : 1; + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via + cospi(x) ~= 1. */ + if (ax < 0x1p-63) + return 1; + + /* Any non-integer values >= 0x1p51 will be an integer +0.5. + These values should return exactly 0. */ + if (ax >= 0x1p51) + return 0; + + /* n = rint(|x|). */ + double n = ax + Shift; + uint64_t sign = asuint64 (n) << 63; + n = n - Shift; + + /* We know that cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + double r = 0.5 - fabs (ax - n); + + /* y = sin(pi * r). */ + double r2 = r * r; + double y = horner_9_f64 (r2, poly); + y = y * r; + + /* Reintroduce C2_hi. */ + y = fma (-4 * r2, r, y); + + /* As all values are reduced to -1/2 .. 1/2, the result of cospi(x) will + always be positive, therefore the sign must be introduced based upon + whether x rounds to odd or even.
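+ (Editorial examples, rounded values: x = 0.25 rounds to n = 0 (even) with + r = 0.25, giving +sin(pi/4) ~ 0.70711 = cospi(0.25); x = 0.75 rounds to + n = 1 (odd) with r = 0.25 again, and the sign flip gives + -0.70711 = cospi(0.75).)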
*/ + return asdouble (asuint64 (y) ^ sign); +} + +PL_SIG (S, D, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (cospi, 2.63) +PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000) +PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000) +PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000) +PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/cospif_2u6.c b/contrib/arm-optimized-routines/pl/math/cospif_2u6.c new file mode 100644 index 000000000000..d78a98ed8b2d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cospif_2u6.c @@ -0,0 +1,84 @@ +/* + * Single-precision scalar cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Taylor series coefficients for sin(pi * x). */ +#define C0 0x1.921fb6p1f +#define C1 -0x1.4abbcep2f +#define C2 0x1.466bc6p1f +#define C3 -0x1.32d2ccp-1f +#define C4 0x1.50783p-4f +#define C5 -0x1.e30750p-8f + +#define Shift 0x1.0p+23f + +/* Approximation for scalar single-precision cospi(x) - cospif. + Maximum error: 2.64 ULP: + cospif(0x1.37e844p-4) got 0x1.f16b3p-1 + want 0x1.f16b2ap-1. */ +float +cospif (float x) +{ + if (isinf (x)) + return __math_invalidf (x); + + float ax = asfloat (asuint (x) & ~0x80000000); + + /* Edge cases for when cospif should be exactly +/- 1. (Integers) + 0x1p23 is the limit for single precision to store any decimal places. */ + if (ax >= 0x1p24f) + return 1; + + uint32_t m = roundf (ax); + if (m == ax) + return (m & 1) ? -1 : 1; + + /* Any non-integer values >= 0x1p22f will be an integer +0.5. + These values should return exactly 0. */ + if (ax >= 0x1p22f) + return 0; + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via cospi(x) ~= 1 - + (pi*x). */ + if (ax < 0x1p-31f) + return 1 - (C0 * x); + + /* n = rint(|x|). */ + float n = ax + Shift; + uint32_t sign = asuint (n) << 31; + n = n - Shift; + + /* We know that cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + float r = 0.5f - fabs (ax - n); + + /* y = sin(pi * r). */ + float r2 = r * r; + float y = fmaf (C5, r2, C4); + y = fmaf (y, r2, C3); + y = fmaf (y, r2, C2); + y = fmaf (y, r2, C1); + y = fmaf (y, r2, C0); + + /* As all values are reduced to -1/2 .. 1/2, the result of cospi(x) will + always be positive, therefore the sign must be introduced based upon + whether x rounds to odd or even. */ + return asfloat (asuint (y * r) ^ sign); +} + +PL_SIG (S, F, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (cospif, 2.15) +PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000) +PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/erf_2u5.c b/contrib/arm-optimized-routines/pl/math/erf_2u5.c new file mode 100644 index 000000000000..3ca2a1332c1f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erf_2u5.c @@ -0,0 +1,102 @@ +/* + * Double-precision erf(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 +#define Shift 0x1p45 + +/* Polynomial coefficients.
*/ +#define OneThird 0x1.5555555555555p-2 +#define TwoThird 0x1.5555555555555p-1 + +#define TwoOverFifteen 0x1.1111111111111p-3 +#define TwoOverFive 0x1.999999999999ap-2 +#define Tenth 0x1.999999999999ap-4 + +#define TwoOverNine 0x1.c71c71c71c71cp-3 +#define TwoOverFortyFive 0x1.6c16c16c16c17p-5 +#define Sixth 0x1.555555555555p-3 + +/* Fast erf approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3)) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + - 1/90 (4 r^4 - 20 r^2 + 15) d^5 + ] + + Maximum measured error: 2.29 ULP + erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 + want -0x1.20dd59132ebafp-8. */ +double +erf (double x) +{ + /* Get absolute value and sign. */ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & 0x7fffffffffffffff; + uint64_t sign = ix & ~0x7fffffffffffffff; + + /* |x| < 0x1p-508. Triggers exceptions. */ + if (unlikely (ia < 0x2030000000000000)) + return fma (TwoOverSqrtPiMinusOne, x, x); + + if (ia < 0x4017f80000000000) /* |x| < 6 - 1 / 128 = 5.9921875. */ + { + /* Set r to multiple of 1/128 nearest to |x|. */ + double a = asdouble (ia); + double z = a + Shift; + uint64_t i = asuint64 (z) - asuint64 (Shift); + double r = z - Shift; + /* Lookup erf(r) and scale(r) in table. + Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */ + double erfr = __erf_data.tab[i].erf; + double scale = __erf_data.tab[i].scale; + + /* erf(x) ~ erf(r) + scale * d * poly (d, r). */ + double d = a - r; + double r2 = r * r; + double d2 = d * d; + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + double p1 = -r; + double p2 = fma (TwoThird, r2, -OneThird); + double p3 = -r * fma (OneThird, r2, -0.5); + double p4 = fma (fma (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth); + double p5 + = -r * fma (fma (TwoOverFortyFive, r2, -TwoOverNine), r2, Sixth); + + double p34 = fma (p4, d, p3); + double p12 = fma (p2, d, p1); + double y = fma (p5, d2, p34); + y = fma (y, d2, p12); + + y = fma (fma (y, d2, d), scale, erfr); + return asdouble (asuint64 (y) | sign); + } + + /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */ + if (unlikely (ia >= 0x7ff0000000000000)) + return (1.0 - (double) (sign >> 62)) + 1.0 / x; + + /* Boring domain (|x| >= 6.0). */ + return asdouble (sign | asuint64 (1.0)); +} + +PL_SIG (S, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (erf, 1.79) +PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000) +PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000) +PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erf_data.c b/contrib/arm-optimized-routines/pl/math/erf_data.c new file mode 100644 index 000000000000..138e03578e77 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erf_data.c @@ -0,0 +1,788 @@ +/* + * Data for approximation of erf. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in erf. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 6.0 (769 values): + - the first entry __erf_data.tab.erf contains the values of erf(r), + - the second entry __erf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512.
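+ (Editorial note: in erf_2u5.c the index is recovered by adding + Shift = 0x1p45 to |x|, which forces rounding at ulp = 2^-7 = 1/128, so + i = round(128 * |x|); e.g. x = 1.0 maps to i = 128, whose erf entry is + erf(1) ~ 0.84270.)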
*/ +const struct erf_data __erf_data = { + .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, + { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, + { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, + { 0x1.b137e0cf584dcp-6, 0x1.20b4d8bac36c1p+0 }, + { 0x1.20c5645dd2538p-5, 0x1.209546ad13ccfp+0 }, + { 0x1.68e5d3bbc9526p-5, 0x1.206cb4897b148p+0 }, + { 0x1.b0fafef135745p-5, 0x1.203b261cd0052p+0 }, + { 0x1.f902a77bd3821p-5, 0x1.2000a00ae3804p+0 }, + { 0x1.207d480e90658p-4, 0x1.1fbd27cdc72d3p+0 }, + { 0x1.44703e87e8593p-4, 0x1.1f70c3b4f2cc7p+0 }, + { 0x1.68591a1e83b5dp-4, 0x1.1f1b7ae44867fp+0 }, + { 0x1.8c36beb8a8d23p-4, 0x1.1ebd5552f795bp+0 }, + { 0x1.b0081148a873ap-4, 0x1.1e565bca400d4p+0 }, + { 0x1.d3cbf7e70a4b3p-4, 0x1.1de697e413d28p+0 }, + { 0x1.f78159ec8bb50p-4, 0x1.1d6e14099944ap+0 }, + { 0x1.0d939005f65e5p-3, 0x1.1cecdb718d61cp+0 }, + { 0x1.1f5e1a35c3b89p-3, 0x1.1c62fa1e869b6p+0 }, + { 0x1.311fc15f56d14p-3, 0x1.1bd07cdd189acp+0 }, + { 0x1.42d7fc2f64959p-3, 0x1.1b357141d95d5p+0 }, + { 0x1.548642321d7c6p-3, 0x1.1a91e5a748165p+0 }, + { 0x1.662a0bdf7a89fp-3, 0x1.19e5e92b964abp+0 }, + { 0x1.77c2d2a765f9ep-3, 0x1.19318bae53a04p+0 }, + { 0x1.895010fdbdbfdp-3, 0x1.1874ddcdfce24p+0 }, + { 0x1.9ad142662e14dp-3, 0x1.17aff0e56ec10p+0 }, + { 0x1.ac45e37fe2526p-3, 0x1.16e2d7093cd8cp+0 }, + { 0x1.bdad72110a648p-3, 0x1.160da304ed92fp+0 }, + { 0x1.cf076d1233237p-3, 0x1.153068581b781p+0 }, + { 0x1.e05354b96ff36p-3, 0x1.144b3b337c90cp+0 }, + { 0x1.f190aa85540e2p-3, 0x1.135e3075d076bp+0 }, + { 0x1.015f78a3dcf3dp-2, 0x1.12695da8b5bdep+0 }, + { 0x1.09eed6982b948p-2, 0x1.116cd8fd67618p+0 }, + { 0x1.127631eb8de32p-2, 0x1.1068b94962e5ep+0 }, + { 0x1.1af54e232d609p-2, 0x1.0f5d1602f7e41p+0 }, + { 0x1.236bef825d9a2p-2, 0x1.0e4a073dc1b91p+0 }, + { 0x1.2bd9db0f7827fp-2, 0x1.0d2fa5a70c168p+0 }, + { 0x1.343ed6989b7d9p-2, 0x1.0c0e0a8223359p+0 }, + { 0x1.3c9aa8b84bedap-2, 0x1.0ae54fa490722p+0 }, + { 0x1.44ed18d9f6462p-2, 0x1.09b58f724416bp+0 }, + { 0x1.4d35ef3e5372ep-2, 0x1.087ee4d9ad247p+0 }, + { 0x1.5574f4ffac98ep-2, 0x1.07416b4fbfe7cp+0 }, + { 0x1.5da9f415ff23fp-2, 0x1.05fd3ecbec297p+0 }, + { 0x1.65d4b75b00471p-2, 0x1.04b27bc403d30p+0 }, + { 0x1.6df50a8dff772p-2, 0x1.03613f2812dafp+0 }, + { 0x1.760aba57a76bfp-2, 0x1.0209a65e29545p+0 }, + { 0x1.7e15944d9d3e4p-2, 0x1.00abcf3e187a9p+0 }, + { 0x1.861566f5fd3c0p-2, 0x1.fe8fb01a47307p-1 }, + { 0x1.8e0a01cab516bp-2, 0x1.fbbbbef34b4b2p-1 }, + { 0x1.95f3353cbb146p-2, 0x1.f8dc092d58ff8p-1 }, + { 0x1.9dd0d2b721f39p-2, 0x1.f5f0cdaf15313p-1 }, + { 0x1.a5a2aca209394p-2, 0x1.f2fa4c16c0019p-1 }, + { 0x1.ad68966569a87p-2, 0x1.eff8c4b1375dbp-1 }, + { 0x1.b522646bbda68p-2, 0x1.ecec7870ebca7p-1 }, + { 0x1.bccfec24855b8p-2, 0x1.e9d5a8e4c934ep-1 }, + { 0x1.c4710406a65fcp-2, 0x1.e6b4982f158b9p-1 }, + { 0x1.cc058392a6d2dp-2, 0x1.e38988fc46e72p-1 }, + { 0x1.d38d4354c3bd0p-2, 0x1.e054be79d3042p-1 }, + { 0x1.db081ce6e2a48p-2, 0x1.dd167c4cf9d2ap-1 }, + { 0x1.e275eaf25e458p-2, 0x1.d9cf06898cdafp-1 }, + { 0x1.e9d68931ae650p-2, 0x1.d67ea1a8b5368p-1 }, + { 0x1.f129d471eabb1p-2, 0x1.d325927fb9d89p-1 }, + { 0x1.f86faa9428f9dp-2, 0x1.cfc41e36c7df9p-1 }, + { 0x1.ffa7ea8eb5fd0p-2, 0x1.cc5a8a3fbea40p-1 }, + { 0x1.03693a371519cp-1, 0x1.c8e91c4d01368p-1 }, + { 0x1.06f794ab2cae7p-1, 0x1.c5701a484ef9dp-1 }, + { 0x1.0a7ef5c18edd2p-1, 0x1.c1efca49a5011p-1 }, + { 0x1.0dff4f247f6c6p-1, 0x1.be68728e29d5dp-1 }, + { 0x1.1178930ada115p-1, 0x1.bada596f25436p-1 }, + { 0x1.14eab43841b55p-1, 0x1.b745c55905bf8p-1 }, + { 0x1.1855a5fd3dd50p-1, 0x1.b3aafcc27502ep-1 }, + { 0x1.1bb95c3746199p-1, 0x1.b00a46237d5bep-1 }, + { 
0x1.1f15cb50bc4dep-1, 0x1.ac63e7ecc1411p-1 }, + { 0x1.226ae840d4d70p-1, 0x1.a8b8287ec6a09p-1 }, + { 0x1.25b8a88b6dd7fp-1, 0x1.a5074e2157620p-1 }, + { 0x1.28ff0240d52cdp-1, 0x1.a1519efaf889ep-1 }, + { 0x1.2c3debfd7d6c1p-1, 0x1.9d97610879642p-1 }, + { 0x1.2f755ce9a21f4p-1, 0x1.99d8da149c13fp-1 }, + { 0x1.32a54cb8db67bp-1, 0x1.96164fafd8de3p-1 }, + { 0x1.35cdb3a9a144dp-1, 0x1.925007283d7aap-1 }, + { 0x1.38ee8a84beb71p-1, 0x1.8e86458169af8p-1 }, + { 0x1.3c07ca9cb4f9ep-1, 0x1.8ab94f6caa71dp-1 }, + { 0x1.3f196dcd0f135p-1, 0x1.86e9694134b9ep-1 }, + { 0x1.42236e79a5fa6p-1, 0x1.8316d6f48133dp-1 }, + { 0x1.4525c78dd5966p-1, 0x1.7f41dc12c9e89p-1 }, + { 0x1.4820747ba2dc2p-1, 0x1.7b6abbb7aaf19p-1 }, + { 0x1.4b13713ad3513p-1, 0x1.7791b886e7403p-1 }, + { 0x1.4dfeba47f63ccp-1, 0x1.73b714a552763p-1 }, + { 0x1.50e24ca35fd2cp-1, 0x1.6fdb11b1e0c34p-1 }, + { 0x1.53be25d016a4fp-1, 0x1.6bfdf0beddaf5p-1 }, + { 0x1.569243d2b3a9bp-1, 0x1.681ff24b4ab04p-1 }, + { 0x1.595ea53035283p-1, 0x1.6441563c665d4p-1 }, + { 0x1.5c2348ecc4dc3p-1, 0x1.60625bd75d07bp-1 }, + { 0x1.5ee02e8a71a53p-1, 0x1.5c8341bb23767p-1 }, + { 0x1.61955607dd15dp-1, 0x1.58a445da7c74cp-1 }, + { 0x1.6442bfdedd397p-1, 0x1.54c5a57629db0p-1 }, + { 0x1.66e86d0312e82p-1, 0x1.50e79d1749ac9p-1 }, + { 0x1.69865ee075011p-1, 0x1.4d0a6889dfd9fp-1 }, + { 0x1.6c1c9759d0e5fp-1, 0x1.492e42d78d2c5p-1 }, + { 0x1.6eab18c74091bp-1, 0x1.4553664273d24p-1 }, + { 0x1.7131e5f496a5ap-1, 0x1.417a0c4049fd0p-1 }, + { 0x1.73b1021fc0cb8p-1, 0x1.3da26d759aef5p-1 }, + { 0x1.762870f720c6fp-1, 0x1.39ccc1b136d5ap-1 }, + { 0x1.78983697dc96fp-1, 0x1.35f93fe7d1b3dp-1 }, + { 0x1.7b00578c26037p-1, 0x1.32281e2fd1a92p-1 }, + { 0x1.7d60d8c979f7bp-1, 0x1.2e5991bd4cbfcp-1 }, + { 0x1.7fb9bfaed8078p-1, 0x1.2a8dcede3673bp-1 }, + { 0x1.820b1202f27fbp-1, 0x1.26c508f6bd0ffp-1 }, + { 0x1.8454d5f25760dp-1, 0x1.22ff727dd6f7bp-1 }, + { 0x1.8697120d92a4ap-1, 0x1.1f3d3cf9ffe5ap-1 }, + { 0x1.88d1cd474a2e0p-1, 0x1.1b7e98fe26217p-1 }, + { 0x1.8b050ef253c37p-1, 0x1.17c3b626c7a11p-1 }, + { 0x1.8d30debfc572ep-1, 0x1.140cc3173f007p-1 }, + { 0x1.8f5544bd00c04p-1, 0x1.1059ed7740313p-1 }, + { 0x1.91724951b8fc6p-1, 0x1.0cab61f084b93p-1 }, + { 0x1.9387f53df5238p-1, 0x1.09014c2ca74dap-1 }, + { 0x1.959651980da31p-1, 0x1.055bd6d32e8d7p-1 }, + { 0x1.979d67caa6631p-1, 0x1.01bb2b87c6968p-1 }, + { 0x1.999d4192a5715p-1, 0x1.fc3ee5d1524b0p-2 }, + { 0x1.9b95e8fd26abap-1, 0x1.f511a91a67d2ap-2 }, + { 0x1.9d8768656cc42p-1, 0x1.edeeee0959518p-2 }, + { 0x1.9f71ca72cffb6p-1, 0x1.e6d6ffaa65a25p-2 }, + { 0x1.a1551a16aaeafp-1, 0x1.dfca26f5bbf88p-2 }, + { 0x1.a331628a45b92p-1, 0x1.d8c8aace11e63p-2 }, + { 0x1.a506af4cc00f4p-1, 0x1.d1d2cfff91594p-2 }, + { 0x1.a6d50c20fa293p-1, 0x1.cae8d93f1d7b6p-2 }, + { 0x1.a89c850b7d54dp-1, 0x1.c40b0729ed547p-2 }, + { 0x1.aa5d265064366p-1, 0x1.bd3998457afdap-2 }, + { 0x1.ac16fc7143263p-1, 0x1.b674c8ffc6283p-2 }, + { 0x1.adca142b10f98p-1, 0x1.afbcd3afe8ab6p-2 }, + { 0x1.af767a741088bp-1, 0x1.a911f096fbc26p-2 }, + { 0x1.b11c3c79bb424p-1, 0x1.a27455e14c93cp-2 }, + { 0x1.b2bb679ead19cp-1, 0x1.9be437a7de946p-2 }, + { 0x1.b4540978921eep-1, 0x1.9561c7f23a47bp-2 }, + { 0x1.b5e62fce16095p-1, 0x1.8eed36b886d93p-2 }, + { 0x1.b771e894d602ep-1, 0x1.8886b1e5ecfd1p-2 }, + { 0x1.b8f741ef54f83p-1, 0x1.822e655b417e6p-2 }, + { 0x1.ba764a2af2b78p-1, 0x1.7be47af1f5d89p-2 }, + { 0x1.bbef0fbde6221p-1, 0x1.75a91a7f4d2edp-2 }, + { 0x1.bd61a1453ab44p-1, 0x1.6f7c69d7d3ef8p-2 }, + { 0x1.bece0d82d1a5cp-1, 0x1.695e8cd31867ep-2 }, + { 0x1.c034635b66e23p-1, 0x1.634fa54fa285fp-2 }, + { 0x1.c194b1d49a184p-1, 0x1.5d4fd33729015p-2 }, + { 
0x1.c2ef0812fc1bdp-1, 0x1.575f3483021c3p-2 }, + { 0x1.c443755820d64p-1, 0x1.517de540ce2a3p-2 }, + { 0x1.c5920900b5fd1p-1, 0x1.4babff975a04cp-2 }, + { 0x1.c6dad2829ec62p-1, 0x1.45e99bcbb7915p-2 }, + { 0x1.c81de16b14cefp-1, 0x1.4036d0468a7a2p-2 }, + { 0x1.c95b455cce69dp-1, 0x1.3a93b1998736cp-2 }, + { 0x1.ca930e0e2a825p-1, 0x1.35005285227f1p-2 }, + { 0x1.cbc54b476248dp-1, 0x1.2f7cc3fe6f423p-2 }, + { 0x1.ccf20ce0c0d27p-1, 0x1.2a09153529381p-2 }, + { 0x1.ce1962c0e0d8bp-1, 0x1.24a55399ea239p-2 }, + { 0x1.cf3b5cdaf0c39p-1, 0x1.1f518ae487dc8p-2 }, + { 0x1.d0580b2cfd249p-1, 0x1.1a0dc51a9934dp-2 }, + { 0x1.d16f7dbe41ca0p-1, 0x1.14da0a961fd14p-2 }, + { 0x1.d281c49d818d0p-1, 0x1.0fb6620c550afp-2 }, + { 0x1.d38eefdf64fddp-1, 0x1.0aa2d09497f2bp-2 }, + { 0x1.d4970f9ce00d9p-1, 0x1.059f59af7a906p-2 }, + { 0x1.d59a33f19ed42p-1, 0x1.00abff4dec7a3p-2 }, + { 0x1.d6986cfa798e7p-1, 0x1.f79183b101c5bp-3 }, + { 0x1.d791cad3eff01p-1, 0x1.edeb406d9c824p-3 }, + { 0x1.d8865d98abe01p-1, 0x1.e4652fadcb6b2p-3 }, + { 0x1.d97635600bb89p-1, 0x1.daff4969c0b04p-3 }, + { 0x1.da61623cb41e0p-1, 0x1.d1b982c501370p-3 }, + { 0x1.db47f43b2980dp-1, 0x1.c893ce1dcbef7p-3 }, + { 0x1.dc29fb60715afp-1, 0x1.bf8e1b1ca2279p-3 }, + { 0x1.dd0787a8bb39dp-1, 0x1.b6a856c3ed54fp-3 }, + { 0x1.dde0a90611a0dp-1, 0x1.ade26b7fbed95p-3 }, + { 0x1.deb56f5f12d28p-1, 0x1.a53c4135a6526p-3 }, + { 0x1.df85ea8db188ep-1, 0x1.9cb5bd549b111p-3 }, + { 0x1.e0522a5dfda73p-1, 0x1.944ec2e4f5630p-3 }, + { 0x1.e11a3e8cf4eb8p-1, 0x1.8c07329874652p-3 }, + { 0x1.e1de36c75ba58p-1, 0x1.83deeada4d25ap-3 }, + { 0x1.e29e22a89d766p-1, 0x1.7bd5c7df3fe9cp-3 }, + { 0x1.e35a11b9b61cep-1, 0x1.73eba3b5b07b7p-3 }, + { 0x1.e4121370224ccp-1, 0x1.6c205655be71fp-3 }, + { 0x1.e4c6372cd8927p-1, 0x1.6473b5b15a7a1p-3 }, + { 0x1.e5768c3b4a3fcp-1, 0x1.5ce595c455b0ap-3 }, + { 0x1.e62321d06c5e0p-1, 0x1.5575c8a468361p-3 }, + { 0x1.e6cc0709c8a0dp-1, 0x1.4e241e912c305p-3 }, + { 0x1.e7714aec96534p-1, 0x1.46f066040a832p-3 }, + { 0x1.e812fc64db369p-1, 0x1.3fda6bc016994p-3 }, + { 0x1.e8b12a44944a8p-1, 0x1.38e1fae1d6a9dp-3 }, + { 0x1.e94be342e6743p-1, 0x1.3206dceef5f87p-3 }, + { 0x1.e9e335fb56f87p-1, 0x1.2b48d9e5dea1cp-3 }, + { 0x1.ea7730ed0bbb9p-1, 0x1.24a7b84d38971p-3 }, + { 0x1.eb07e27a133aap-1, 0x1.1e233d434b813p-3 }, + { 0x1.eb9558e6b42cep-1, 0x1.17bb2c8d41535p-3 }, + { 0x1.ec1fa258c4beap-1, 0x1.116f48a6476ccp-3 }, + { 0x1.eca6ccd709544p-1, 0x1.0b3f52ce8c383p-3 }, + { 0x1.ed2ae6489ac1ep-1, 0x1.052b0b1a174eap-3 }, + { 0x1.edabfc7453e63p-1, 0x1.fe6460fef4680p-4 }, + { 0x1.ee2a1d004692cp-1, 0x1.f2a901ccafb37p-4 }, + { 0x1.eea5557137ae0p-1, 0x1.e723726b824a9p-4 }, + { 0x1.ef1db32a2277cp-1, 0x1.dbd32ac4c99b0p-4 }, + { 0x1.ef93436bc2daap-1, 0x1.d0b7a0f921e7cp-4 }, + { 0x1.f006135426b26p-1, 0x1.c5d0497c09e74p-4 }, + { 0x1.f0762fde45ee6p-1, 0x1.bb1c972f23e50p-4 }, + { 0x1.f0e3a5e1a1788p-1, 0x1.b09bfb7d11a83p-4 }, + { 0x1.f14e8211e8c55p-1, 0x1.a64de673e8837p-4 }, + { 0x1.f1b6d0fea5f4dp-1, 0x1.9c31c6df3b1b8p-4 }, + { 0x1.f21c9f12f0677p-1, 0x1.92470a61b6965p-4 }, + { 0x1.f27ff89525acfp-1, 0x1.888d1d8e510a3p-4 }, + { 0x1.f2e0e9a6a8b09p-1, 0x1.7f036c0107294p-4 }, + { 0x1.f33f7e43a706bp-1, 0x1.75a96077274bap-4 }, + { 0x1.f39bc242e43e6p-1, 0x1.6c7e64e7281cbp-4 }, + { 0x1.f3f5c1558b19ep-1, 0x1.6381e2980956bp-4 }, + { 0x1.f44d870704911p-1, 0x1.5ab342383d177p-4 }, + { 0x1.f4a31ebcd47dfp-1, 0x1.5211ebf41880bp-4 }, + { 0x1.f4f693b67bd77p-1, 0x1.499d478bca735p-4 }, + { 0x1.f547f10d60597p-1, 0x1.4154bc68d75c3p-4 }, + { 0x1.f59741b4b97cfp-1, 0x1.3937b1b319259p-4 }, + { 0x1.f5e4907982a07p-1, 0x1.31458e6542847p-4 }, + { 
0x1.f62fe80272419p-1, 0x1.297db960e4f63p-4 }, + { 0x1.f67952cff6282p-1, 0x1.21df9981f8e53p-4 }, + { 0x1.f6c0db3c34641p-1, 0x1.1a6a95b1e786fp-4 }, + { 0x1.f7068b7b10fd9p-1, 0x1.131e14fa1625dp-4 }, + { 0x1.f74a6d9a38383p-1, 0x1.0bf97e95f2a64p-4 }, + { 0x1.f78c8b812d498p-1, 0x1.04fc3a0481321p-4 }, + { 0x1.f7cceef15d631p-1, 0x1.fc4b5e32d6259p-5 }, + { 0x1.f80ba18636f07p-1, 0x1.eeea8c1b1db93p-5 }, + { 0x1.f848acb544e95p-1, 0x1.e1d4cf1e2450ap-5 }, + { 0x1.f88419ce4e184p-1, 0x1.d508f9a1ea64ep-5 }, + { 0x1.f8bdf1fb78370p-1, 0x1.c885df3451a07p-5 }, + { 0x1.f8f63e416ebffp-1, 0x1.bc4a54a84e834p-5 }, + { 0x1.f92d077f8d56dp-1, 0x1.b055303221015p-5 }, + { 0x1.f96256700da8ep-1, 0x1.a4a549829587ep-5 }, + { 0x1.f99633a838a57p-1, 0x1.993979e14fffdp-5 }, + { 0x1.f9c8a7989af0dp-1, 0x1.8e109c4622913p-5 }, + { 0x1.f9f9ba8d3c733p-1, 0x1.83298d717210ep-5 }, + { 0x1.fa2974addae45p-1, 0x1.78832c03aa2b1p-5 }, + { 0x1.fa57ddfe27376p-1, 0x1.6e1c5893c380bp-5 }, + { 0x1.fa84fe5e05c8dp-1, 0x1.63f3f5c4de13bp-5 }, + { 0x1.fab0dd89d1309p-1, 0x1.5a08e85af27e0p-5 }, + { 0x1.fadb831a9f9c3p-1, 0x1.505a174e9c929p-5 }, + { 0x1.fb04f6868a944p-1, 0x1.46e66be002240p-5 }, + { 0x1.fb2d3f20f9101p-1, 0x1.3dacd1a8d8ccdp-5 }, + { 0x1.fb54641aebbc9p-1, 0x1.34ac36ad8dafep-5 }, + { 0x1.fb7a6c834b5a2p-1, 0x1.2be38b6d92415p-5 }, + { 0x1.fb9f5f4739170p-1, 0x1.2351c2f2d1449p-5 }, + { 0x1.fbc3433260ca5p-1, 0x1.1af5d2e04f3f6p-5 }, + { 0x1.fbe61eef4cf6ap-1, 0x1.12ceb37ff9bc3p-5 }, + { 0x1.fc07f907bc794p-1, 0x1.0adb5fcfa8c75p-5 }, + { 0x1.fc28d7e4f9cd0p-1, 0x1.031ad58d56279p-5 }, + { 0x1.fc48c1d033c7ap-1, 0x1.f7182a851bca2p-6 }, + { 0x1.fc67bcf2d7b8fp-1, 0x1.e85c449e377f2p-6 }, + { 0x1.fc85cf56ecd38p-1, 0x1.da0005e5f28dfp-6 }, + { 0x1.fca2fee770c79p-1, 0x1.cc0180af00a8bp-6 }, + { 0x1.fcbf5170b578bp-1, 0x1.be5ecd2fcb5f9p-6 }, + { 0x1.fcdacca0bfb73p-1, 0x1.b1160991ff737p-6 }, + { 0x1.fcf57607a6e7cp-1, 0x1.a4255a00b9f03p-6 }, + { 0x1.fd0f5317f582fp-1, 0x1.978ae8b55ce1bp-6 }, + { 0x1.fd2869270a56fp-1, 0x1.8b44e6031383ep-6 }, + { 0x1.fd40bd6d7a785p-1, 0x1.7f5188610ddc8p-6 }, + { 0x1.fd58550773cb5p-1, 0x1.73af0c737bb45p-6 }, + { 0x1.fd6f34f52013ap-1, 0x1.685bb5134ef13p-6 }, + { 0x1.fd85621b0876dp-1, 0x1.5d55cb54cd53ap-6 }, + { 0x1.fd9ae142795e3p-1, 0x1.529b9e8cf9a1ep-6 }, + { 0x1.fdafb719e6a69p-1, 0x1.482b8455dc491p-6 }, + { 0x1.fdc3e835500b3p-1, 0x1.3e03d891b37dep-6 }, + { 0x1.fdd7790ea5bc0p-1, 0x1.3422fd6d12e2bp-6 }, + { 0x1.fdea6e062d0c9p-1, 0x1.2a875b5ffab56p-6 }, + { 0x1.fdfccb62e52d3p-1, 0x1.212f612dee7fbp-6 }, + { 0x1.fe0e9552ebdd6p-1, 0x1.181983e5133ddp-6 }, + { 0x1.fe1fcfebe2083p-1, 0x1.0f443edc5ce49p-6 }, + { 0x1.fe307f2b503d0p-1, 0x1.06ae13b0d3255p-6 }, + { 0x1.fe40a6f70af4bp-1, 0x1.fcab1483ea7fcp-7 }, + { 0x1.fe504b1d9696cp-1, 0x1.ec72615a894c4p-7 }, + { 0x1.fe5f6f568b301p-1, 0x1.dcaf3691fc448p-7 }, + { 0x1.fe6e1742f7cf6p-1, 0x1.cd5ec93c12431p-7 }, + { 0x1.fe7c466dc57a1p-1, 0x1.be7e5ac24963bp-7 }, + { 0x1.fe8a004c19ae6p-1, 0x1.b00b38d6b3575p-7 }, + { 0x1.fe97483db8670p-1, 0x1.a202bd6372dcep-7 }, + { 0x1.fea4218d6594ap-1, 0x1.94624e78e0fafp-7 }, + { 0x1.feb08f7146046p-1, 0x1.87275e3a6869dp-7 }, + { 0x1.febc950b3fa75p-1, 0x1.7a4f6aca256cbp-7 }, + { 0x1.fec835695932ep-1, 0x1.6dd7fe3358230p-7 }, + { 0x1.fed37386190fbp-1, 0x1.61beae53b72b7p-7 }, + { 0x1.fede5248e38f4p-1, 0x1.56011cc3b036dp-7 }, + { 0x1.fee8d486585eep-1, 0x1.4a9cf6bda3f4cp-7 }, + { 0x1.fef2fd00af31ap-1, 0x1.3f8ff5042a88ep-7 }, + { 0x1.fefcce6813974p-1, 0x1.34d7dbc76d7e5p-7 }, + { 0x1.ff064b5afffbep-1, 0x1.2a727a89a3f14p-7 }, + { 0x1.ff0f766697c76p-1, 0x1.205dac02bd6b9p-7 }, + { 
0x1.ff18520700971p-1, 0x1.1697560347b25p-7 }, + { 0x1.ff20e0a7ba8c2p-1, 0x1.0d1d69569b82dp-7 }, + { 0x1.ff2924a3f7a83p-1, 0x1.03ede1a45bfeep-7 }, + { 0x1.ff312046f2339p-1, 0x1.f60d8aa2a88f2p-8 }, + { 0x1.ff38d5cc4227fp-1, 0x1.e4cc4abf7d065p-8 }, + { 0x1.ff404760319b4p-1, 0x1.d4143a9dfe965p-8 }, + { 0x1.ff47772010262p-1, 0x1.c3e1a5f5c077cp-8 }, + { 0x1.ff4e671a85425p-1, 0x1.b430ecf4a83a8p-8 }, + { 0x1.ff55194fe19dfp-1, 0x1.a4fe83fb9db25p-8 }, + { 0x1.ff5b8fb26f5f6p-1, 0x1.9646f35a76623p-8 }, + { 0x1.ff61cc26c1578p-1, 0x1.8806d70b2fc36p-8 }, + { 0x1.ff67d08401202p-1, 0x1.7a3ade6c8b3e4p-8 }, + { 0x1.ff6d9e943c231p-1, 0x1.6cdfcbfc1e263p-8 }, + { 0x1.ff733814af88cp-1, 0x1.5ff2750fe7820p-8 }, + { 0x1.ff789eb6130c9p-1, 0x1.536fc18f7ce5cp-8 }, + { 0x1.ff7dd41ce2b4dp-1, 0x1.4754abacdf1dcp-8 }, + { 0x1.ff82d9e1a76d8p-1, 0x1.3b9e3f9d06e3fp-8 }, + { 0x1.ff87b1913e853p-1, 0x1.30499b503957fp-8 }, + { 0x1.ff8c5cad200a5p-1, 0x1.2553ee2a336bfp-8 }, + { 0x1.ff90dcaba4096p-1, 0x1.1aba78ba3af89p-8 }, + { 0x1.ff9532f846ab0p-1, 0x1.107a8c7323a6ep-8 }, + { 0x1.ff9960f3eb327p-1, 0x1.06918b6355624p-8 }, + { 0x1.ff9d67f51ddbap-1, 0x1.f9f9cfd9c3035p-9 }, + { 0x1.ffa14948549a7p-1, 0x1.e77448fb66bb9p-9 }, + { 0x1.ffa506302ebaep-1, 0x1.d58da68fd1170p-9 }, + { 0x1.ffa89fe5b3625p-1, 0x1.c4412bf4b8f0bp-9 }, + { 0x1.ffac17988ef4bp-1, 0x1.b38a3af2e55b4p-9 }, + { 0x1.ffaf6e6f4f5c0p-1, 0x1.a3645330550ffp-9 }, + { 0x1.ffb2a5879f35ep-1, 0x1.93cb11a30d765p-9 }, + { 0x1.ffb5bdf67fe6fp-1, 0x1.84ba3004a50d0p-9 }, + { 0x1.ffb8b8c88295fp-1, 0x1.762d84469c18fp-9 }, + { 0x1.ffbb970200110p-1, 0x1.6821000795a03p-9 }, + { 0x1.ffbe599f4f9d9p-1, 0x1.5a90b00981d93p-9 }, + { 0x1.ffc10194fcb64p-1, 0x1.4d78bba8ca5fdp-9 }, + { 0x1.ffc38fcffbb7cp-1, 0x1.40d564548fad7p-9 }, + { 0x1.ffc60535dd7f5p-1, 0x1.34a305080681fp-9 }, + { 0x1.ffc862a501fd7p-1, 0x1.28de11c5031ebp-9 }, + { 0x1.ffcaa8f4c9beap-1, 0x1.1d83170fbf6fbp-9 }, + { 0x1.ffccd8f5c66d1p-1, 0x1.128eb96be8798p-9 }, + { 0x1.ffcef371ea4d7p-1, 0x1.07fdb4dafea5fp-9 }, + { 0x1.ffd0f92cb6ba7p-1, 0x1.fb99b8b8279e1p-10 }, + { 0x1.ffd2eae369a07p-1, 0x1.e7f232d9e2630p-10 }, + { 0x1.ffd4c94d29fdbp-1, 0x1.d4fed7195d7e8p-10 }, + { 0x1.ffd6951b33686p-1, 0x1.c2b9cf7f893bfp-10 }, + { 0x1.ffd84ef9009eep-1, 0x1.b11d702b3deb1p-10 }, + { 0x1.ffd9f78c7524ap-1, 0x1.a024365f771bdp-10 }, + { 0x1.ffdb8f7605ee7p-1, 0x1.8fc8c794b03b5p-10 }, + { 0x1.ffdd1750e1220p-1, 0x1.8005f08d6f1efp-10 }, + { 0x1.ffde8fb314ebfp-1, 0x1.70d6a46e07ddap-10 }, + { 0x1.ffdff92db56e5p-1, 0x1.6235fbd7a4345p-10 }, + { 0x1.ffe1544d01ccbp-1, 0x1.541f340697987p-10 }, + { 0x1.ffe2a1988857cp-1, 0x1.468dadf4080abp-10 }, + { 0x1.ffe3e19349dc7p-1, 0x1.397ced7af2b15p-10 }, + { 0x1.ffe514bbdc197p-1, 0x1.2ce898809244ep-10 }, + { 0x1.ffe63b8c8b5f7p-1, 0x1.20cc76202c5fap-10 }, + { 0x1.ffe7567b7b5e1p-1, 0x1.15246dda49d47p-10 }, + { 0x1.ffe865fac722bp-1, 0x1.09ec86c75d497p-10 }, + { 0x1.ffe96a78a04a9p-1, 0x1.fe41cd9bb4eeep-11 }, + { 0x1.ffea645f6d6dap-1, 0x1.e97ba3b77f306p-11 }, + { 0x1.ffeb5415e7c44p-1, 0x1.d57f524723822p-11 }, + { 0x1.ffec39ff380b9p-1, 0x1.c245d4b998479p-11 }, + { 0x1.ffed167b12ac2p-1, 0x1.afc85e0f82e12p-11 }, + { 0x1.ffede9e5d3262p-1, 0x1.9e005769dbc1dp-11 }, + { 0x1.ffeeb49896c6dp-1, 0x1.8ce75e9f6f8a0p-11 }, + { 0x1.ffef76e956a9fp-1, 0x1.7c7744d9378f7p-11 }, + { 0x1.fff0312b010b5p-1, 0x1.6caa0d3582fe9p-11 }, + { 0x1.fff0e3ad91ec2p-1, 0x1.5d79eb71e893bp-11 }, + { 0x1.fff18ebe2b0e1p-1, 0x1.4ee1429bf7cc0p-11 }, + { 0x1.fff232a72b48ep-1, 0x1.40daa3c89f5b6p-11 }, + { 0x1.fff2cfb0453d9p-1, 0x1.3360ccd23db3ap-11 }, + { 0x1.fff3661e9569dp-1, 
0x1.266ea71d4f71ap-11 }, + { 0x1.fff3f634b79f9p-1, 0x1.19ff4663ae9dfp-11 }, + { 0x1.fff48032dbe40p-1, 0x1.0e0de78654d1ep-11 }, + { 0x1.fff50456dab8cp-1, 0x1.0295ef6591848p-11 }, + { 0x1.fff582dc48d30p-1, 0x1.ef25d37f49fe1p-12 }, + { 0x1.fff5fbfc8a439p-1, 0x1.da01102b5f851p-12 }, + { 0x1.fff66feee5129p-1, 0x1.c5b5412dcafadp-12 }, + { 0x1.fff6dee89352ep-1, 0x1.b23a5a23e4210p-12 }, + { 0x1.fff7491cd4af6p-1, 0x1.9f8893d8fd1c1p-12 }, + { 0x1.fff7aebcff755p-1, 0x1.8d986a4187285p-12 }, + { 0x1.fff80ff8911fdp-1, 0x1.7c629a822bc9ep-12 }, + { 0x1.fff86cfd3e657p-1, 0x1.6be02102b3520p-12 }, + { 0x1.fff8c5f702ccfp-1, 0x1.5c0a378c90bcap-12 }, + { 0x1.fff91b102fca8p-1, 0x1.4cda5374ea275p-12 }, + { 0x1.fff96c717b695p-1, 0x1.3e4a23d1f4702p-12 }, + { 0x1.fff9ba420e834p-1, 0x1.30538fbb77ecdp-12 }, + { 0x1.fffa04a7928b1p-1, 0x1.22f0b496539bdp-12 }, + { 0x1.fffa4bc63ee9ap-1, 0x1.161be46ad3b50p-12 }, + { 0x1.fffa8fc0e5f33p-1, 0x1.09cfa445b00ffp-12 }, + { 0x1.fffad0b901755p-1, 0x1.fc0d55470cf51p-13 }, + { 0x1.fffb0ecebee1bp-1, 0x1.e577bbcd49935p-13 }, + { 0x1.fffb4a210b172p-1, 0x1.cfd4a5adec5bfp-13 }, + { 0x1.fffb82cd9dcbfp-1, 0x1.bb1a9657ce465p-13 }, + { 0x1.fffbb8f1049c6p-1, 0x1.a740684026555p-13 }, + { 0x1.fffbeca6adbe9p-1, 0x1.943d4a1d1ed39p-13 }, + { 0x1.fffc1e08f25f5p-1, 0x1.8208bc334a6a5p-13 }, + { 0x1.fffc4d3120aa1p-1, 0x1.709a8db59f25cp-13 }, + { 0x1.fffc7a37857d2p-1, 0x1.5feada379d8b7p-13 }, + { 0x1.fffca53375ce3p-1, 0x1.4ff207314a102p-13 }, + { 0x1.fffcce3b57bffp-1, 0x1.40a8c1949f75ep-13 }, + { 0x1.fffcf564ab6b7p-1, 0x1.3207fb7420eb9p-13 }, + { 0x1.fffd1ac4135f9p-1, 0x1.2408e9ba3327fp-13 }, + { 0x1.fffd3e6d5cd87p-1, 0x1.16a501f0e42cap-13 }, + { 0x1.fffd607387b07p-1, 0x1.09d5f819c9e29p-13 }, + { 0x1.fffd80e8ce0dap-1, 0x1.fb2b792b40a22p-14 }, + { 0x1.fffd9fdeabccep-1, 0x1.e3bcf436a1a95p-14 }, + { 0x1.fffdbd65e5ad0p-1, 0x1.cd55277c18d05p-14 }, + { 0x1.fffdd98e903b2p-1, 0x1.b7e94604479dcp-14 }, + { 0x1.fffdf46816833p-1, 0x1.a36eec00926ddp-14 }, + { 0x1.fffe0e0140857p-1, 0x1.8fdc1b2dcf7b9p-14 }, + { 0x1.fffe26683972ap-1, 0x1.7d2737527c3f9p-14 }, + { 0x1.fffe3daa95b18p-1, 0x1.6b4702d7d5849p-14 }, + { 0x1.fffe53d558ae9p-1, 0x1.5a329b7d30748p-14 }, + { 0x1.fffe68f4fa777p-1, 0x1.49e17724f4d41p-14 }, + { 0x1.fffe7d156d244p-1, 0x1.3a4b60ba9aa4dp-14 }, + { 0x1.fffe904222101p-1, 0x1.2b6875310f785p-14 }, + { 0x1.fffea2860ee1ep-1, 0x1.1d312098e9dbap-14 }, + { 0x1.fffeb3ebb267bp-1, 0x1.0f9e1b4dd36dfp-14 }, + { 0x1.fffec47d19457p-1, 0x1.02a8673a94691p-14 }, + { 0x1.fffed443e2787p-1, 0x1.ec929a665b449p-15 }, + { 0x1.fffee34943b15p-1, 0x1.d4f4b4c8e09edp-15 }, + { 0x1.fffef1960d85dp-1, 0x1.be6abbb10a5aap-15 }, + { 0x1.fffeff32af7afp-1, 0x1.a8e8cc1fadef6p-15 }, + { 0x1.ffff0c273bea2p-1, 0x1.94637d5bacfdbp-15 }, + { 0x1.ffff187b6bc0ep-1, 0x1.80cfdc72220cfp-15 }, + { 0x1.ffff2436a21dcp-1, 0x1.6e2367dc27f95p-15 }, + { 0x1.ffff2f5fefcaap-1, 0x1.5c540b4936fd2p-15 }, + { 0x1.ffff39fe16963p-1, 0x1.4b581b8d170fcp-15 }, + { 0x1.ffff44178c8d2p-1, 0x1.3b2652b06c2b2p-15 }, + { 0x1.ffff4db27f146p-1, 0x1.2bb5cc22e5db6p-15 }, + { 0x1.ffff56d4d5e5ep-1, 0x1.1cfe010e2052dp-15 }, + { 0x1.ffff5f8435efcp-1, 0x1.0ef6c4c84a0fep-15 }, + { 0x1.ffff67c604180p-1, 0x1.01984165a5f36p-15 }, + { 0x1.ffff6f9f67e55p-1, 0x1.e9b5e8d00ce76p-16 }, + { 0x1.ffff77154e0d6p-1, 0x1.d16f5716c6c1ap-16 }, + { 0x1.ffff7e2c6aea2p-1, 0x1.ba4f035d60e02p-16 }, + { 0x1.ffff84e93cd75p-1, 0x1.a447b7b03f045p-16 }, + { 0x1.ffff8b500e77cp-1, 0x1.8f4ccca7fc90dp-16 }, + { 0x1.ffff9164f8e46p-1, 0x1.7b5223dac7336p-16 }, + { 0x1.ffff972be5c59p-1, 0x1.684c227fcacefp-16 }, + { 
0x1.ffff9ca891572p-1, 0x1.562fac4329b48p-16 }, + { 0x1.ffffa1de8c582p-1, 0x1.44f21e49054f2p-16 }, + { 0x1.ffffa6d13de73p-1, 0x1.34894a5e24657p-16 }, + { 0x1.ffffab83e54b8p-1, 0x1.24eb7254ccf83p-16 }, + { 0x1.ffffaff99bac4p-1, 0x1.160f438c70913p-16 }, + { 0x1.ffffb43555b5fp-1, 0x1.07ebd2a2d2844p-16 }, + { 0x1.ffffb839e52f3p-1, 0x1.f4f12e9ab070ap-17 }, + { 0x1.ffffbc09fa7cdp-1, 0x1.db5ad0b27805cp-17 }, + { 0x1.ffffbfa82616bp-1, 0x1.c304efa2c6f4ep-17 }, + { 0x1.ffffc316d9ed0p-1, 0x1.abe09e9144b5ep-17 }, + { 0x1.ffffc6586abf6p-1, 0x1.95df988e76644p-17 }, + { 0x1.ffffc96f1165ep-1, 0x1.80f439b4ee04bp-17 }, + { 0x1.ffffcc5cec0c1p-1, 0x1.6d11788a69c64p-17 }, + { 0x1.ffffcf23ff5fcp-1, 0x1.5a2adfa0b4bc4p-17 }, + { 0x1.ffffd1c637b2bp-1, 0x1.4834877429b8fp-17 }, + { 0x1.ffffd4456a10dp-1, 0x1.37231085c7d9ap-17 }, + { 0x1.ffffd6a3554a1p-1, 0x1.26eb9daed6f7ep-17 }, + { 0x1.ffffd8e1a2f22p-1, 0x1.1783ceac28910p-17 }, + { 0x1.ffffdb01e8546p-1, 0x1.08e1badf0fcedp-17 }, + { 0x1.ffffdd05a75eap-1, 0x1.f5f7d88472604p-18 }, + { 0x1.ffffdeee4f810p-1, 0x1.db92b5212fb8dp-18 }, + { 0x1.ffffe0bd3e852p-1, 0x1.c282cd3957edap-18 }, + { 0x1.ffffe273c15b7p-1, 0x1.aab7abace48dcp-18 }, + { 0x1.ffffe41314e06p-1, 0x1.94219bfcb4928p-18 }, + { 0x1.ffffe59c6698bp-1, 0x1.7eb1a2075864dp-18 }, + { 0x1.ffffe710d565ep-1, 0x1.6a597219a93d9p-18 }, + { 0x1.ffffe8717232dp-1, 0x1.570b69502f313p-18 }, + { 0x1.ffffe9bf4098cp-1, 0x1.44ba864670882p-18 }, + { 0x1.ffffeafb377d5p-1, 0x1.335a62115bce2p-18 }, + { 0x1.ffffec2641a9ep-1, 0x1.22df298214423p-18 }, + { 0x1.ffffed413e5b7p-1, 0x1.133d96ae7e0ddp-18 }, + { 0x1.ffffee4d01cd6p-1, 0x1.046aeabcfcdecp-18 }, + { 0x1.ffffef4a55bd4p-1, 0x1.ecb9cfe1d8642p-19 }, + { 0x1.fffff039f9e8fp-1, 0x1.d21397ead99cbp-19 }, + { 0x1.fffff11ca4876p-1, 0x1.b8d094c86d374p-19 }, + { 0x1.fffff1f302bc1p-1, 0x1.a0df0f0c626dcp-19 }, + { 0x1.fffff2bdb904dp-1, 0x1.8a2e269750a39p-19 }, + { 0x1.fffff37d63a36p-1, 0x1.74adc8f4064d3p-19 }, + { 0x1.fffff43297019p-1, 0x1.604ea819f007cp-19 }, + { 0x1.fffff4dde0118p-1, 0x1.4d0231928c6f9p-19 }, + { 0x1.fffff57fc4a95p-1, 0x1.3aba85fe22e1fp-19 }, + { 0x1.fffff618c3da6p-1, 0x1.296a70f414053p-19 }, + { 0x1.fffff6a956450p-1, 0x1.1905613b3abf2p-19 }, + { 0x1.fffff731ee681p-1, 0x1.097f6156f32c5p-19 }, + { 0x1.fffff7b2f8ed6p-1, 0x1.f59a20caf6695p-20 }, + { 0x1.fffff82cdcf1bp-1, 0x1.d9c73698fb1dcp-20 }, + { 0x1.fffff89ffc4aap-1, 0x1.bf716c6168baep-20 }, + { 0x1.fffff90cb3c81p-1, 0x1.a6852c6b58392p-20 }, + { 0x1.fffff9735b73bp-1, 0x1.8eefd70594a88p-20 }, + { 0x1.fffff9d446cccp-1, 0x1.789fb715aae95p-20 }, + { 0x1.fffffa2fc5015p-1, 0x1.6383f726a8e04p-20 }, + { 0x1.fffffa8621251p-1, 0x1.4f8c96f26a26ap-20 }, + { 0x1.fffffad7a2652p-1, 0x1.3caa61607f920p-20 }, + { 0x1.fffffb248c39dp-1, 0x1.2acee2f5ecdb8p-20 }, + { 0x1.fffffb6d1e95dp-1, 0x1.19ec60b1242edp-20 }, + { 0x1.fffffbb196132p-1, 0x1.09f5cf4dd2877p-20 }, + { 0x1.fffffbf22c1e2p-1, 0x1.f5bd95d8730d8p-21 }, + { 0x1.fffffc2f171e3p-1, 0x1.d9371e2ff7c35p-21 }, + { 0x1.fffffc688a9cfp-1, 0x1.be41de54d155ap-21 }, + { 0x1.fffffc9eb76acp-1, 0x1.a4c89e08ef4f3p-21 }, + { 0x1.fffffcd1cbc28p-1, 0x1.8cb738399b12cp-21 }, + { 0x1.fffffd01f36afp-1, 0x1.75fa8dbc84becp-21 }, + { 0x1.fffffd2f57d68p-1, 0x1.608078a70dcbcp-21 }, + { 0x1.fffffd5a2041fp-1, 0x1.4c37c0394d094p-21 }, + { 0x1.fffffd8271d12p-1, 0x1.39100d5687bfep-21 }, + { 0x1.fffffda86faa9p-1, 0x1.26f9df8519bd6p-21 }, + { 0x1.fffffdcc3b117p-1, 0x1.15e6827001f18p-21 }, + { 0x1.fffffdedf37edp-1, 0x1.05c803e4831c1p-21 }, + { 0x1.fffffe0db6b91p-1, 0x1.ed22548cffd35p-22 }, + { 0x1.fffffe2ba0ea5p-1, 
0x1.d06ad6ecdf971p-22 }, + { 0x1.fffffe47ccb60p-1, 0x1.b551c847fbc96p-22 }, + { 0x1.fffffe62534d4p-1, 0x1.9bc09f112b494p-22 }, + { 0x1.fffffe7b4c81ep-1, 0x1.83a1ff0aa239dp-22 }, + { 0x1.fffffe92ced93p-1, 0x1.6ce1aa3fd7bddp-22 }, + { 0x1.fffffea8ef9cfp-1, 0x1.576c72b514859p-22 }, + { 0x1.fffffebdc2ec6p-1, 0x1.43302cc4a0da8p-22 }, + { 0x1.fffffed15bcbap-1, 0x1.301ba221dc9bbp-22 }, + { 0x1.fffffee3cc32cp-1, 0x1.1e1e857adc568p-22 }, + { 0x1.fffffef5251c2p-1, 0x1.0d2966b1746f7p-22 }, + { 0x1.ffffff0576917p-1, 0x1.fa5b4f49cc6b2p-23 }, + { 0x1.ffffff14cfb92p-1, 0x1.dc3ae30b55c16p-23 }, + { 0x1.ffffff233ee1dp-1, 0x1.bfd7555a3bd68p-23 }, + { 0x1.ffffff30d18e8p-1, 0x1.a517d9e61628ap-23 }, + { 0x1.ffffff3d9480fp-1, 0x1.8be4f8f6c951fp-23 }, + { 0x1.ffffff4993c46p-1, 0x1.74287ded49339p-23 }, + { 0x1.ffffff54dab72p-1, 0x1.5dcd669f2cd34p-23 }, + { 0x1.ffffff5f74141p-1, 0x1.48bfd38302870p-23 }, + { 0x1.ffffff6969fb8p-1, 0x1.34ecf8a3c124ap-23 }, + { 0x1.ffffff72c5fb6p-1, 0x1.22430f521cbcfp-23 }, + { 0x1.ffffff7b91176p-1, 0x1.10b1488aeb235p-23 }, + { 0x1.ffffff83d3d07p-1, 0x1.0027c00a263a6p-23 }, + { 0x1.ffffff8b962bep-1, 0x1.e12ee004efc37p-24 }, + { 0x1.ffffff92dfba2p-1, 0x1.c3e44ae32b16bp-24 }, + { 0x1.ffffff99b79d2p-1, 0x1.a854ea14102a8p-24 }, + { 0x1.ffffffa0248e8p-1, 0x1.8e6761569f45dp-24 }, + { 0x1.ffffffa62ce54p-1, 0x1.7603bac345f65p-24 }, + { 0x1.ffffffabd69b4p-1, 0x1.5f1353cdad001p-24 }, + { 0x1.ffffffb127525p-1, 0x1.4980cb3c80949p-24 }, + { 0x1.ffffffb624592p-1, 0x1.3537f00b6ad4dp-24 }, + { 0x1.ffffffbad2affp-1, 0x1.2225b12bffc68p-24 }, + { 0x1.ffffffbf370cdp-1, 0x1.10380e1adb7e9p-24 }, + { 0x1.ffffffc355dfdp-1, 0x1.febc107d5efaap-25 }, + { 0x1.ffffffc733572p-1, 0x1.df0f2a0ee6946p-25 }, + { 0x1.ffffffcad3626p-1, 0x1.c14b2188bcee4p-25 }, + { 0x1.ffffffce39b67p-1, 0x1.a553644f7f07dp-25 }, + { 0x1.ffffffd169d0cp-1, 0x1.8b0cfce0579dfp-25 }, + { 0x1.ffffffd466fa5p-1, 0x1.725e7c5dd20f7p-25 }, + { 0x1.ffffffd7344aap-1, 0x1.5b2fe547a1340p-25 }, + { 0x1.ffffffd9d4aabp-1, 0x1.456a974e92e93p-25 }, + { 0x1.ffffffdc4ad7ap-1, 0x1.30f93c3699078p-25 }, + { 0x1.ffffffde9964ep-1, 0x1.1dc7b5b978cf8p-25 }, + { 0x1.ffffffe0c2bf0p-1, 0x1.0bc30c5d52f15p-25 }, + { 0x1.ffffffe2c92dbp-1, 0x1.f5b2be65a0c7fp-26 }, + { 0x1.ffffffe4aed5ep-1, 0x1.d5f3a8dea7357p-26 }, + { 0x1.ffffffe675bbdp-1, 0x1.b82915b03515bp-26 }, + { 0x1.ffffffe81fc4ep-1, 0x1.9c3517e789488p-26 }, + { 0x1.ffffffe9aeb97p-1, 0x1.81fb7df06136ep-26 }, + { 0x1.ffffffeb24467p-1, 0x1.6961b8d641d06p-26 }, + { 0x1.ffffffec81ff2p-1, 0x1.524ec4d916caep-26 }, + { 0x1.ffffffedc95e7p-1, 0x1.3cab1343d18d1p-26 }, + { 0x1.ffffffeefbc85p-1, 0x1.2860757487a01p-26 }, + { 0x1.fffffff01a8b6p-1, 0x1.155a09065d4f7p-26 }, + { 0x1.fffffff126e1ep-1, 0x1.0384250e4c9fcp-26 }, + { 0x1.fffffff221f30p-1, 0x1.e59890b926c78p-27 }, + { 0x1.fffffff30cd3fp-1, 0x1.c642116a8a9e3p-27 }, + { 0x1.fffffff3e8892p-1, 0x1.a8e405e651ab6p-27 }, + { 0x1.fffffff4b606fp-1, 0x1.8d5f98114f872p-27 }, + { 0x1.fffffff57632dp-1, 0x1.7397c5a66e307p-27 }, + { 0x1.fffffff629e44p-1, 0x1.5b71456c5a4c4p-27 }, + { 0x1.fffffff6d1e56p-1, 0x1.44d26de513197p-27 }, + { 0x1.fffffff76ef3fp-1, 0x1.2fa31d6371537p-27 }, + { 0x1.fffffff801c1fp-1, 0x1.1bcca373b7b43p-27 }, + { 0x1.fffffff88af67p-1, 0x1.0939ab853339fp-27 }, + { 0x1.fffffff90b2e3p-1, 0x1.efac5187b2863p-28 }, + { 0x1.fffffff982fc1p-1, 0x1.cf1e86235d0e6p-28 }, + { 0x1.fffffff9f2e9fp-1, 0x1.b0a68a2128babp-28 }, + { 0x1.fffffffa5b790p-1, 0x1.9423165bc4444p-28 }, + { 0x1.fffffffabd229p-1, 0x1.7974e743dea3cp-28 }, + { 0x1.fffffffb18582p-1, 0x1.607e9eacd1050p-28 }, + { 
0x1.fffffffb6d844p-1, 0x1.4924a74dec728p-28 }, + { 0x1.fffffffbbd0aap-1, 0x1.334d19e0c2160p-28 }, + { 0x1.fffffffc0748fp-1, 0x1.1edfa3c5f5ccap-28 }, + { 0x1.fffffffc4c96cp-1, 0x1.0bc56f1b54701p-28 }, + { 0x1.fffffffc8d462p-1, 0x1.f3d2185e047d9p-29 }, + { 0x1.fffffffcc9a41p-1, 0x1.d26cb87945e87p-29 }, + { 0x1.fffffffd01f89p-1, 0x1.b334fac4b9f99p-29 }, + { 0x1.fffffffd36871p-1, 0x1.96076f7918d1cp-29 }, + { 0x1.fffffffd678edp-1, 0x1.7ac2d72fc2c63p-29 }, + { 0x1.fffffffd954aep-1, 0x1.614801550319ep-29 }, + { 0x1.fffffffdbff2ap-1, 0x1.4979ac8b28926p-29 }, + { 0x1.fffffffde7ba0p-1, 0x1.333c68e2d0548p-29 }, + { 0x1.fffffffe0cd16p-1, 0x1.1e767bce37dd7p-29 }, + { 0x1.fffffffe2f664p-1, 0x1.0b0fc5b6d05a0p-29 }, + { 0x1.fffffffe4fa30p-1, 0x1.f1e3523b41d7dp-30 }, + { 0x1.fffffffe6daf7p-1, 0x1.d00de6608effep-30 }, + { 0x1.fffffffe89b0cp-1, 0x1.b0778b7b3301ap-30 }, + { 0x1.fffffffea3c9ap-1, 0x1.92fb04ec0f6cfp-30 }, + { 0x1.fffffffebc1a9p-1, 0x1.77756ec9f78fap-30 }, + { 0x1.fffffffed2c21p-1, 0x1.5dc61922d5a06p-30 }, + { 0x1.fffffffee7dc8p-1, 0x1.45ce65699ff6dp-30 }, + { 0x1.fffffffefb847p-1, 0x1.2f71a5f159970p-30 }, + { 0x1.ffffffff0dd2bp-1, 0x1.1a94ff571654fp-30 }, + { 0x1.ffffffff1ede9p-1, 0x1.071f4bbea09ecp-30 }, + { 0x1.ffffffff2ebdap-1, 0x1.e9f1ff8ddd774p-31 }, + { 0x1.ffffffff3d843p-1, 0x1.c818223a202c7p-31 }, + { 0x1.ffffffff4b453p-1, 0x1.a887bd2b4404dp-31 }, + { 0x1.ffffffff58126p-1, 0x1.8b1a336c5eb6bp-31 }, + { 0x1.ffffffff63fc3p-1, 0x1.6fab63324088ap-31 }, + { 0x1.ffffffff6f121p-1, 0x1.56197e30205bap-31 }, + { 0x1.ffffffff79626p-1, 0x1.3e44e45301b92p-31 }, + { 0x1.ffffffff82fabp-1, 0x1.281000bfe4c3fp-31 }, + { 0x1.ffffffff8be77p-1, 0x1.135f28f2d50b4p-31 }, + { 0x1.ffffffff94346p-1, 0x1.00187dded5975p-31 }, + { 0x1.ffffffff9bec8p-1, 0x1.dc479de0ef001p-32 }, + { 0x1.ffffffffa319fp-1, 0x1.bad4fdad3caa1p-32 }, + { 0x1.ffffffffa9c63p-1, 0x1.9baed3ed27ab8p-32 }, + { 0x1.ffffffffaffa4p-1, 0x1.7ead9ce4285bbp-32 }, + { 0x1.ffffffffb5be5p-1, 0x1.63ac6b4edc88ep-32 }, + { 0x1.ffffffffbb1a2p-1, 0x1.4a88be2a6390cp-32 }, + { 0x1.ffffffffc014ep-1, 0x1.332259185f1a0p-32 }, + { 0x1.ffffffffc4b56p-1, 0x1.1d5b1f3793044p-32 }, + { 0x1.ffffffffc901cp-1, 0x1.0916f04b6e18bp-32 }, + { 0x1.ffffffffccfffp-1, 0x1.ec77101de6926p-33 }, + { 0x1.ffffffffd0b56p-1, 0x1.c960bf23153e0p-33 }, + { 0x1.ffffffffd4271p-1, 0x1.a8bd20fc65ef7p-33 }, + { 0x1.ffffffffd759dp-1, 0x1.8a61745ec7d1dp-33 }, + { 0x1.ffffffffda520p-1, 0x1.6e25d0e756261p-33 }, + { 0x1.ffffffffdd13cp-1, 0x1.53e4f7d1666cbp-33 }, + { 0x1.ffffffffdfa2dp-1, 0x1.3b7c27a7ddb0ep-33 }, + { 0x1.ffffffffe202dp-1, 0x1.24caf2c32af14p-33 }, + { 0x1.ffffffffe4371p-1, 0x1.0fb3186804d0fp-33 }, + { 0x1.ffffffffe642ap-1, 0x1.f830c0bb41fd7p-34 }, + { 0x1.ffffffffe8286p-1, 0x1.d3c0f1a91c846p-34 }, + { 0x1.ffffffffe9eb0p-1, 0x1.b1e5acf351d87p-34 }, + { 0x1.ffffffffeb8d0p-1, 0x1.92712d259ce66p-34 }, + { 0x1.ffffffffed10ap-1, 0x1.7538c60a04476p-34 }, + { 0x1.ffffffffee782p-1, 0x1.5a14b04b47879p-34 }, + { 0x1.ffffffffefc57p-1, 0x1.40dfd87456f4cp-34 }, + { 0x1.fffffffff0fa7p-1, 0x1.2977b1172b9d5p-34 }, + { 0x1.fffffffff218fp-1, 0x1.13bc07e891491p-34 }, + { 0x1.fffffffff3227p-1, 0x1.ff1dbb4300811p-35 }, + { 0x1.fffffffff4188p-1, 0x1.d9a880f306bd8p-35 }, + { 0x1.fffffffff4fc9p-1, 0x1.b6e45220b55e0p-35 }, + { 0x1.fffffffff5cfdp-1, 0x1.96a0b33f2c4dap-35 }, + { 0x1.fffffffff6939p-1, 0x1.78b07e9e924acp-35 }, + { 0x1.fffffffff748ep-1, 0x1.5ce9ab1670dd2p-35 }, + { 0x1.fffffffff7f0dp-1, 0x1.4325167006bb0p-35 }, + { 0x1.fffffffff88c5p-1, 0x1.2b3e53538ff3fp-35 }, + { 0x1.fffffffff91c6p-1, 
0x1.15137a7f44864p-35 }, + { 0x1.fffffffff9a1bp-1, 0x1.0084ff125639dp-35 }, + { 0x1.fffffffffa1d2p-1, 0x1.daeb0b7311ec7p-36 }, + { 0x1.fffffffffa8f6p-1, 0x1.b7937d1c40c52p-36 }, + { 0x1.fffffffffaf92p-1, 0x1.96d082f59ab06p-36 }, + { 0x1.fffffffffb5b0p-1, 0x1.7872d9fa10aadp-36 }, + { 0x1.fffffffffbb58p-1, 0x1.5c4e8e37bc7d0p-36 }, + { 0x1.fffffffffc095p-1, 0x1.423ac0df49a40p-36 }, + { 0x1.fffffffffc56dp-1, 0x1.2a117230ad284p-36 }, + { 0x1.fffffffffc9e8p-1, 0x1.13af4f04f9998p-36 }, + { 0x1.fffffffffce0dp-1, 0x1.fde703724e560p-37 }, + { 0x1.fffffffffd1e1p-1, 0x1.d77f0c82e7641p-37 }, + { 0x1.fffffffffd56cp-1, 0x1.b3ee02611d7ddp-37 }, + { 0x1.fffffffffd8b3p-1, 0x1.92ff33023d5bdp-37 }, + { 0x1.fffffffffdbbap-1, 0x1.7481a9e69f53fp-37 }, + { 0x1.fffffffffde86p-1, 0x1.5847eda620959p-37 }, + { 0x1.fffffffffe11dp-1, 0x1.3e27c1fcc74bdp-37 }, + { 0x1.fffffffffe380p-1, 0x1.25f9ee0b923dcp-37 }, + { 0x1.fffffffffe5b6p-1, 0x1.0f9a0686531ffp-37 }, + { 0x1.fffffffffe7c0p-1, 0x1.f5cc7718082afp-38 }, + { 0x1.fffffffffe9a2p-1, 0x1.cf7e53d6a2ca5p-38 }, + { 0x1.fffffffffeb60p-1, 0x1.ac0f5f3229372p-38 }, + { 0x1.fffffffffecfbp-1, 0x1.8b498644847eap-38 }, + { 0x1.fffffffffee77p-1, 0x1.6cfa9bcca59dcp-38 }, + { 0x1.fffffffffefd6p-1, 0x1.50f411d4fd2cdp-38 }, + { 0x1.ffffffffff11ap-1, 0x1.370ab8327af5ep-38 }, + { 0x1.ffffffffff245p-1, 0x1.1f167f88c6b6ep-38 }, + { 0x1.ffffffffff359p-1, 0x1.08f24085d4597p-38 }, + { 0x1.ffffffffff457p-1, 0x1.e8f70e181d619p-39 }, + { 0x1.ffffffffff542p-1, 0x1.c324c20e337dcp-39 }, + { 0x1.ffffffffff61bp-1, 0x1.a03261574b54ep-39 }, + { 0x1.ffffffffff6e3p-1, 0x1.7fe903cdf5855p-39 }, + { 0x1.ffffffffff79bp-1, 0x1.6215c58da3450p-39 }, + { 0x1.ffffffffff845p-1, 0x1.46897d4b69fc6p-39 }, + { 0x1.ffffffffff8e2p-1, 0x1.2d1877d731b7bp-39 }, + { 0x1.ffffffffff973p-1, 0x1.159a386b11517p-39 }, + { 0x1.ffffffffff9f8p-1, 0x1.ffd27ae9393cep-40 }, + { 0x1.ffffffffffa73p-1, 0x1.d7c593130dd0bp-40 }, + { 0x1.ffffffffffae4p-1, 0x1.b2cd607c79bcfp-40 }, + { 0x1.ffffffffffb4cp-1, 0x1.90ae4d3405651p-40 }, + { 0x1.ffffffffffbadp-1, 0x1.71312dd1759e2p-40 }, + { 0x1.ffffffffffc05p-1, 0x1.5422ef5d8949dp-40 }, + { 0x1.ffffffffffc57p-1, 0x1.39544b0ecc957p-40 }, + { 0x1.ffffffffffca2p-1, 0x1.20997f73e73ddp-40 }, + { 0x1.ffffffffffce7p-1, 0x1.09ca0eaacd277p-40 }, + { 0x1.ffffffffffd27p-1, 0x1.e9810295890ecp-41 }, + { 0x1.ffffffffffd62p-1, 0x1.c2b45b5aa4a1dp-41 }, + { 0x1.ffffffffffd98p-1, 0x1.9eee068fa7596p-41 }, + { 0x1.ffffffffffdcap-1, 0x1.7df2b399c10a8p-41 }, + { 0x1.ffffffffffdf8p-1, 0x1.5f8b87a31bd85p-41 }, + { 0x1.ffffffffffe22p-1, 0x1.4385c96e9a2d9p-41 }, + { 0x1.ffffffffffe49p-1, 0x1.29b2933ef4cbcp-41 }, + { 0x1.ffffffffffe6cp-1, 0x1.11e68a6378f8ap-41 }, + { 0x1.ffffffffffe8dp-1, 0x1.f7f338086a86bp-42 }, + { 0x1.ffffffffffeabp-1, 0x1.cf8d7d9ce040ap-42 }, + { 0x1.ffffffffffec7p-1, 0x1.aa577251ae484p-42 }, + { 0x1.ffffffffffee1p-1, 0x1.8811d739efb5ep-42 }, + { 0x1.ffffffffffef8p-1, 0x1.68823e52970bep-42 }, + { 0x1.fffffffffff0ep-1, 0x1.4b72ae68e8b4cp-42 }, + { 0x1.fffffffffff22p-1, 0x1.30b14dbe876bcp-42 }, + { 0x1.fffffffffff34p-1, 0x1.181012ef86610p-42 }, + { 0x1.fffffffffff45p-1, 0x1.01647ba798744p-42 }, + { 0x1.fffffffffff54p-1, 0x1.d90e917701675p-43 }, + { 0x1.fffffffffff62p-1, 0x1.b2a87e86d0c8ap-43 }, + { 0x1.fffffffffff6fp-1, 0x1.8f53dcb377293p-43 }, + { 0x1.fffffffffff7bp-1, 0x1.6ed2f2515e933p-43 }, + { 0x1.fffffffffff86p-1, 0x1.50ecc9ed47f19p-43 }, + { 0x1.fffffffffff90p-1, 0x1.356cd5ce7799ep-43 }, + { 0x1.fffffffffff9ap-1, 0x1.1c229a587ab78p-43 }, + { 0x1.fffffffffffa2p-1, 0x1.04e15ecc7f3f6p-43 }, + { 
0x1.fffffffffffaap-1, 0x1.deffc7e6a6017p-44 }, + { 0x1.fffffffffffb1p-1, 0x1.b7b040832f310p-44 }, + { 0x1.fffffffffffb8p-1, 0x1.938e021f36d76p-44 }, + { 0x1.fffffffffffbep-1, 0x1.7258610b3b233p-44 }, + { 0x1.fffffffffffc3p-1, 0x1.53d3bfc82a909p-44 }, + { 0x1.fffffffffffc8p-1, 0x1.37c92babdc2fdp-44 }, + { 0x1.fffffffffffcdp-1, 0x1.1e06010120f6ap-44 }, + { 0x1.fffffffffffd1p-1, 0x1.065b9616170d4p-44 }, + { 0x1.fffffffffffd5p-1, 0x1.e13dd96b3753ap-45 }, + { 0x1.fffffffffffd9p-1, 0x1.b950d32467392p-45 }, + { 0x1.fffffffffffdcp-1, 0x1.94a72263259a5p-45 }, + { 0x1.fffffffffffdfp-1, 0x1.72fd93e036cdcp-45 }, + { 0x1.fffffffffffe2p-1, 0x1.54164576929abp-45 }, + { 0x1.fffffffffffe4p-1, 0x1.37b83c521fe96p-45 }, + { 0x1.fffffffffffe7p-1, 0x1.1daf033182e96p-45 }, + { 0x1.fffffffffffe9p-1, 0x1.05ca50205d26ap-45 }, + { 0x1.fffffffffffebp-1, 0x1.dfbb6235639fap-46 }, + { 0x1.fffffffffffedp-1, 0x1.b7807e294781fp-46 }, + { 0x1.fffffffffffeep-1, 0x1.9298add70a734p-46 }, + { 0x1.ffffffffffff0p-1, 0x1.70beaf9c7ffb6p-46 }, + { 0x1.ffffffffffff1p-1, 0x1.51b2cd6709222p-46 }, + { 0x1.ffffffffffff3p-1, 0x1.353a6cf7f7fffp-46 }, + { 0x1.ffffffffffff4p-1, 0x1.1b1fa8cbe84a7p-46 }, + { 0x1.ffffffffffff5p-1, 0x1.0330f0fd69921p-46 }, + { 0x1.ffffffffffff6p-1, 0x1.da81670f96f9bp-47 }, + { 0x1.ffffffffffff7p-1, 0x1.b24a16b4d09aap-47 }, + { 0x1.ffffffffffff7p-1, 0x1.8d6eeb6efdbd6p-47 }, + { 0x1.ffffffffffff8p-1, 0x1.6ba91ac734785p-47 }, + { 0x1.ffffffffffff9p-1, 0x1.4cb7966770ab5p-47 }, + { 0x1.ffffffffffff9p-1, 0x1.305e9721d0981p-47 }, + { 0x1.ffffffffffffap-1, 0x1.1667311fff70ap-47 }, + { 0x1.ffffffffffffbp-1, 0x1.fd3de10d62855p-48 }, + { 0x1.ffffffffffffbp-1, 0x1.d1aefbcd48d0cp-48 }, + { 0x1.ffffffffffffbp-1, 0x1.a9cc93c25aca9p-48 }, + { 0x1.ffffffffffffcp-1, 0x1.85487ee3ea735p-48 }, + { 0x1.ffffffffffffcp-1, 0x1.63daf8b4b1e0cp-48 }, + { 0x1.ffffffffffffdp-1, 0x1.45421e69a6ca1p-48 }, + { 0x1.ffffffffffffdp-1, 0x1.294175802d99ap-48 }, + { 0x1.ffffffffffffdp-1, 0x1.0fa17bf41068fp-48 }, + { 0x1.ffffffffffffdp-1, 0x1.f05e82aae2bb9p-49 }, + { 0x1.ffffffffffffep-1, 0x1.c578101b29058p-49 }, + { 0x1.ffffffffffffep-1, 0x1.9e39dc5dd2f7cp-49 }, + { 0x1.ffffffffffffep-1, 0x1.7a553a728bbf2p-49 }, + { 0x1.ffffffffffffep-1, 0x1.5982008db1304p-49 }, + { 0x1.ffffffffffffep-1, 0x1.3b7e00422e51bp-49 }, + { 0x1.ffffffffffffep-1, 0x1.200c898d9ee3ep-49 }, + { 0x1.fffffffffffffp-1, 0x1.06f5f7eb65a56p-49 }, + { 0x1.fffffffffffffp-1, 0x1.e00e9148a1d25p-50 }, + { 0x1.fffffffffffffp-1, 0x1.b623734024e92p-50 }, + { 0x1.fffffffffffffp-1, 0x1.8fd4e01891bf8p-50 }, + { 0x1.fffffffffffffp-1, 0x1.6cd44c7470d89p-50 }, + { 0x1.fffffffffffffp-1, 0x1.4cd9c04158cd7p-50 }, + { 0x1.fffffffffffffp-1, 0x1.2fa34bf5c8344p-50 }, + { 0x1.fffffffffffffp-1, 0x1.14f4890ff2461p-50 }, + { 0x1.fffffffffffffp-1, 0x1.f92c49dfa4df5p-51 }, + { 0x1.fffffffffffffp-1, 0x1.ccaaea71ab0dfp-51 }, + { 0x1.fffffffffffffp-1, 0x1.a40829f001197p-51 }, + { 0x1.0000000000000p+0, 0x1.7eef13b59e96cp-51 }, + { 0x1.0000000000000p+0, 0x1.5d11e1a252bf5p-51 }, + { 0x1.0000000000000p+0, 0x1.3e296303b2297p-51 }, + { 0x1.0000000000000p+0, 0x1.21f47009f43cep-51 }, + { 0x1.0000000000000p+0, 0x1.083768c5e4541p-51 }, + { 0x1.0000000000000p+0, 0x1.e1777d831265ep-52 }, + { 0x1.0000000000000p+0, 0x1.b69f10b0191b5p-52 }, + { 0x1.0000000000000p+0, 0x1.8f8a3a05b5b52p-52 }, + { 0x1.0000000000000p+0, 0x1.6be573c40c8e7p-52 }, + { 0x1.0000000000000p+0, 0x1.4b645ba991fdbp-52 }, + { 0x1.0000000000000p+0, 0x1.2dc119095729fp-52 }, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/erfc_1u8.c 
b/contrib/arm-optimized-routines/pl/math/erfc_1u8.c
new file mode 100644
index 000000000000..7f2004e9335d
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/erfc_1u8.c
@@ -0,0 +1,153 @@
+/*
+ * Double-precision erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Shift 0x1p45
+#define P20 0x1.5555555555555p-2 /* 1/3.  */
+#define P21 0x1.5555555555555p-1 /* 2/3.  */
+
+#define P40 0x1.999999999999ap-4 /* 1/10.  */
+#define P41 0x1.999999999999ap-2 /* 2/5.  */
+#define P42 0x1.1111111111111p-3 /* 2/15.  */
+
+#define P50 0x1.5555555555555p-3 /* 1/6.  */
+#define P51 0x1.c71c71c71c71cp-3 /* 2/9.  */
+#define P52 0x1.6c16c16c16c17p-5 /* 2/45.  */
+
+/* Qi = (i+1) / i.  */
+#define Q5 0x1.3333333333333p0
+#define Q6 0x1.2aaaaaaaaaaabp0
+#define Q7 0x1.2492492492492p0
+#define Q8 0x1.2p0
+#define Q9 0x1.1c71c71c71c72p0
+
+/* Ri = -2 * i / ((i+1)*(i+2)).  */
+#define R5 -0x1.e79e79e79e79ep-3
+#define R6 -0x1.b6db6db6db6dbp-3
+#define R7 -0x1.8e38e38e38e39p-3
+#define R8 -0x1.6c16c16c16c17p-3
+#define R9 -0x1.4f2094f2094f2p-3
+
+/* Fast erfc approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+   poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+                + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+                - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+                + p6(r) d^6 + ... + p10(r) d^10
+
+   Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+   2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+   with p0 = 1, and p1(r) = -r.
+
+   Values of erfc(r) and scale(r) are read from lookup tables. Stored values
+   are scaled to avoid hitting the subnormal range.
+
+   Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+   Maximum measured error: 1.71 ULP
+   erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+                           want 0x1.e15fcbea3e7adp-608.  */
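The recurrence is easy to sanity-check numerically. The short, self-contained program below (illustrative only, not part of the imported sources; the file name and test point are arbitrary) evaluates p6(r)..p10(r) twice: once from the raw three-term recurrence quoted in the comment above, and once from the folded form p_{i+2} = (p_i + Q_{i+1} * r * p_{i+1}) * R_{i+1} that the routine uses, with Qi = (i+1)/i and Ri = -2i/((i+1)(i+2)) exactly as in the defines.

/* check_recurrence.c - sanity check for the p_i recurrence (illustrative,
   not part of the patch).  Build with: cc check_recurrence.c -lm  */
#include <math.h>
#include <stdio.h>

int
main (void)
{
  double r = 1.5; /* Arbitrary expansion point.  */
  double p[11], pf[11];

  /* Raw recurrence: 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
     seeded with p0 = 1 and p1 = -r.  */
  p[0] = 1.0;
  p[1] = -r;
  for (int i = 0; i + 2 <= 10; i++)
    p[i + 2] = -(2 * (i + 1) * p[i] + 2 * r * (i + 2) * p[i + 1])
               / ((i + 2.0) * (i + 3.0));

  /* Folded form: p_{i+2} = (p_i + Q_{i+1} r p_{i+1}) R_{i+1}.  */
  for (int i = 0; i <= 5; i++)
    pf[i] = p[i];
  for (int i = 5; i <= 9; i++)
    {
      double q = (i + 1.0) / i;                        /* Qi.  */
      double rc = -2.0 * i / ((i + 1.0) * (i + 2.0));  /* Ri.  */
      pf[i + 1] = fma (q * r, pf[i], pf[i - 1]) * rc;
    }

  for (int i = 6; i <= 10; i++)
    printf ("p%d: raw = %a, folded = %a\n", i, p[i], pf[i]);
  return 0;
}

The two columns agree up to floating-point rounding, which is why the routine below can produce each of p6..p10 with one fma and one multiply instead of evaluating five fixed degree-6 to degree-10 polynomials in r.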
+double
+erfc (double x)
+{
+  /* Get top words and sign.  */
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & 0x7fffffffffffffff;
+  double a = asdouble (ia);
+  uint64_t sign = ix & ~0x7fffffffffffffff;
+
+  /* erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2.  */
+  if (unlikely (ia >= 0x7ff0000000000000))
+    return asdouble (sign >> 1) + 1.0 / x; /* Special cases.  */
+
+  /* Return early for large enough negative values.  */
+  if (x < -6.0)
+    return 2.0;
+
+  /* For |x| < 3487.0/128.0, the following approximation holds.  */
+  if (likely (ia < 0x403b3e0000000000))
+    {
+      /* |x| < 0x1p-511 => accurate to 0.5 ULP.  */
+      if (unlikely (ia < asuint64 (0x1p-511)))
+        return 1.0 - x;
+
+      /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
+         to 2/sqrt(pi), when x reduced to r = 0.  */
+      double z = a + Shift;
+      uint64_t i = asuint64 (z) - asuint64 (Shift);
+      double r = z - Shift;
+      /* These values are scaled by 2^128.  */
+      double erfcr = __erfc_data.tab[i].erfc;
+      double scale = __erfc_data.tab[i].scale;
+
+      /* erfc(x) ~ erfc(r) - scale * d * poly (r, d).  */
+      double d = a - r;
+      double d2 = d * d;
+      double r2 = r * r;
+      /* Compute p_i as a regular (low-order) polynomial.  */
+      double p1 = -r;
+      double p2 = fma (P21, r2, -P20);
+      double p3 = -r * fma (P20, r2, -0.5);
+      double p4 = fma (fma (P42, r2, -P41), r2, P40);
+      double p5 = -r * fma (fma (P52, r2, -P51), r2, P50);
+      /* Compute p_i using recurrence relation:
+         p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}.  */
+      double p6 = fma (Q5 * r, p5, p4) * R5;
+      double p7 = fma (Q6 * r, p6, p5) * R6;
+      double p8 = fma (Q7 * r, p7, p6) * R7;
+      double p9 = fma (Q8 * r, p8, p7) * R8;
+      double p10 = fma (Q9 * r, p9, p8) * R9;
+      /* Compute polynomial in d using pairwise Horner scheme.  */
+      double p90 = fma (p10, d, p9);
+      double p78 = fma (p8, d, p7);
+      double p56 = fma (p6, d, p5);
+      double p34 = fma (p4, d, p3);
+      double p12 = fma (p2, d, p1);
+      double y = fma (p90, d2, p78);
+      y = fma (y, d2, p56);
+      y = fma (y, d2, p34);
+      y = fma (y, d2, p12);
+
+      y = fma (-fma (y, d2, d), scale, erfcr);
+
+      /* Handle sign and scale back in a single fma.  */
+      double off = asdouble (sign >> 1);
+      double fac = asdouble (asuint64 (0x1p-128) | sign);
+      y = fma (y, fac, off);
+
+      if (unlikely (x > 26.0))
+        {
+          /* The underflow exception needs to be signaled explicitly when
+             result gets into the subnormal range.  */
+          if (unlikely (y < 0x1p-1022))
+            force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+          /* Set errno to ERANGE if result rounds to 0.  */
+          return __math_check_uflow (y);
+        }
+
+      return y;
+    }
+  /* Above the threshold (x > 3487.0/128.0) erfc is constant and needs to
+     raise underflow exception for positive x.  */
+  return __math_uflow (0);
+}
+
+PL_SIG (S, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (erfc, 1.21)
+PL_TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000)
+PL_TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000)
+PL_TEST_INTERVAL (erfc, 28.0, inf, 40000)
+PL_TEST_INTERVAL (erfc, -6.0, -inf, 40000)
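Two bit-level tricks in erfc_1u8.c above deserve a note. Adding Shift = 0x1p45 to a = |x| makes the FPU round a to the nearest multiple of 2^45 * 2^-52 = 1/128, and because z and Shift then share the same exponent, the difference of their bit patterns is exactly the number of 1/128 steps, i.e. the table index. Likewise, since the table entries are pre-scaled by 2^128, the final fma multiplies by fac = plus or minus 0x1p-128 (built by OR-ing the sign bit into the bit pattern of 0x1p-128) and adds off = asdouble (sign >> 1), which is 0.0 for positive x and 2.0 for negative x, so rescaling, sign handling and the reflection erfc(-x) = 2 - erfc(x) all happen in one operation. Below is a minimal standalone demo of the index trick (illustrative only, not part of the patch; it assumes the default round-to-nearest mode).

/* shift_trick.c - demo of the 1/128 rounding/index trick (illustrative,
   not part of the patch).  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reinterpret a double as its IEEE-754 bit pattern.  */
static uint64_t
as_u64 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

int
main (void)
{
  const double Shift = 0x1p45;
  double a = 3.14159265;
  double z = a + Shift;                     /* Rounds a to a multiple of 1/128.  */
  uint64_t i = as_u64 (z) - as_u64 (Shift); /* Index = round (128 * a).  */
  double r = z - Shift;                     /* r = i / 128, exactly.  */
  printf ("a = %.8f  r = %.8f  i = %llu  i / 128 = %.8f\n",
          a, r, (unsigned long long) i, i / 128.0);
  return 0;
}

For a = 3.14159265 this prints i = 402 and r = 3.140625: the reduced argument never strays more than 1/256 from a, which keeps |d| small enough for the degree-10 expansion to reach the quoted 1.71 ULP bound.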
diff --git a/contrib/arm-optimized-routines/pl/math/erfc_4u5.c b/contrib/arm-optimized-routines/pl/math/erfc_4u5.c
deleted file mode 100644
index e9af9d3bcdb4..000000000000
--- a/contrib/arm-optimized-routines/pl/math/erfc_4u5.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Double-precision erfc(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-#include "pairwise_horner.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define AbsMask (0x7fffffffffffffff)
-
-#define xint __erfc_data.interval_bounds
-#define PX __erfc_data.poly
-
-/* Accurate exponential from optimized routines.  */
-double
-__exp_dd (double x, double xtail);
-
-static inline double
-eval_poly_horner (double z, int i)
-{
-  double z2 = z * z;
-#define C(j) PX[i][j]
-  return PAIRWISE_HORNER_12 (z, z2, C);
-}
-
-/* Accurate evaluation of exp(x^2)
-   using compensated product (x^2 ~ x*x + e2)
-   and the __exp_dd(y,d) routine, that is the
-   computation of exp(y+d) with a small correction d<<y.  */
-static inline double
-approx_erfc_hi (double x, int i)
-{
-  double a = fabs (x);
-  double z = a - xint[i];
-  double p = eval_poly_horner (z, i);
-  double e_mx2 = eval_accurate_gaussian (a);
-  return p * e_mx2;
-}
-
-static inline int
-get_itv_idx (double x)
-{
-  /* Interval bounds are a logarithmic scale, i.e. interval n has
-     lower bound 2^(n/4) - 1.  Use the exponent of (|x|+1)^4 to obtain
-     the interval index.  */
-  double a = asdouble (asuint64 (x) & AbsMask);
-  double z = a + 1.0;
-  z = z * z;
-  z = z * z;
-  return (asuint64 (z) >> 52) - 1023;
-}
-
-/* Approximation of erfc for |x| < 6.0.  */
-static inline double
-approx_erfc_lo (double x, uint32_t sign, int i)
-{
-  double a = fabs (x);
-  double z = a - xint[i];
-  double p = eval_poly_horner (z, i);
-  double e_mx2 = eval_accurate_gaussian (a);
-  if (sign)
-    return fma (-p, e_mx2, 2.0);
-  else
-    return p * e_mx2;
-}
-
-/* Top 12 bits of a double (sign and exponent bits).  */
-static inline uint32_t
-abstop12 (double x)
-{
-  return (asuint64 (x) >> 52) & 0x7ff;
-}
-
-/* Top 32 bits of a double.  */
-static inline uint32_t
-top32 (double x)
-{
-  return asuint64 (x) >> 32;
-}
-
-/* Fast erfc implementation.
-   The approximation uses polynomial approximation of
-   exp(x^2) * erfc(x) with fixed orders on 20 intervals.
-   Maximum measured error is 4.05 ULPs:
-   erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2
-                             want 0x1.ff84036f8f0b7p-2.  */
-double
-erfc (double x)
-{
-  /* Get top words.  */
-  uint32_t ix = top32 (x); /* We need to compare at most 32 bits.  */
-  uint32_t ia = ix & 0x7fffffff;
-  uint32_t sign = ix >> 31;
-
-  /* Handle special cases and small values with a single comparison:
-       abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
-     Special cases
-       erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2
-     Errno
-       EDOM does not have to be set in case of erfc(nan).
-       Only ERANGE may be set in case of underflow.
-     Small values (|x|<small)
-       |x|<0x1.0p-56 => accurate up to 0.5 ULP (top12(0x1p-56) = 0x3c7)
-       |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd).  */
-  if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd))
-    {
-      if (abstop12 (x) >= 0x7ff)
-        return (double) (sign << 1) + 1.0 / x; /* special cases.  */
-      else
-        return 1.0 - x; /* small case.  */
-    }
-  else if (ia < 0x40180000)
-    { /* |x| < 6.0.  */
-      return approx_erfc_lo (x, sign, get_itv_idx (x));
-    }
-  else if (sign)
-    { /* x <= -6.0.  */
-      return 2.0;
-    }
-  else if (ia < 0x403c0000)
-    { /* 6.0 <= x < 28.  */
-      return approx_erfc_hi (x, get_itv_idx (x));
-    }
-  else
-    { /* x > 28.  */
-      return __math_uflow (0);
-    }
-}
-
-PL_SIG (S, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (erfc, 3.56)
-PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (erfc, 0, inf, 40000)
diff --git a/contrib/arm-optimized-routines/pl/math/erfc_data.c b/contrib/arm-optimized-routines/pl/math/erfc_data.c
index fa7184fcc871..40f72a4d6d5b 100644
--- a/contrib/arm-optimized-routines/pl/math/erfc_data.c
+++ b/contrib/arm-optimized-routines/pl/math/erfc_data.c
@@ -1,145 +1,3507 @@
 /*
  * Data used in double-precision erfc(x) function.
  *
  * Copyright (c) 2019-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
 
-/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
-   precision.  Generated using the Remez algorithm on each interval separately
-   (see erfc.sollya for more detail).  */
+/* Lookup table used in erfc.
+   For each possible rounded input r (multiples of 1/128), between
+   r = 0.0 and r = ~27.0 (3488 values):
+   - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
+   - the second entry __erfc_data.tab.scale contains the values of
+     2/sqrt(pi)*exp(-r^2).  Both values may go into subnormal range, therefore
+     they are scaled by a large enough value, 2^128 (the exponent 128 fits
+     in 8 bits).  */
 const struct erfc_data __erfc_data = {
-
-/* Bounds for 20 intervals spanning [0x1.0p-50., 31.].
Interval bounds are a - logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the - exception of the first interval. */ -.interval_bounds = { - 0x1.0p-50, /* Tiny boundary. */ - 0x1.837f05c490126p-3, /* 0.189. */ - 0x1.a827997709f7ap-2, /* 0.414. */ - 0x1.5d13f326fe9c8p-1, /* 0.682. */ - 0x1.0p0, /* 1.000. */ - 0x1.60dfc14636e2ap0, /* 1.378. */ - 0x1.d413cccfe779ap0, /* 1.828. */ - 0x1.2e89f995ad3adp1, /* 2.364. */ - 0x1.8p1, /* 3.000. */ - 0x1.e0dfc14636e2ap1, /* 3.757. */ - 0x1.2a09e667f3bcdp2, /* 4.657. */ - 0x1.6e89f995ad3adp2, /* 5.727. */ - 0x1.cp2, /* 7.000. */ - 0x1.106fe0a31b715p3, /* 8.514. */ - 0x1.4a09e667f3bcdp3, /* 10.31. */ - 0x1.8e89f995ad3adp3, /* 12.45. */ - 0x1.ep3, /* 15.00. */ - 0x1.206fe0a31b715p4, /* 18.03. */ - 0x1.5a09e667f3bcdp4, /* 21.63. */ - 0x1.9e89f995ad3adp4, /* 25.91. */ - 0x1.fp4 /* 31.00. */ + .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, + { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, + { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, + { 0x1.f27640f9853d9p127, 0x1.20b4d8bac36c1p128 }, + { 0x1.edf3a9ba22dadp127, 0x1.209546ad13ccfp128 }, + { 0x1.e971a2c4436aep127, 0x1.206cb4897b148p128 }, + { 0x1.e4f05010eca8cp127, 0x1.203b261cd0053p128 }, + { 0x1.e06fd58842c7ep127, 0x1.2000a00ae3804p128 }, + { 0x1.dbf056fe2df35p127, 0x1.1fbd27cdc72d3p128 }, + { 0x1.d771f82f02f4ep127, 0x1.1f70c3b4f2cc8p128 }, + { 0x1.d2f4dcbc2f894p127, 0x1.1f1b7ae44867fp128 }, + { 0x1.ce792828eae5cp127, 0x1.1ebd5552f795bp128 }, + { 0x1.c9fefdd6eaf19p127, 0x1.1e565bca400d4p128 }, + { 0x1.c58681031eb6ap127, 0x1.1de697e413d29p128 }, + { 0x1.c10fd4c26e896p127, 0x1.1d6e14099944ap128 }, + { 0x1.bc9b1bfe82687p127, 0x1.1cecdb718d61cp128 }, + { 0x1.b82879728f11ep127, 0x1.1c62fa1e869b6p128 }, + { 0x1.b3b80fa82a4bbp127, 0x1.1bd07cdd189acp128 }, + { 0x1.af4a00f426daap127, 0x1.1b357141d95d5p128 }, + { 0x1.aade6f7378a0ep127, 0x1.1a91e5a748165p128 }, + { 0x1.a6757d08215d8p127, 0x1.19e5e92b964abp128 }, + { 0x1.a20f4b5626818p127, 0x1.19318bae53a04p128 }, + { 0x1.9dabfbc090901p127, 0x1.1874ddcdfce24p128 }, + { 0x1.994baf66747adp127, 0x1.17aff0e56ec1p128 }, + { 0x1.94ee8720076b6p127, 0x1.16e2d7093cd8cp128 }, + { 0x1.9094a37bbd66ep127, 0x1.160da304ed92fp128 }, + { 0x1.8c3e24bb73372p127, 0x1.153068581b781p128 }, + { 0x1.87eb2ad1a4032p127, 0x1.144b3b337c90cp128 }, + { 0x1.839bd55eaafc8p127, 0x1.135e3075d076bp128 }, + { 0x1.7f5043ae11862p127, 0x1.12695da8b5bdep128 }, + { 0x1.7b0894b3ea35cp127, 0x1.116cd8fd67618p128 }, + { 0x1.76c4e70a390e7p127, 0x1.1068b94962e5ep128 }, + { 0x1.728558ee694fcp127, 0x1.0f5d1602f7e41p128 }, + { 0x1.6e4a083ed132fp127, 0x1.0e4a073dc1b91p128 }, + { 0x1.6a13127843ec1p127, 0x1.0d2fa5a70c168p128 }, + { 0x1.65e094b3b2413p127, 0x1.0c0e0a8223359p128 }, + { 0x1.61b2aba3da093p127, 0x1.0ae54fa490723p128 }, + { 0x1.5d89739304dcfp127, 0x1.09b58f724416bp128 }, + { 0x1.59650860d6469p127, 0x1.087ee4d9ad247p128 }, + { 0x1.5545858029b39p127, 0x1.07416b4fbfe7cp128 }, + { 0x1.512b05f5006e1p127, 0x1.05fd3ecbec298p128 }, + { 0x1.4d15a4527fdc7p127, 0x1.04b27bc403d3p128 }, + { 0x1.49057ab900447p127, 0x1.03613f2812dafp128 }, + { 0x1.44faa2d42c4ap127, 0x1.0209a65e29545p128 }, + { 0x1.40f535d93160ep127, 0x1.00abcf3e187a9p128 }, + { 0x1.3cf54c850162p127, 0x1.fe8fb01a47307p127 }, + { 0x1.38faff1aa574ap127, 0x1.fbbbbef34b4b2p127 }, + { 0x1.35066561a275dp127, 0x1.f8dc092d58ff8p127 }, + { 0x1.311796a46f064p127, 0x1.f5f0cdaf15313p127 }, + { 0x1.2d2ea9aefb636p127, 0x1.f2fa4c16c0019p127 }, + { 0x1.294bb4cd4b2bdp127, 0x1.eff8c4b1375dbp127 }, + { 0x1.256ecdca212ccp127, 0x1.ecec7870ebca8p127 }, + { 
0x1.219809edbd524p127, 0x1.e9d5a8e4c934ep127 }, + { 0x1.1dc77dfcacd02p127, 0x1.e6b4982f158b9p127 }, + { 0x1.19fd3e36ac96ap127, 0x1.e38988fc46e72p127 }, + { 0x1.16395e559e218p127, 0x1.e054be79d3042p127 }, + { 0x1.127bf18c8eadcp127, 0x1.dd167c4cf9d2ap127 }, + { 0x1.0ec50a86d0dd4p127, 0x1.d9cf06898cdafp127 }, + { 0x1.0b14bb6728cd8p127, 0x1.d67ea1a8b5368p127 }, + { 0x1.076b15c70aa28p127, 0x1.d325927fb9d89p127 }, + { 0x1.03c82ab5eb831p127, 0x1.cfc41e36c7df9p127 }, + { 0x1.002c0ab8a5018p127, 0x1.cc5a8a3fbea4p127 }, + { 0x1.f92d8b91d5cc7p126, 0x1.c8e91c4d01368p127 }, + { 0x1.f210d6a9a6a31p126, 0x1.c5701a484ef9dp127 }, + { 0x1.eb02147ce245cp126, 0x1.c1efca49a5011p127 }, + { 0x1.e40161b701275p126, 0x1.be68728e29d5ep127 }, + { 0x1.dd0ed9ea4bdd6p126, 0x1.bada596f25436p127 }, + { 0x1.d62a978f7c957p126, 0x1.b745c55905bf8p127 }, + { 0x1.cf54b4058455fp126, 0x1.b3aafcc27502ep127 }, + { 0x1.c88d479173ccep126, 0x1.b00a46237d5bep127 }, + { 0x1.c1d4695e87644p126, 0x1.ac63e7ecc1411p127 }, + { 0x1.bb2a2f7e5652p126, 0x1.a8b8287ec6a09p127 }, + { 0x1.b48eaee924501p126, 0x1.a5074e215762p127 }, + { 0x1.ae01fb7e55a66p126, 0x1.a1519efaf889ep127 }, + { 0x1.a78428050527ep126, 0x1.9d97610879642p127 }, + { 0x1.a115462cbbc17p126, 0x1.99d8da149c13fp127 }, + { 0x1.9ab5668e4930ap126, 0x1.96164fafd8de3p127 }, + { 0x1.946498acbd766p126, 0x1.925007283d7aap127 }, + { 0x1.8e22eaf68291ep126, 0x1.8e86458169af8p127 }, + { 0x1.87f06ac6960c4p126, 0x1.8ab94f6caa71dp127 }, + { 0x1.81cd2465e1d96p126, 0x1.86e9694134b9ep127 }, + { 0x1.7bb9230cb40b4p126, 0x1.8316d6f48133dp127 }, + { 0x1.75b470e454d35p126, 0x1.7f41dc12c9e89p127 }, + { 0x1.6fbf1708ba47cp126, 0x1.7b6abbb7aaf19p127 }, + { 0x1.69d91d8a595dap126, 0x1.7791b886e7403p127 }, + { 0x1.64028b7013867p126, 0x1.73b714a552763p127 }, + { 0x1.5e3b66b9405a9p126, 0x1.6fdb11b1e0c34p127 }, + { 0x1.5883b45fd2b63p126, 0x1.6bfdf0beddaf5p127 }, + { 0x1.52db785a98acap126, 0x1.681ff24b4ab04p127 }, + { 0x1.4d42b59f95afap126, 0x1.6441563c665d4p127 }, + { 0x1.47b96e267647ap126, 0x1.60625bd75d07bp127 }, + { 0x1.423fa2eb1cb59p126, 0x1.5c8341bb23767p127 }, + { 0x1.3cd553f045d45p126, 0x1.58a445da7c74cp127 }, + { 0x1.377a8042458d1p126, 0x1.54c5a57629dbp127 }, + { 0x1.322f25f9da2fdp126, 0x1.50e79d1749ac9p127 }, + { 0x1.2cf3423f15fdfp126, 0x1.4d0a6889dfd9fp127 }, + { 0x1.27c6d14c5e341p126, 0x1.492e42d78d2c5p127 }, + { 0x1.22a9ce717edcbp126, 0x1.4553664273d24p127 }, + { 0x1.1d9c3416d2b4bp126, 0x1.417a0c4049fdp127 }, + { 0x1.189dfbc07e69p126, 0x1.3da26d759aef5p127 }, + { 0x1.13af1e11be721p126, 0x1.39ccc1b136d5ap127 }, + { 0x1.0ecf92d046d22p126, 0x1.35f93fe7d1b3dp127 }, + { 0x1.09ff50e7b3f93p126, 0x1.32281e2fd1a92p127 }, + { 0x1.053e4e6d0c10bp126, 0x1.2e5991bd4cbfcp127 }, + { 0x1.008c80a24ff1p126, 0x1.2a8dcede3673bp127 }, + { 0x1.f7d3b7f436013p125, 0x1.26c508f6bd0ffp127 }, + { 0x1.eeaca836a27ccp125, 0x1.22ff727dd6f7bp127 }, + { 0x1.e5a3b7c9b56dap125, 0x1.1f3d3cf9ffe5ap127 }, + { 0x1.dcb8cae2d747fp125, 0x1.1b7e98fe26217p127 }, + { 0x1.d3ebc436b0f26p125, 0x1.17c3b626c7a12p127 }, + { 0x1.cb3c8500ea349p125, 0x1.140cc3173f007p127 }, + { 0x1.c2aaed0bfcfeep125, 0x1.1059ed7740313p127 }, + { 0x1.ba36dab91c0e9p125, 0x1.0cab61f084b93p127 }, + { 0x1.b1e02b082b72p125, 0x1.09014c2ca74dap127 }, + { 0x1.a9a6b99fc973bp125, 0x1.055bd6d32e8d7p127 }, + { 0x1.a18a60d56673ep125, 0x1.01bb2b87c6968p127 }, + { 0x1.998af9b56a3aep125, 0x1.fc3ee5d1524bp126 }, + { 0x1.91a85c0b65519p125, 0x1.f511a91a67d2ap126 }, + { 0x1.89e25e6a4cef9p125, 0x1.edeeee0959518p126 }, + { 0x1.8238d634c0127p125, 0x1.e6d6ffaa65a25p126 }, + { 0x1.7aab97a554544p125, 
0x1.dfca26f5bbf88p126 }, + { 0x1.733a75d6e91b8p125, 0x1.d8c8aace11e63p126 }, + { 0x1.6be542ccffc2fp125, 0x1.d1d2cfff91594p126 }, + { 0x1.64abcf7c175b4p125, 0x1.cae8d93f1d7b7p126 }, + { 0x1.5d8debd20aacep125, 0x1.c40b0729ed548p126 }, + { 0x1.568b66be6f268p125, 0x1.bd3998457afdbp126 }, + { 0x1.4fa40e3af3674p125, 0x1.b674c8ffc6283p126 }, + { 0x1.48d7af53bc19fp125, 0x1.afbcd3afe8ab6p126 }, + { 0x1.4226162fbddd5p125, 0x1.a911f096fbc26p126 }, + { 0x1.3b8f0e1912f7p125, 0x1.a27455e14c93cp126 }, + { 0x1.351261854b991p125, 0x1.9be437a7de946p126 }, + { 0x1.2eafda1db784ap125, 0x1.9561c7f23a47bp126 }, + { 0x1.286740c7a7dabp125, 0x1.8eed36b886d93p126 }, + { 0x1.22385daca7f47p125, 0x1.8886b1e5ecfd1p126 }, + { 0x1.1c22f842ac1f2p125, 0x1.822e655b417e7p126 }, + { 0x1.1626d7543522p125, 0x1.7be47af1f5d89p126 }, + { 0x1.1043c1086777dp125, 0x1.75a91a7f4d2edp126 }, + { 0x1.0a797aeb152f2p125, 0x1.6f7c69d7d3ef8p126 }, + { 0x1.04c7c9f4b969p125, 0x1.695e8cd31867ep126 }, + { 0x1.fe5ce524c8ee5p124, 0x1.634fa54fa285fp126 }, + { 0x1.f35a715b2f3e1p124, 0x1.5d4fd33729015p126 }, + { 0x1.e887bf681f218p124, 0x1.575f3483021c3p126 }, + { 0x1.dde4553ef94dep124, 0x1.517de540ce2a3p126 }, + { 0x1.d36fb7fa50177p124, 0x1.4babff975a04cp126 }, + { 0x1.c9296beb09cf1p124, 0x1.45e99bcbb7915p126 }, + { 0x1.bf10f4a759889p124, 0x1.4036d0468a7a2p126 }, + { 0x1.b525d5198cb1cp124, 0x1.3a93b1998736cp126 }, + { 0x1.ab678f8eabedbp124, 0x1.35005285227f1p126 }, + { 0x1.a1d5a5c4edb96p124, 0x1.2f7cc3fe6f423p126 }, + { 0x1.986f98f9f96c8p124, 0x1.2a09153529381p126 }, + { 0x1.8f34e9f8f93a6p124, 0x1.24a55399ea239p126 }, + { 0x1.8625192879e39p124, 0x1.1f518ae487dc8p126 }, + { 0x1.7d3fa69816db5p124, 0x1.1a0dc51a9934dp126 }, + { 0x1.7484120df1b01p124, 0x1.14da0a961fd14p126 }, + { 0x1.6bf1db13f3983p124, 0x1.0fb6620c550afp126 }, + { 0x1.63888104d811ap124, 0x1.0aa2d09497f2bp126 }, + { 0x1.5b478318ff939p124, 0x1.059f59af7a906p126 }, + { 0x1.532e6073095f2p124, 0x1.00abff4dec7a3p126 }, + { 0x1.4b3c982c338c7p124, 0x1.f79183b101c5bp125 }, + { 0x1.4371a960807f8p124, 0x1.edeb406d9c825p125 }, + { 0x1.3bcd133aa0ffcp124, 0x1.e4652fadcb6b2p125 }, + { 0x1.344e54ffa23b9p124, 0x1.daff4969c0b04p125 }, + { 0x1.2cf4ee1a5f0fcp124, 0x1.d1b982c50137p125 }, + { 0x1.25c05e26b3f99p124, 0x1.c893ce1dcbef7p125 }, + { 0x1.1eb024fc75285p124, 0x1.bf8e1b1ca2279p125 }, + { 0x1.17c3c2ba26319p124, 0x1.b6a856c3ed54fp125 }, + { 0x1.10fab7cf72f94p124, 0x1.ade26b7fbed95p125 }, + { 0x1.0a548507696cp124, 0x1.a53c4135a6526p125 }, + { 0x1.03d0ab9273b94p124, 0x1.9cb5bd549b111p125 }, + { 0x1.fadd5a20258d3p123, 0x1.944ec2e4f563p125 }, + { 0x1.ee5c1730b147cp123, 0x1.8c07329874652p125 }, + { 0x1.e21c938a45a83p123, 0x1.83deeada4d25ap125 }, + { 0x1.d61dd57628999p123, 0x1.7bd5c7df3fe9cp125 }, + { 0x1.ca5ee4649e31fp123, 0x1.73eba3b5b07b7p125 }, + { 0x1.bedec8fddb34p123, 0x1.6c205655be72p125 }, + { 0x1.b39c8d3276d8ap123, 0x1.6473b5b15a7a1p125 }, + { 0x1.a8973c4b5c03ep123, 0x1.5ce595c455b0ap125 }, + { 0x1.9dcde2f93a207p123, 0x1.5575c8a468362p125 }, + { 0x1.933f8f6375f2cp123, 0x1.4e241e912c305p125 }, + { 0x1.88eb51369acb9p123, 0x1.46f066040a832p125 }, + { 0x1.7ed039b24c96bp123, 0x1.3fda6bc016994p125 }, + { 0x1.74ed5bb6bb581p123, 0x1.38e1fae1d6a9dp125 }, + { 0x1.6b41cbd198bc8p123, 0x1.3206dceef5f87p125 }, + { 0x1.61cca04a90795p123, 0x1.2b48d9e5dea1cp125 }, + { 0x1.588cf12f4446bp123, 0x1.24a7b84d38971p125 }, + { 0x1.4f81d85ecc55bp123, 0x1.1e233d434b813p125 }, + { 0x1.46aa7194bd324p123, 0x1.17bb2c8d41535p125 }, + { 0x1.3e05da73b4159p123, 0x1.116f48a6476ccp125 }, + { 0x1.3593328f6abbep123, 0x1.0b3f52ce8c383p125 }, + 
{ 0x1.2d519b7653e1ep123, 0x1.052b0b1a174eap125 }, + { 0x1.254038bac19d6p123, 0x1.fe6460fef468p124 }, + { 0x1.1d5e2ffb96d4p123, 0x1.f2a901ccafb37p124 }, + { 0x1.15aaa8ec85205p123, 0x1.e723726b824a9p124 }, + { 0x1.0e24cd5dd8846p123, 0x1.dbd32ac4c99bp124 }, + { 0x1.06cbc943d255ap123, 0x1.d0b7a0f921e7cp124 }, + { 0x1.ff3d957b29b39p122, 0x1.c5d0497c09e74p124 }, + { 0x1.f13a043742333p122, 0x1.bb1c972f23e5p124 }, + { 0x1.e38b43cbd0f0fp122, 0x1.b09bfb7d11a84p124 }, + { 0x1.d62fbdc2e756bp122, 0x1.a64de673e8837p124 }, + { 0x1.c925e02b41668p122, 0x1.9c31c6df3b1b8p124 }, + { 0x1.bc6c1da1f3121p122, 0x1.92470a61b6965p124 }, + { 0x1.b000ed5b4a626p122, 0x1.888d1d8e510a3p124 }, + { 0x1.a3e2cb2ae9edbp122, 0x1.7f036c0107294p124 }, + { 0x1.9810378b1f299p122, 0x1.75a96077274bap124 }, + { 0x1.8c87b7a37834fp122, 0x1.6c7e64e7281cbp124 }, + { 0x1.8147d54e9cc33p122, 0x1.6381e2980956bp124 }, + { 0x1.764f1f1f6ddeap122, 0x1.5ab342383d178p124 }, + { 0x1.6b9c28657041ap122, 0x1.5211ebf41880bp124 }, + { 0x1.612d893085125p122, 0x1.499d478bca735p124 }, + { 0x1.5701de53f4d2ep122, 0x1.4154bc68d75c3p124 }, + { 0x1.4d17c968d062bp122, 0x1.3937b1b31925ap124 }, + { 0x1.436df0cfabf1dp122, 0x1.31458e6542847p124 }, + { 0x1.3a02ffb1b7ceep122, 0x1.297db960e4f63p124 }, + { 0x1.30d5a6013afc5p122, 0x1.21df9981f8e53p124 }, + { 0x1.27e49879737d3p122, 0x1.1a6a95b1e786fp124 }, + { 0x1.1f2e909de04d2p122, 0x1.131e14fa1625dp124 }, + { 0x1.16b24cb8f8f92p122, 0x1.0bf97e95f2a64p124 }, + { 0x1.0e6e8fda56cf7p122, 0x1.04fc3a0481321p124 }, + { 0x1.066221d4539d8p122, 0x1.fc4b5e32d6259p123 }, + { 0x1.fd179e7243e3cp121, 0x1.eeea8c1b1db94p123 }, + { 0x1.edd4d2aec5adbp121, 0x1.e1d4cf1e2450ap123 }, + { 0x1.def98c6c79efap121, 0x1.d508f9a1ea64fp123 }, + { 0x1.d0838121f2418p121, 0x1.c885df3451a07p123 }, + { 0x1.c2706fa45005ep121, 0x1.bc4a54a84e834p123 }, + { 0x1.b4be201caa4b4p121, 0x1.b055303221015p123 }, + { 0x1.a76a63fc95c79p121, 0x1.a4a549829587ep123 }, + { 0x1.9a7315f1d6a55p121, 0x1.993979e14fffep123 }, + { 0x1.8dd619d943ca1p121, 0x1.8e109c4622913p123 }, + { 0x1.81915cb0e3323p121, 0x1.83298d717210ep123 }, + { 0x1.75a2d48946eb1p121, 0x1.78832c03aa2b1p123 }, + { 0x1.6a08807632262p121, 0x1.6e1c5893c380bp123 }, + { 0x1.5ec0687e8dcb2p121, 0x1.63f3f5c4de13bp123 }, + { 0x1.53c89d8bb3ddbp121, 0x1.5a08e85af27ep123 }, + { 0x1.491f395818f54p121, 0x1.505a174e9c929p123 }, + { 0x1.3ec25e5d5af12p121, 0x1.46e66be00224p123 }, + { 0x1.34b037c1bbfc5p121, 0x1.3dacd1a8d8ccep123 }, + { 0x1.2ae6f94510dd8p121, 0x1.34ac36ad8dafep123 }, + { 0x1.2164df2d29765p121, 0x1.2be38b6d92415p123 }, + { 0x1.18282e31ba3e8p121, 0x1.2351c2f2d1449p123 }, + { 0x1.0f2f3367cd6aap121, 0x1.1af5d2e04f3f6p123 }, + { 0x1.0678442cc256fp121, 0x1.12ceb37ff9bc3p123 }, + { 0x1.fc037c21c3622p120, 0x1.0adb5fcfa8c75p123 }, + { 0x1.eb940d8319831p120, 0x1.031ad58d56279p123 }, + { 0x1.db9f17e61c31p120, 0x1.f7182a851bca2p122 }, + { 0x1.cc218694238a2p120, 0x1.e85c449e377f3p122 }, + { 0x1.bd18548996419p120, 0x1.da0005e5f28dfp122 }, + { 0x1.ae808c479c371p120, 0x1.cc0180af00a8bp122 }, + { 0x1.a05747a543aa7p120, 0x1.be5ecd2fcb5f9p122 }, + { 0x1.9299afa0246a6p120, 0x1.b1160991ff737p122 }, + { 0x1.8544fc2c8c1dap120, 0x1.a4255a00b9f03p122 }, + { 0x1.785674053e8b9p120, 0x1.978ae8b55ce1bp122 }, + { 0x1.6bcb6c7ad4854p120, 0x1.8b44e6031383ep122 }, + { 0x1.5fa14942c3d54p120, 0x1.7f5188610ddc8p122 }, + { 0x1.53d57c461a5a7p120, 0x1.73af0c737bb45p122 }, + { 0x1.4865856ff632ap120, 0x1.685bb5134ef13p122 }, + { 0x1.3d4ef27bc49a6p120, 0x1.5d55cb54cd53ap122 }, + { 0x1.328f5ec350e67p120, 0x1.529b9e8cf9a1ep122 }, + { 0x1.2824730cacbb4p120, 
0x1.482b8455dc491p122 }, + { 0x1.1e0be557fa673p120, 0x1.3e03d891b37dep122 }, + { 0x1.144378ad22027p120, 0x1.3422fd6d12e2bp122 }, + { 0x1.0ac8fce979b96p120, 0x1.2a875b5ffab56p122 }, + { 0x1.019a4e8d69649p120, 0x1.212f612dee7fbp122 }, + { 0x1.f16aad1422a55p119, 0x1.181983e5133ddp122 }, + { 0x1.e030141df7d25p119, 0x1.0f443edc5ce49p122 }, + { 0x1.cf80d4afc3019p119, 0x1.06ae13b0d3255p122 }, + { 0x1.bf5908f50b4ap119, 0x1.fcab1483ea7fcp121 }, + { 0x1.afb4e269693dfp119, 0x1.ec72615a894c4p121 }, + { 0x1.a090a974cfebep119, 0x1.dcaf3691fc448p121 }, + { 0x1.91e8bd0830a74p119, 0x1.cd5ec93c12432p121 }, + { 0x1.83b9923a85f7bp119, 0x1.be7e5ac24963bp121 }, + { 0x1.75ffb3e6519ap119, 0x1.b00b38d6b3575p121 }, + { 0x1.68b7c2479902dp119, 0x1.a202bd6372dcep121 }, + { 0x1.5bde729a6b60fp119, 0x1.94624e78e0fafp121 }, + { 0x1.4f708eb9fba63p119, 0x1.87275e3a6869ep121 }, + { 0x1.436af4c058acbp119, 0x1.7a4f6aca256cbp121 }, + { 0x1.37ca96a6cd1d4p119, 0x1.6dd7fe335823p121 }, + { 0x1.2c8c79e6f04a3p119, 0x1.61beae53b72b7p121 }, + { 0x1.21adb71c70c75p119, 0x1.56011cc3b036dp121 }, + { 0x1.172b79a7a1181p119, 0x1.4a9cf6bda3f4cp121 }, + { 0x1.0d02ff50ce651p119, 0x1.3f8ff5042a88ep121 }, + { 0x1.033197ec68c0ep119, 0x1.34d7dbc76d7e5p121 }, + { 0x1.f3694a0008381p118, 0x1.2a727a89a3f14p121 }, + { 0x1.e11332d0714c5p118, 0x1.205dac02bd6b9p121 }, + { 0x1.cf5bf1fed1e7p118, 0x1.1697560347b26p121 }, + { 0x1.be3eb08ae7c2p118, 0x1.0d1d69569b82dp121 }, + { 0x1.adb6b810af9e2p118, 0x1.03ede1a45bfeep121 }, + { 0x1.9dbf721b98dfap118, 0x1.f60d8aa2a88f2p120 }, + { 0x1.8e54677bb0151p118, 0x1.e4cc4abf7d065p120 }, + { 0x1.7f713f9cc9784p118, 0x1.d4143a9dfe965p120 }, + { 0x1.7111bfdfb3cep118, 0x1.c3e1a5f5c077cp120 }, + { 0x1.6331caf57b5dbp118, 0x1.b430ecf4a83a8p120 }, + { 0x1.55cd603cc415p118, 0x1.a4fe83fb9db25p120 }, + { 0x1.48e09b21414bfp118, 0x1.9646f35a76624p120 }, + { 0x1.3c67b27d50fe7p118, 0x1.8806d70b2fc36p120 }, + { 0x1.305ef7fdbfb95p118, 0x1.7a3ade6c8b3e5p120 }, + { 0x1.24c2d787b9e37p118, 0x1.6cdfcbfc1e263p120 }, + { 0x1.198fd6a0ee7bdp118, 0x1.5ff2750fe782p120 }, + { 0x1.0ec293d9e6d85p118, 0x1.536fc18f7ce5cp120 }, + { 0x1.0457c63a9669p118, 0x1.4754abacdf1dcp120 }, + { 0x1.f49879624a021p117, 0x1.3b9e3f9d06e3fp120 }, + { 0x1.e139bb05eb49ep117, 0x1.30499b503957fp120 }, + { 0x1.ce8d4b7fd6c7p117, 0x1.2553ee2a336bfp120 }, + { 0x1.bc8d516fda8bap117, 0x1.1aba78ba3af89p120 }, + { 0x1.ab341ee553e25p117, 0x1.107a8c7323a6ep120 }, + { 0x1.9a7c305336484p117, 0x1.06918b6355624p120 }, + { 0x1.8a602b88919cp117, 0x1.f9f9cfd9c3035p119 }, + { 0x1.7adadead962edp117, 0x1.e77448fb66bb9p119 }, + { 0x1.6be73f45149fbp117, 0x1.d58da68fd117p119 }, + { 0x1.5d80693276a6dp117, 0x1.c4412bf4b8f0bp119 }, + { 0x1.4fa19dc42d409p117, 0x1.b38a3af2e55b4p119 }, + { 0x1.424642c28ff75p117, 0x1.a3645330550ffp119 }, + { 0x1.3569e18328604p117, 0x1.93cb11a30d765p119 }, + { 0x1.29082600643fdp117, 0x1.84ba3004a50dp119 }, + { 0x1.1d1cddf5a82dep117, 0x1.762d84469c18fp119 }, + { 0x1.11a3f7ffbbfeap117, 0x1.6821000795a03p119 }, + { 0x1.069982c189a9ep117, 0x1.5a90b00981d93p119 }, + { 0x1.f7f3581a4dc2cp116, 0x1.4d78bba8ca5fdp119 }, + { 0x1.e381802242163p116, 0x1.40d564548fad7p119 }, + { 0x1.cfd6511405b2dp116, 0x1.34a305080681fp119 }, + { 0x1.bcead7f01492fp116, 0x1.28de11c5031ebp119 }, + { 0x1.aab859b20ac9ep116, 0x1.1d83170fbf6fbp119 }, + { 0x1.993851cc9779ap116, 0x1.128eb96be8798p119 }, + { 0x1.886470ad946a7p116, 0x1.07fdb4dafea5fp119 }, + { 0x1.78369a4a2cbd6p116, 0x1.fb99b8b8279e1p118 }, + { 0x1.68a8e4b2fc8c2p116, 0x1.e7f232d9e263p118 }, + { 0x1.59b596b012aaap116, 0x1.d4fed7195d7e8p118 }, + { 
0x1.4b572664bd2dcp116, 0x1.c2b9cf7f893bfp118 }, + { 0x1.3d8837fb08d1dp116, 0x1.b11d702b3deb2p118 }, + { 0x1.30439c56dadf6p116, 0x1.a024365f771bdp118 }, + { 0x1.23844fd08cb93p116, 0x1.8fc8c794b03b5p118 }, + { 0x1.174578f6efd5dp116, 0x1.8005f08d6f1efp118 }, + { 0x1.0b826758a086bp116, 0x1.70d6a46e07ddap118 }, + { 0x1.003692548d98bp116, 0x1.6235fbd7a4345p118 }, + { 0x1.eabb2fe335196p115, 0x1.541f340697987p118 }, + { 0x1.d5e6777a83c2ap115, 0x1.468dadf4080abp118 }, + { 0x1.c1e6cb6239574p115, 0x1.397ced7af2b15p118 }, + { 0x1.aeb4423e690e7p115, 0x1.2ce898809244ep118 }, + { 0x1.9c47374a0974ep115, 0x1.20cc76202c5fbp118 }, + { 0x1.8a98484a1e8d3p115, 0x1.15246dda49d47p118 }, + { 0x1.79a0538dd4fc7p115, 0x1.09ec86c75d497p118 }, + { 0x1.695875fb574ap115, 0x1.fe41cd9bb4eeep117 }, + { 0x1.59ba0929261c5p115, 0x1.e97ba3b77f306p117 }, + { 0x1.4abea183bc47p115, 0x1.d57f524723822p117 }, + { 0x1.3c600c7f477c5p115, 0x1.c245d4b99847ap117 }, + { 0x1.2e984ed53e777p115, 0x1.afc85e0f82e12p117 }, + { 0x1.2161a2cd9d894p115, 0x1.9e005769dbc1dp117 }, + { 0x1.14b67693928cfp115, 0x1.8ce75e9f6f8ap117 }, + { 0x1.08916a956172p115, 0x1.7c7744d9378f7p117 }, + { 0x1.f9da9fde95755p114, 0x1.6caa0d3582fe9p117 }, + { 0x1.e38a4dc27b11bp114, 0x1.5d79eb71e893bp117 }, + { 0x1.ce283a9e3e33p114, 0x1.4ee1429bf7ccp117 }, + { 0x1.b9ab1a96e3b3ep114, 0x1.40daa3c89f5b6p117 }, + { 0x1.a609f7584d32bp114, 0x1.3360ccd23db3ap117 }, + { 0x1.933c2d52c56c9p114, 0x1.266ea71d4f71ap117 }, + { 0x1.8139690c0d187p114, 0x1.19ff4663ae9dfp117 }, + { 0x1.6ff9a4837fa43p114, 0x1.0e0de78654d1ep117 }, + { 0x1.5f7524a8e81a2p114, 0x1.0295ef6591848p117 }, + { 0x1.4fa476e59f668p114, 0x1.ef25d37f49fe1p116 }, + { 0x1.40806eb78e353p114, 0x1.da01102b5f851p116 }, + { 0x1.3202235dada5p114, 0x1.c5b5412dcafadp116 }, + { 0x1.2422ed95a3235p114, 0x1.b23a5a23e421p116 }, + { 0x1.16dc656a14df6p114, 0x1.9f8893d8fd1c1p116 }, + { 0x1.0a2860115569cp114, 0x1.8d986a4187285p116 }, + { 0x1.fc01dbb80c841p113, 0x1.7c629a822bc9ep116 }, + { 0x1.e4c0b066a497p113, 0x1.6be02102b352p116 }, + { 0x1.ce823f4cc4badp113, 0x1.5c0a378c90bcap116 }, + { 0x1.b93bf40d5eccbp113, 0x1.4cda5374ea275p116 }, + { 0x1.a4e3a125adc76p113, 0x1.3e4a23d1f4703p116 }, + { 0x1.916f7c5f2f764p113, 0x1.30538fbb77ecdp116 }, + { 0x1.7ed61b5d3db0ap113, 0x1.22f0b496539bep116 }, + { 0x1.6d0e7045988cbp113, 0x1.161be46ad3b5p116 }, + { 0x1.5c0fc68335b0cp113, 0x1.09cfa445b00ffp116 }, + { 0x1.4bd1bfa2aba3dp113, 0x1.fc0d55470cf51p115 }, + { 0x1.3c4c504792bf8p113, 0x1.e577bbcd49935p115 }, + { 0x1.2d77bd3a382bcp113, 0x1.cfd4a5adec5cp115 }, + { 0x1.1f4c988d02149p113, 0x1.bb1a9657ce465p115 }, + { 0x1.11c3bed8e716ap113, 0x1.a740684026555p115 }, + { 0x1.04d654905dadp113, 0x1.943d4a1d1ed39p115 }, + { 0x1.f0fb86d056745p112, 0x1.8208bc334a6a5p115 }, + { 0x1.d9676faafa27fp112, 0x1.709a8db59f25cp115 }, + { 0x1.c2e43d417197bp112, 0x1.5feada379d8b7p115 }, + { 0x1.ad664518e771bp112, 0x1.4ff207314a102p115 }, + { 0x1.98e25420092dap112, 0x1.40a8c1949f75ep115 }, + { 0x1.854daa4a49b0fp112, 0x1.3207fb7420eb9p115 }, + { 0x1.729df6503422ap112, 0x1.2408e9ba3327fp115 }, + { 0x1.60c95193c542dp112, 0x1.16a501f0e42cap115 }, + { 0x1.4fc63c27c71aep112, 0x1.09d5f819c9e29p115 }, + { 0x1.3f8b98f93052ap112, 0x1.fb2b792b40a22p114 }, + { 0x1.3010aa198de78p112, 0x1.e3bcf436a1a95p114 }, + { 0x1.214d0d298365p112, 0x1.cd55277c18d05p114 }, + { 0x1.1338b7e273194p112, 0x1.b7e94604479dcp114 }, + { 0x1.05cbf4be650abp112, 0x1.a36eec00926ddp114 }, + { 0x1.f1febf7a916aap111, 0x1.8fdc1b2dcf7b9p114 }, + { 0x1.d997c68d65936p111, 0x1.7d2737527c3f9p114 }, + { 0x1.c2556a4e7a90fp111, 
0x1.6b4702d7d5849p114 }, + { 0x1.ac2aa7516ade4p111, 0x1.5a329b7d30748p114 }, + { 0x1.970b05888fda2p111, 0x1.49e17724f4d41p114 }, + { 0x1.82ea92dbc1a27p111, 0x1.3a4b60ba9aa4ep114 }, + { 0x1.6fbdddeff308fp111, 0x1.2b6875310f785p114 }, + { 0x1.5d79f11e27f6bp111, 0x1.1d312098e9dbap114 }, + { 0x1.4c144d984e1b8p111, 0x1.0f9e1b4dd36dfp114 }, + { 0x1.3b82e6ba892a4p111, 0x1.02a8673a94692p114 }, + { 0x1.2bbc1d878d272p111, 0x1.ec929a665b449p113 }, + { 0x1.1cb6bc4eaa678p111, 0x1.d4f4b4c8e09edp113 }, + { 0x1.0e69f27a37df3p111, 0x1.be6abbb10a5aap113 }, + { 0x1.00cd508511266p111, 0x1.a8e8cc1fadef6p113 }, + { 0x1.e7b1882bccac5p110, 0x1.94637d5bacfdbp113 }, + { 0x1.cf09287e48bb9p110, 0x1.80cfdc72220cfp113 }, + { 0x1.b792bbc489b04p110, 0x1.6e2367dc27f95p113 }, + { 0x1.a140206ab945p110, 0x1.5c540b4936fd2p113 }, + { 0x1.8c03d2d39119bp110, 0x1.4b581b8d170fcp113 }, + { 0x1.77d0e6e5bed21p110, 0x1.3b2652b06c2b2p113 }, + { 0x1.649b01d73110ap110, 0x1.2bb5cc22e5db6p113 }, + { 0x1.525654343aad2p110, 0x1.1cfe010e2052dp113 }, + { 0x1.40f79420887c7p110, 0x1.0ef6c4c84a0fep113 }, + { 0x1.3073f7cff4a85p110, 0x1.01984165a5f36p113 }, + { 0x1.20c1303550f0ep110, 0x1.e9b5e8d00ce77p112 }, + { 0x1.11d563e54f40ep110, 0x1.d16f5716c6c1ap112 }, + { 0x1.03a72a2bbdc06p110, 0x1.ba4f035d60e03p112 }, + { 0x1.ec5b0ca2b20f5p109, 0x1.a447b7b03f045p112 }, + { 0x1.d2bfc6210880ap109, 0x1.8f4ccca7fc90dp112 }, + { 0x1.ba6c1c6e87c4p109, 0x1.7b5223dac7336p112 }, + { 0x1.a35068e9c89cfp109, 0x1.684c227fcacefp112 }, + { 0x1.8d5dbaa383b98p109, 0x1.562fac4329b48p112 }, + { 0x1.7885ce9f67cdbp109, 0x1.44f21e49054f2p112 }, + { 0x1.64bb0863504ddp109, 0x1.34894a5e24657p112 }, + { 0x1.51f06ad20e4c3p109, 0x1.24eb7254ccf83p112 }, + { 0x1.4019914f0b53ap109, 0x1.160f438c70913p112 }, + { 0x1.2f2aa92823e8p109, 0x1.07ebd2a2d2844p112 }, + { 0x1.1f186b432c98bp109, 0x1.f4f12e9ab070ap111 }, + { 0x1.0fd8160ca94ap109, 0x1.db5ad0b27805cp111 }, + { 0x1.015f67a552924p109, 0x1.c304efa2c6f4ep111 }, + { 0x1.e749309831666p108, 0x1.abe09e9144b5ep111 }, + { 0x1.cd3caa04cdd1bp108, 0x1.95df988e76644p111 }, + { 0x1.b48774d0f8e45p108, 0x1.80f439b4ee04bp111 }, + { 0x1.9d189f9f85cbfp108, 0x1.6d11788a69c64p111 }, + { 0x1.86e0050236315p108, 0x1.5a2adfa0b4bc4p111 }, + { 0x1.71ce426a561d3p108, 0x1.4834877429b8fp111 }, + { 0x1.5dd4af79906a9p108, 0x1.37231085c7d9ap111 }, + { 0x1.4ae555af52cdfp108, 0x1.26eb9daed6f7ep111 }, + { 0x1.38f2e86f38216p108, 0x1.1783ceac2891p111 }, + { 0x1.27f0bd5d0e6b1p108, 0x1.08e1badf0fcedp111 }, + { 0x1.17d2c50b2bfafp108, 0x1.f5f7d88472604p110 }, + { 0x1.088d83f7e4069p108, 0x1.db92b5212fb8dp110 }, + { 0x1.f42c17ae0ebf6p107, 0x1.c282cd3957edap110 }, + { 0x1.d8c3ea48f2889p107, 0x1.aab7abace48dcp110 }, + { 0x1.beceb1f9f5b3dp107, 0x1.94219bfcb4928p110 }, + { 0x1.a6399674d366bp107, 0x1.7eb1a2075864ep110 }, + { 0x1.8ef2a9a18d857p107, 0x1.6a597219a93dap110 }, + { 0x1.78e8dcd2e6bfdp107, 0x1.570b69502f313p110 }, + { 0x1.640bf6745325ep107, 0x1.44ba864670882p110 }, + { 0x1.504c882a97424p107, 0x1.335a62115bce2p110 }, + { 0x1.3d9be56279ee9p107, 0x1.22df298214423p110 }, + { 0x1.2bec1a4917edbp107, 0x1.133d96ae7e0ddp110 }, + { 0x1.1b2fe32991d5cp107, 0x1.046aeabcfcdecp110 }, + { 0x1.0b5aa42bf5054p107, 0x1.ecb9cfe1d8642p109 }, + { 0x1.f8c0c2e2ce8dep106, 0x1.d21397ead99cbp109 }, + { 0x1.dc6b6f1384e18p106, 0x1.b8d094c86d374p109 }, + { 0x1.c19fa87de37fbp106, 0x1.a0df0f0c626dcp109 }, + { 0x1.a848df650bea7p106, 0x1.8a2e269750a39p109 }, + { 0x1.90538b942ea7cp106, 0x1.74adc8f4064d3p109 }, + { 0x1.79ad1fce5b3d8p106, 0x1.604ea819f007cp109 }, + { 0x1.6443fdcf0c327p106, 0x1.4d0231928c6f9p109 
}, + { 0x1.50076ad55cc39p106, 0x1.3aba85fe22e2p109 }, + { 0x1.3ce784b411931p106, 0x1.296a70f414053p109 }, + { 0x1.2ad53760d7287p106, 0x1.1905613b3abf2p109 }, + { 0x1.19c232fd50b88p106, 0x1.097f6156f32c5p109 }, + { 0x1.09a0e254c75ep106, 0x1.f59a20caf6695p108 }, + { 0x1.f4c8c392fb944p105, 0x1.d9c73698fb1dcp108 }, + { 0x1.d800ed59bd026p105, 0x1.bf716c6168baep108 }, + { 0x1.bcd30dfbd611bp105, 0x1.a6852c6b58392p108 }, + { 0x1.a32923130213fp105, 0x1.8eefd70594a89p108 }, + { 0x1.8aee4cd06ec1bp105, 0x1.789fb715aae95p108 }, + { 0x1.740ebfab80eb4p105, 0x1.6383f726a8e04p108 }, + { 0x1.5e77b6bbd2127p105, 0x1.4f8c96f26a26ap108 }, + { 0x1.4a1766b6e5e8ap105, 0x1.3caa61607f92p108 }, + { 0x1.36dcf18a6465cp105, 0x1.2acee2f5ecdb8p108 }, + { 0x1.24b85a8bf0124p105, 0x1.19ec60b1242edp108 }, + { 0x1.139a7b37f8475p105, 0x1.09f5cf4dd2877p108 }, + { 0x1.0374f8792ca97p105, 0x1.f5bd95d8730d8p107 }, + { 0x1.e87470e4f4246p104, 0x1.d9371e2ff7c35p107 }, + { 0x1.cbbab18b73217p104, 0x1.be41de54d155ap107 }, + { 0x1.b0a44aa2f067ep104, 0x1.a4c89e08ef4f3p107 }, + { 0x1.971a1ec0f40c7p104, 0x1.8cb738399b12cp107 }, + { 0x1.7f064a8ba8323p104, 0x1.75fa8dbc84becp107 }, + { 0x1.685414c16188ep104, 0x1.608078a70dcbcp107 }, + { 0x1.52efdf060cd2p104, 0x1.4c37c0394d094p107 }, + { 0x1.3ec7176d784b5p104, 0x1.39100d5687bfep107 }, + { 0x1.2bc82ab9d2302p104, 0x1.26f9df8519bd7p107 }, + { 0x1.19e277461404p104, 0x1.15e6827001f18p107 }, + { 0x1.090640946d2d5p104, 0x1.05c803e4831c1p107 }, + { 0x1.f24946f22d5aep103, 0x1.ed22548cffd35p106 }, + { 0x1.d45f15b49b35ep103, 0x1.d06ad6ecdf971p106 }, + { 0x1.b83349fd05191p103, 0x1.b551c847fbc96p106 }, + { 0x1.9dacb2c432ef4p103, 0x1.9bc09f112b494p106 }, + { 0x1.84b37e1cbf8ebp103, 0x1.83a1ff0aa239dp106 }, + { 0x1.6d3126d74b6ccp103, 0x1.6ce1aa3fd7bddp106 }, + { 0x1.5710631158bffp103, 0x1.576c72b514859p106 }, + { 0x1.423d13a3b73e1p103, 0x1.43302cc4a0da8p106 }, + { 0x1.2ea43465e3995p103, 0x1.301ba221dc9bbp106 }, + { 0x1.1c33cd3c37addp103, 0x1.1e1e857adc568p106 }, + { 0x1.0adae3e73c2b5p103, 0x1.0d2966b1746f7p106 }, + { 0x1.f512dd15b73b7p102, 0x1.fa5b4f49cc6b2p105 }, + { 0x1.d6608dc942687p102, 0x1.dc3ae30b55c16p105 }, + { 0x1.b9823c51276e1p102, 0x1.bfd7555a3bd68p105 }, + { 0x1.9e5ce2f93dd76p102, 0x1.a517d9e61628ap105 }, + { 0x1.84d6fe15b6b93p102, 0x1.8be4f8f6c951fp105 }, + { 0x1.6cd87746bc76bp102, 0x1.74287ded49339p105 }, + { 0x1.564a91cd221fp102, 0x1.5dcd669f2cd34p105 }, + { 0x1.4117d7e2c667dp102, 0x1.48bfd38302871p105 }, + { 0x1.2d2c0909ebeb9p102, 0x1.34ecf8a3c124ap105 }, + { 0x1.1a7409475f2f9p102, 0x1.22430f521cbcfp105 }, + { 0x1.08ddd13bd35e7p102, 0x1.10b1488aeb235p105 }, + { 0x1.f0b0be22d18e8p101, 0x1.0027c00a263a6p105 }, + { 0x1.d1a75065a8c74p101, 0x1.e12ee004efc37p104 }, + { 0x1.b48117843c1c7p101, 0x1.c3e44ae32b16bp104 }, + { 0x1.99218b8ac7f8ep101, 0x1.a854ea14102a8p104 }, + { 0x1.7f6dc6010b4adp101, 0x1.8e6761569f45dp104 }, + { 0x1.674c6ae60d852p101, 0x1.7603bac345f65p104 }, + { 0x1.50a592e3c968ep101, 0x1.5f1353cdad001p104 }, + { 0x1.3b62b6aafb0c8p101, 0x1.4980cb3c80949p104 }, + { 0x1.276e9b681072fp101, 0x1.3537f00b6ad4dp104 }, + { 0x1.14b54042f445bp101, 0x1.2225b12bffc68p104 }, + { 0x1.0323ccdc1a3dcp101, 0x1.10380e1adb7e9p104 }, + { 0x1.e5510173b9a5p100, 0x1.febc107d5efaap103 }, + { 0x1.c6654733b86adp100, 0x1.df0f2a0ee6947p103 }, + { 0x1.a964ed354f984p100, 0x1.c14b2188bcee4p103 }, + { 0x1.8e324c651b064p100, 0x1.a553644f7f07dp103 }, + { 0x1.74b179d1eba81p100, 0x1.8b0cfce0579ep103 }, + { 0x1.5cc82d9070d95p100, 0x1.725e7c5dd20f7p103 }, + { 0x1.465daafca8b1dp100, 0x1.5b2fe547a134p103 }, + { 
0x1.315aaa46df48ep100, 0x1.456a974e92e93p103 }, + { 0x1.1da9433aebbcfp100, 0x1.30f93c3699078p103 }, + { 0x1.0b34d93135fcp100, 0x1.1dc7b5b978cf8p103 }, + { 0x1.f3d41033c44ccp99, 0x1.0bc30c5d52f15p103 }, + { 0x1.d36d25268cd2bp99, 0x1.f5b2be65a0c7fp102 }, + { 0x1.b512a1fb1d8fcp99, 0x1.d5f3a8dea7357p102 }, + { 0x1.98a442fc4fc15p99, 0x1.b82915b03515bp102 }, + { 0x1.7e03b1cc6d738p99, 0x1.9c3517e789488p102 }, + { 0x1.651468e010b8ap99, 0x1.81fb7df06136ep102 }, + { 0x1.4dbb989001d84p99, 0x1.6961b8d641d06p102 }, + { 0x1.37e00dac4e8b5p99, 0x1.524ec4d916caep102 }, + { 0x1.236a197bf0b9ap99, 0x1.3cab1343d18d1p102 }, + { 0x1.10437b1569d7ep99, 0x1.2860757487a01p102 }, + { 0x1.fcae93fb7323cp98, 0x1.155a09065d4f7p102 }, + { 0x1.db23c3f816f92p98, 0x1.0384250e4c9fcp102 }, + { 0x1.bbc1a022c14d4p98, 0x1.e59890b926c78p101 }, + { 0x1.9e658108af2ep98, 0x1.c642116a8a9e3p101 }, + { 0x1.82eedbe410407p98, 0x1.a8e405e651ab6p101 }, + { 0x1.693f22ab61ce9p98, 0x1.8d5f98114f872p101 }, + { 0x1.5139a5f3661fbp98, 0x1.7397c5a66e307p101 }, + { 0x1.3ac3788a1b429p98, 0x1.5b71456c5a4c4p101 }, + { 0x1.25c354b26cb4ep98, 0x1.44d26de513197p101 }, + { 0x1.122182e9a270fp98, 0x1.2fa31d6371537p101 }, + { 0x1.ff8f84418d51p97, 0x1.1bcca373b7b43p101 }, + { 0x1.dd4262aac53e8p97, 0x1.0939ab853339fp101 }, + { 0x1.bd3474ec16ca5p97, 0x1.efac5187b2863p100 }, + { 0x1.9f40fd0082b72p97, 0x1.cf1e86235d0e7p100 }, + { 0x1.8345858c4438dp97, 0x1.b0a68a2128babp100 }, + { 0x1.6921be96b86b1p97, 0x1.9423165bc4444p100 }, + { 0x1.50b75c536f927p97, 0x1.7974e743dea3dp100 }, + { 0x1.39e9f7dcbe479p97, 0x1.607e9eacd105p100 }, + { 0x1.249ef1c3be817p97, 0x1.4924a74dec729p100 }, + { 0x1.10bd565b35393p97, 0x1.334d19e0c216p100 }, + { 0x1.fc5b8748842b2p96, 0x1.1edfa3c5f5ccap100 }, + { 0x1.d9b4a18a38642p96, 0x1.0bc56f1b54701p100 }, + { 0x1.b95cede6d524bp96, 0x1.f3d2185e047d9p99 }, + { 0x1.9b2df77a02225p96, 0x1.d26cb87945e87p99 }, + { 0x1.7f03b935e8e3ap96, 0x1.b334fac4b9f99p99 }, + { 0x1.64bc777824f0ep96, 0x1.96076f7918d1cp99 }, + { 0x1.4c389be9acb83p96, 0x1.7ac2d72fc2c63p99 }, + { 0x1.355a9387de78cp96, 0x1.614801550319ep99 }, + { 0x1.2006aeb6bc768p96, 0x1.4979ac8b28927p99 }, + { 0x1.0c23033e2a376p96, 0x1.333c68e2d0548p99 }, + { 0x1.f32ea02b55d23p95, 0x1.1e767bce37dd7p99 }, + { 0x1.d099c5c770f5ap95, 0x1.0b0fc5b6d05ap99 }, + { 0x1.b05cfe2e99435p95, 0x1.f1e3523b41d7dp98 }, + { 0x1.92508d0743fc9p95, 0x1.d00de6608effep98 }, + { 0x1.764f46cf19f9cp95, 0x1.b0778b7b3301bp98 }, + { 0x1.5c36679625a01p95, 0x1.92fb04ec0f6cfp98 }, + { 0x1.43e56c3e340a7p95, 0x1.77756ec9f78fap98 }, + { 0x1.2d3dee1869201p95, 0x1.5dc61922d5a06p98 }, + { 0x1.182380bd2f494p95, 0x1.45ce65699ff6dp98 }, + { 0x1.047b91fcb6491p95, 0x1.2f71a5f15997p98 }, + { 0x1.e45a9790460c1p94, 0x1.1a94ff571654fp98 }, + { 0x1.c242efeaca76p94, 0x1.071f4bbea09ecp98 }, + { 0x1.a284cb82c31cep94, 0x1.e9f1ff8ddd774p97 }, + { 0x1.84f7a1eb7f7f3p94, 0x1.c818223a202c7p97 }, + { 0x1.697595326d7dcp94, 0x1.a887bd2b4404dp97 }, + { 0x1.4fdb462549af1p94, 0x1.8b1a336c5eb6bp97 }, + { 0x1.3807ab51436a8p94, 0x1.6fab63324088ap97 }, + { 0x1.21dbea9108398p94, 0x1.56197e30205bap97 }, + { 0x1.0d3b35021d695p94, 0x1.3e44e45301b92p97 }, + { 0x1.f4154a787cc1bp93, 0x1.281000bfe4c3fp97 }, + { 0x1.d0623f4f4a28fp93, 0x1.135f28f2d50b4p97 }, + { 0x1.af2e69a26261p93, 0x1.00187dded5975p97 }, + { 0x1.904e0b3aa82a3p93, 0x1.dc479de0ef001p96 }, + { 0x1.73985278fa30ep93, 0x1.bad4fdad3caa1p96 }, + { 0x1.58e7298af87d9p93, 0x1.9baed3ed27ab8p96 }, + { 0x1.401708b7e64c6p93, 0x1.7ead9ce4285bbp96 }, + { 0x1.2906cb94eb40dp93, 0x1.63ac6b4edc88ep96 }, + { 0x1.139788f2dd663p93, 
0x1.4a88be2a6390cp96 }, + { 0x1.ff58dab4f2a79p92, 0x1.332259185f1ap96 }, + { 0x1.da552fdd03043p92, 0x1.1d5b1f3793044p96 }, + { 0x1.b7f1f31b571b6p92, 0x1.0916f04b6e18bp96 }, + { 0x1.98006c2117e39p92, 0x1.ec77101de6926p95 }, + { 0x1.7a550f03b145bp92, 0x1.c960bf23153ep95 }, + { 0x1.5ec74662c5961p92, 0x1.a8bd20fc65ef7p95 }, + { 0x1.453141082302ap92, 0x1.8a61745ec7d1dp95 }, + { 0x1.2d6fc2c9e8bcp92, 0x1.6e25d0e756261p95 }, + { 0x1.1761f87a6dc3dp92, 0x1.53e4f7d1666cbp95 }, + { 0x1.02e94eb4ac8a5p92, 0x1.3b7c27a7ddb0ep95 }, + { 0x1.dfd296adef82ap91, 0x1.24caf2c32af14p95 }, + { 0x1.bc8ed301215ebp91, 0x1.0fb3186804d0fp95 }, + { 0x1.9bd5efd2c0f15p91, 0x1.f830c0bb41fd7p94 }, + { 0x1.7d79f2db2d4a5p91, 0x1.d3c0f1a91c846p94 }, + { 0x1.61500f5293f06p91, 0x1.b1e5acf351d87p94 }, + { 0x1.47306f04df3d6p91, 0x1.92712d259ce66p94 }, + { 0x1.2ef5ff0323b28p91, 0x1.7538c60a04476p94 }, + { 0x1.187e3fb74914dp91, 0x1.5a14b04b47879p94 }, + { 0x1.03a918225a966p91, 0x1.40dfd87456f4cp94 }, + { 0x1.e0b15822be4ep90, 0x1.2977b1172b9d5p94 }, + { 0x1.bce26a2fb7176p90, 0x1.13bc07e891491p94 }, + { 0x1.9bb1bc445c3c6p90, 0x1.ff1dbb4300811p93 }, + { 0x1.7cef42e9a617dp90, 0x1.d9a880f306bd8p93 }, + { 0x1.606e51e0a4963p90, 0x1.b6e45220b55ep93 }, + { 0x1.460560e841d79p90, 0x1.96a0b33f2c4dap93 }, + { 0x1.2d8dd47a40ad8p90, 0x1.78b07e9e924acp93 }, + { 0x1.16e3ca3d4393fp90, 0x1.5ce9ab1670dd2p93 }, + { 0x1.01e5e8edda47bp90, 0x1.4325167006bbp93 }, + { 0x1.dcea670907819p89, 0x1.2b3e53538ff3fp93 }, + { 0x1.b8e9bec48816dp89, 0x1.15137a7f44864p93 }, + { 0x1.97945aa1c9c35p89, 0x1.0084ff125639dp93 }, + { 0x1.78b88a4e7107bp89, 0x1.daeb0b7311ec7p92 }, + { 0x1.5c2827c986b62p89, 0x1.b7937d1c40c53p92 }, + { 0x1.41b858361b0fep89, 0x1.96d082f59ab06p92 }, + { 0x1.294150fb19119p89, 0x1.7872d9fa10aadp92 }, + { 0x1.129e20e732adcp89, 0x1.5c4e8e37bc7dp92 }, + { 0x1.fb58fa290d436p88, 0x1.423ac0df49a4p92 }, + { 0x1.d499229819bc6p88, 0x1.2a117230ad284p92 }, + { 0x1.b0c1a759f7739p88, 0x1.13af4f04f9998p92 }, + { 0x1.8f9bb6c075486p88, 0x1.fde703724e56p91 }, + { 0x1.70f4744735c2bp88, 0x1.d77f0c82e7641p91 }, + { 0x1.549cb0f7ef8e2p88, 0x1.b3ee02611d7ddp91 }, + { 0x1.3a68a8c1234e1p88, 0x1.92ff33023d5bdp91 }, + { 0x1.222fc469e8b8cp88, 0x1.7481a9e69f53fp91 }, + { 0x1.0bcc5fd30f1ddp88, 0x1.5847eda620959p91 }, + { 0x1.ee3728761897bp87, 0x1.3e27c1fcc74bdp91 }, + { 0x1.c7fa0c7e3bac7p87, 0x1.25f9ee0b923dcp91 }, + { 0x1.a4a56eb132a54p87, 0x1.0f9a0686532p91 }, + { 0x1.8401b5336a8ap87, 0x1.f5cc7718082bp90 }, + { 0x1.65db58e2358c1p87, 0x1.cf7e53d6a2ca5p90 }, + { 0x1.4a029a7ea7cd1p87, 0x1.ac0f5f3229372p90 }, + { 0x1.304b3d1961171p87, 0x1.8b498644847eap90 }, + { 0x1.188c45630dc53p87, 0x1.6cfa9bcca59dcp90 }, + { 0x1.029fbd8b92835p87, 0x1.50f411d4fd2cdp90 }, + { 0x1.dcc4fabf32f1cp86, 0x1.370ab8327af5ep90 }, + { 0x1.b767ecb334a7ep86, 0x1.1f167f88c6b6ep90 }, + { 0x1.94ec06c0ff29fp86, 0x1.08f24085d4597p90 }, + { 0x1.751977e5803d3p86, 0x1.e8f70e181d61ap89 }, + { 0x1.57bc950253825p86, 0x1.c324c20e337dcp89 }, + { 0x1.3ca58b816a87fp86, 0x1.a03261574b54ep89 }, + { 0x1.23a8197d2607ep86, 0x1.7fe903cdf5855p89 }, + { 0x1.0c9b4b0a6a16fp86, 0x1.6215c58da345p89 }, + { 0x1.eeb27891d2bb3p85, 0x1.46897d4b69fc6p89 }, + { 0x1.c77dbfc848866p85, 0x1.2d1877d731b7bp89 }, + { 0x1.a357936adf17bp85, 0x1.159a386b11517p89 }, + { 0x1.8203fa7992554p85, 0x1.ffd27ae9393cep88 }, + { 0x1.634b7f56b0a5cp85, 0x1.d7c593130dd0bp88 }, + { 0x1.46fada7e6a5fep85, 0x1.b2cd607c79bcfp88 }, + { 0x1.2ce2a3690576bp85, 0x1.90ae4d3405651p88 }, + { 0x1.14d707280e6cfp85, 0x1.71312dd1759e2p88 }, + { 0x1.fd5f08ad2b29ap84, 
0x1.5422ef5d8949dp88 }, + { 0x1.d48d57f7718b7p84, 0x1.39544b0ecc957p88 }, + { 0x1.aef3ce0add578p84, 0x1.20997f73e73ddp88 }, + { 0x1.8c52800f939c8p84, 0x1.09ca0eaacd277p88 }, + { 0x1.6c6e61e57bf9bp84, 0x1.e9810295890ecp87 }, + { 0x1.4f10e8ebc44a9p84, 0x1.c2b45b5aa4a1dp87 }, + { 0x1.3407b59d72a5bp84, 0x1.9eee068fa7596p87 }, + { 0x1.1b2443858c0a1p84, 0x1.7df2b399c10a8p87 }, + { 0x1.043b9f1621ff3p84, 0x1.5f8b87a31bd85p87 }, + { 0x1.de4c41eb96b45p83, 0x1.4385c96e9a2d9p87 }, + { 0x1.b77e5cbd5d147p83, 0x1.29b2933ef4cbcp87 }, + { 0x1.93c9fc62bfb11p83, 0x1.11e68a6378f8ap87 }, + { 0x1.72f0c4c8e9bffp83, 0x1.f7f338086a86bp86 }, + { 0x1.54b92affb11afp83, 0x1.cf8d7d9ce040ap86 }, + { 0x1.38ee17b150182p83, 0x1.aa577251ae485p86 }, + { 0x1.1f5e908f70e0cp83, 0x1.8811d739efb5fp86 }, + { 0x1.07dd6833bb38p83, 0x1.68823e52970bep86 }, + { 0x1.e481e7f6ac4bcp82, 0x1.4b72ae68e8b4cp86 }, + { 0x1.bcc58edad5559p82, 0x1.30b14dbe876bcp86 }, + { 0x1.983ee9896d582p82, 0x1.181012ef8661p86 }, + { 0x1.76aca47764427p82, 0x1.01647ba798745p86 }, + { 0x1.57d287836bd3dp82, 0x1.d90e917701675p85 }, + { 0x1.3b79118c097a1p82, 0x1.b2a87e86d0c8ap85 }, + { 0x1.216d1b97279a9p82, 0x1.8f53dcb377293p85 }, + { 0x1.097f82fc04025p82, 0x1.6ed2f2515e933p85 }, + { 0x1.e709b415656dp81, 0x1.50ecc9ed47f19p85 }, + { 0x1.beaa3d6c15504p81, 0x1.356cd5ce7799ep85 }, + { 0x1.9996ed9b83967p81, 0x1.1c229a587ab78p85 }, + { 0x1.778be2bd9795bp81, 0x1.04e15ecc7f3f6p85 }, + { 0x1.584a99af8a842p81, 0x1.deffc7e6a6017p84 }, + { 0x1.3b99832cbefddp81, 0x1.b7b040832f31p84 }, + { 0x1.2143a112d0466p81, 0x1.938e021f36d76p84 }, + { 0x1.09182b326b229p81, 0x1.7258610b3b233p84 }, + { 0x1.e5d47637f5db5p80, 0x1.53d3bfc82a909p84 }, + { 0x1.bd20fcc3b76d7p80, 0x1.37c92babdc2fdp84 }, + { 0x1.97c9dda748fc7p80, 0x1.1e06010120f6ap84 }, + { 0x1.7589207e91ad1p80, 0x1.065b9616170d4p84 }, + { 0x1.561e669aa7fdbp80, 0x1.e13dd96b3753bp83 }, + { 0x1.394e7a2ac9fc7p80, 0x1.b950d32467392p83 }, + { 0x1.1ee2e61eccc99p80, 0x1.94a72263259a5p83 }, + { 0x1.06a996198f06fp80, 0x1.72fd93e036cdcp83 }, + { 0x1.e0e8fbad2703ep79, 0x1.54164576929abp83 }, + { 0x1.b8328ee330ae9p79, 0x1.37b83c521fe96p83 }, + { 0x1.92e21013a767p79, 0x1.1daf033182e96p83 }, + { 0x1.70aff489136ebp79, 0x1.05ca50205d26ap83 }, + { 0x1.515a7c77fab48p79, 0x1.dfbb6235639fap82 }, + { 0x1.34a53ce0bbb6fp79, 0x1.b7807e294781fp82 }, + { 0x1.1a58b2b09fdcbp79, 0x1.9298add70a734p82 }, + { 0x1.0241de6c31e5bp79, 0x1.70beaf9c7ffb6p82 }, + { 0x1.d863cf753825cp78, 0x1.51b2cd6709222p82 }, + { 0x1.affb906d0ae09p78, 0x1.353a6cf7f7fffp82 }, + { 0x1.8afbf9e9520c2p78, 0x1.1b1fa8cbe84a7p82 }, + { 0x1.691c7c768becep78, 0x1.0330f0fd69921p82 }, + { 0x1.4a1a79df39cdep78, 0x1.da81670f96f9bp81 }, + { 0x1.2db8ca9009091p78, 0x1.b24a16b4d09aap81 }, + { 0x1.13bf4cb384e4ap78, 0x1.8d6eeb6efdbd6p81 }, + { 0x1.f7f4f88751db4p77, 0x1.6ba91ac734786p81 }, + { 0x1.cc7626bced452p77, 0x1.4cb7966770ab5p81 }, + { 0x1.a4ab6470c1c5cp77, 0x1.305e9721d0981p81 }, + { 0x1.80451c2811052p77, 0x1.1667311fff70ap81 }, + { 0x1.5efa4d64f59f6p77, 0x1.fd3de10d62855p80 }, + { 0x1.40880373ed74p77, 0x1.d1aefbcd48d0cp80 }, + { 0x1.24b0d7368076ep77, 0x1.a9cc93c25aca9p80 }, + { 0x1.0b3c7b0d960fp77, 0x1.85487ee3ea735p80 }, + { 0x1.e7eea02e4ed88p76, 0x1.63daf8b4b1e0cp80 }, + { 0x1.bd6408059b696p76, 0x1.45421e69a6ca1p80 }, + { 0x1.96826d9e90341p76, 0x1.294175802d99ap80 }, + { 0x1.72fa4fa12d516p76, 0x1.0fa17bf41068fp80 }, + { 0x1.5282d2d5803fep76, 0x1.f05e82aae2bb9p79 }, + { 0x1.34d935f1be064p76, 0x1.c578101b29058p79 }, + { 0x1.19c050c56d0d7p76, 0x1.9e39dc5dd2f7cp79 }, + { 0x1.01001dd9c7ccep76, 
0x1.7a553a728bbf2p79 }, + { 0x1.d4ca9b634ecbap75, 0x1.5982008db1304p79 }, + { 0x1.ab81c5c80cf39p75, 0x1.3b7e00422e51bp79 }, + { 0x1.85cfacb7477f2p75, 0x1.200c898d9ee3ep79 }, + { 0x1.6365862923eb9p75, 0x1.06f5f7eb65a56p79 }, + { 0x1.43fb317b5dc37p75, 0x1.e00e9148a1d25p78 }, + { 0x1.274ea96044bd7p75, 0x1.b623734024e92p78 }, + { 0x1.0d23817479c67p75, 0x1.8fd4e01891bf8p78 }, + { 0x1.ea84dd159259p74, 0x1.6cd44c7470d89p78 }, + { 0x1.bef1b1a12823ep74, 0x1.4cd9c04158cd7p78 }, + { 0x1.9730edfda64acp74, 0x1.2fa34bf5c8344p78 }, + { 0x1.72ede3b7eaa25p74, 0x1.14f4890ff2461p78 }, + { 0x1.51db1ec3a3087p74, 0x1.f92c49dfa4df5p77 }, + { 0x1.33b1c9d1576ecp74, 0x1.ccaaea71ab0dfp77 }, + { 0x1.18311f8a03acap74, 0x1.a40829f001197p77 }, + { 0x1.fe3bcf4629feap73, 0x1.7eef13b59e96cp77 }, + { 0x1.d083fda665164p73, 0x1.5d11e1a252bf5p77 }, + { 0x1.a6d7d18831888p73, 0x1.3e296303b2297p77 }, + { 0x1.80dcd6603df1bp73, 0x1.21f47009f43cep77 }, + { 0x1.5e4062d5b6a4ep73, 0x1.083768c5e4542p77 }, + { 0x1.3eb6ef47c2758p73, 0x1.e1777d831265fp76 }, + { 0x1.21fb7a81c5444p73, 0x1.b69f10b0191b5p76 }, + { 0x1.07cefb734d68bp73, 0x1.8f8a3a05b5b53p76 }, + { 0x1.dfefbdb19ac7ep72, 0x1.6be573c40c8e7p76 }, + { 0x1.b4831fb12344p72, 0x1.4b645ba991fdbp76 }, + { 0x1.8cf81557d20b6p72, 0x1.2dc119095729fp76 }, + { 0x1.68f6f0feb4755p72, 0x1.12bbcfa4d62dep76 }, + { 0x1.482fa78c40635p72, 0x1.f4343c7d504b9p75 }, + { 0x1.2a59289a484fbp72, 0x1.c74d4fe1e0e8bp75 }, + { 0x1.0f30c4d0be5cp72, 0x1.9e614ecbf4af6p75 }, + { 0x1.ecf3428c48d4fp71, 0x1.791716475420cp75 }, + { 0x1.bff86d9ec8499p71, 0x1.571d34563050ap75 }, + { 0x1.970bb87f4ae14p71, 0x1.3829407a207d8p75 }, + { 0x1.71d0b55b79b86p71, 0x1.1bf74244aed5ap75 }, + { 0x1.4ff315d036fbdp71, 0x1.024924c7520d1p75 }, + { 0x1.3125f6a3d257p71, 0x1.d5cc6ba567f29p74 }, + { 0x1.15233ae8815f2p71, 0x1.ab3560167ccaap74 }, + { 0x1.f755ea760487dp70, 0x1.846e9dda7a163p74 }, + { 0x1.c905bbd9ab5a6p70, 0x1.6121d7db32bddp74 }, + { 0x1.9eebaa0589b4ep70, 0x1.410047ead6894p74 }, + { 0x1.78a6de0f41b89p70, 0x1.23c2090cdde78p74 }, + { 0x1.55df1790f2f61p70, 0x1.09257fca001cp74 }, + { 0x1.3643ec463a3cfp70, 0x1.e1dd9ec677783p73 }, + { 0x1.198c18435598dp70, 0x1.b5ceb5a13221bp73 }, + { 0x1.fee9bab9f4e14p69, 0x1.8dbaa11de2037p73 }, + { 0x1.cf82e0eb6196bp69, 0x1.694680a9a3ee6p73 }, + { 0x1.a474e7029a919p69, 0x1.481f73b3778e8p73 }, + { 0x1.7d5af6513e2bep69, 0x1.29f9e7d8fd094p73 }, + { 0x1.59d93e1d8f57dp69, 0x1.0e90f64b5b103p73 }, + { 0x1.399c279e4699ap69, 0x1.eb4b9e47b58c9p72 }, + { 0x1.1c579bbca6885p69, 0x1.bdfe62f60dd7p72 }, + { 0x1.01c659160612dp69, 0x1.94d1de5c4576fp72 }, + { 0x1.d352b1ae2694p68, 0x1.6f66f6ab90c3cp72 }, + { 0x1.a78e8252c204dp68, 0x1.4d67050b31c2ap72 }, + { 0x1.7fd7c80f3410ep68, 0x1.2e8318008cf89p72 }, + { 0x1.5bcf92cc55d86p68, 0x1.1273463a1589bp72 }, + { 0x1.3b1f876b10da7p68, 0x1.f1ec20afad0e2p71 }, + { 0x1.1d791bb1324a1p68, 0x1.c39fa0d4a5a2bp71 }, + { 0x1.0294e37abcee8p68, 0x1.99946bf7e02a1p71 }, + { 0x1.d463db5fa3c13p67, 0x1.73679b24aeb9bp71 }, + { 0x1.a82a5f4047a5bp67, 0x1.50bf2558ab78fp71 }, + { 0x1.8011fb05fe09p67, 0x1.314916abfa1eap71 }, + { 0x1.5bb91decf8a58p67, 0x1.14bad9006f53bp71 }, + { 0x1.3ac71ce35c1d3p67, 0x1.f5a1196b5bb2ep70 }, + { 0x1.1ceb656955c59p67, 0x1.c698e001f6d3p70 }, + { 0x1.01dcc2acf7755p67, 0x1.9beca74b0f147p70 }, + { 0x1.d2b166911c178p66, 0x1.753637caac6d9p70 }, + { 0x1.a6459c5b11342p66, 0x1.5218993857afcp70 }, + { 0x1.7e086accc805dp66, 0x1.323f3f19cff3ep70 }, + { 0x1.59962aef547b3p66, 0x1.155d47fdb9c94p70 }, + { 0x1.3894608650edep66, 0x1.f6599b70323cap69 }, + { 0x1.1ab0e4d284f44p66, 
0x1.c6dc8a4bb3ba6p69 }, + { 0x1.ff4248ebb8299p65, 0x1.9bcfd83a431e9p69 }, + { 0x1.ce42dd8e4fa23p65, 0x1.74ca889bbacd5p69 }, + { 0x1.a1e8aa1400997p65, 0x1.516d33e26c04p69 }, + { 0x1.79c430435a7fcp65, 0x1.31612a7ef535fp69 }, + { 0x1.557046eb39249p65, 0x1.1457ab75c2489p69 }, + { 0x1.349127b59b217p65, 0x1.f41259c9550cp68 }, + { 0x1.16d392dff5104p65, 0x1.c46969ca99a2ep68 }, + { 0x1.f7d80dc993f2fp64, 0x1.993e82b76e726p68 }, + { 0x1.c72c149cb214bp64, 0x1.72267ac1b25ap68 }, + { 0x1.9b270c24cc8fap64, 0x1.4ec0062aeeb78p68 }, + { 0x1.73585df7b6643p64, 0x1.2eb2d18a2081bp68 }, + { 0x1.4f59f9910367ep64, 0x1.11aeb0b11d1a1p68 }, + { 0x1.2ecf5b7f6abe3p64, 0x1.eed5c0bbf1061p67 }, + { 0x1.1164ab45aa235p64, 0x1.bf4ab21b4f3fp67 }, + { 0x1.ed9bdbc6f1b0ap63, 0x1.944462d4d5991p67 }, + { 0x1.bd8c96533b39bp63, 0x1.6d561de54f6a1p67 }, + { 0x1.921ec84d5860ep63, 0x1.4a1d472804fc8p67 }, + { 0x1.6ae172414cebap63, 0x1.2a406e25fcb44p67 }, + { 0x1.476e3b661be8cp63, 0x1.0d6e7662dda9dp67 }, + { 0x1.276873924f0b4p63, 0x1.e6bba6770e22dp66 }, + { 0x1.0a7c2c9322f59p63, 0x1.b797ab2ba22d2p66 }, + { 0x1.e0bad18c4e37dp62, 0x1.8cf813910fdcdp66 }, + { 0x1.b18eba0be4d24p62, 0x1.666f488db6e0ap66 }, + { 0x1.86f7884e1caadp62, 0x1.4399f7770045fp66 }, + { 0x1.608484d592328p62, 0x1.241e1ebbbf4ecp66 }, + { 0x1.3dcfaee52a8f5p62, 0x1.07aa30ce6a5ap66 }, + { 0x1.1e7cbac093f27p62, 0x1.dbe8969a24c6fp65 }, + { 0x1.023827dc88ed9p62, 0x1.ad7301258d788p65 }, + { 0x1.d16cd999791c3p61, 0x1.837a640fa9d3dp65 }, + { 0x1.a3666de0788bp61, 0x1.5d90f358d61f6p65 }, + { 0x1.79e17816df1e8p61, 0x1.3b5342f7be9cp65 }, + { 0x1.546e385224d1p61, 0x1.1c674ecd152d3p65 }, + { 0x1.32a7a483e977bp61, 0x1.007b997a0b531p65 }, + { 0x1.1432649c86c4dp61, 0x1.ce8cc007a6432p64 }, + { 0x1.f177ce0bd5836p60, 0x1.a109c0bccbc39p64 }, + { 0x1.bff3166bc36eep60, 0x1.77f5624913c3ap64 }, + { 0x1.934fc0975fb3p60, 0x1.52e251d5d3b1fp64 }, + { 0x1.6b13ebb9a5ad4p60, 0x1.316da780bc4d9p64 }, + { 0x1.46d17a80cc174p60, 0x1.133deb1d3526p64 }, + { 0x1.2624f3a0a887p60, 0x1.f00460b24acf8p63 }, + { 0x1.08b47d7733cb6p60, 0x1.bee2903d584f9p63 }, + { 0x1.dc5de496b181p59, 0x1.92920a7c80e26p63 }, + { 0x1.ac9615b3c9fd7p59, 0x1.6a9b25345c773p63 }, + { 0x1.818d3a356669ep59, 0x1.4691b26b9c82fp63 }, + { 0x1.5acbdab2ed713p59, 0x1.2613e9610f6d1p63 }, + { 0x1.37e61fd4c0fep59, 0x1.08c969adf0beap63 }, + { 0x1.187ab3d71db11p59, 0x1.dcc4ac4f59be5p62 }, + { 0x1.f8637ea4e52acp58, 0x1.ad2d0a9a18288p62 }, + { 0x1.c577fd709b099p58, 0x1.82498a7cc94b9p62 }, + { 0x1.97a3dc62119c8p58, 0x1.5ba462dee8a02p62 }, + { 0x1.6e66137bb7ccap58, 0x1.38d330d8806ap62 }, + { 0x1.494a3f6a9a70ep58, 0x1.1975e0627306cp62 }, + { 0x1.27e767bb79ea2p58, 0x1.fa6b5ee8f3088p61 }, + { 0x1.09dee32687729p58, 0x1.c78892308bd9p61 }, + { 0x1.ddb6ae2f39381p57, 0x1.99b5ec6741cb3p61 }, + { 0x1.ad1f9fba4b2abp57, 0x1.7073c400e10dcp61 }, + { 0x1.816dde4c11ca3p57, 0x1.4b4ee0b3a84d6p61 }, + { 0x1.5a245d5e5289cp57, 0x1.29df4862ac231p61 }, + { 0x1.36d26a686daafp57, 0x1.0bc7294e0cbafp61 }, + { 0x1.171277cbbce9cp57, 0x1.e163bd8df864p60 }, + { 0x1.f5120b45c00e6p56, 0x1.b0a61bce91993p60 }, + { 0x1.c1c74b30d0bbp56, 0x1.84cbb00f925fp60 }, + { 0x1.93b02e5cf0324p56, 0x1.5d5841ce6cb73p60 }, + { 0x1.6a46f43f3118cp56, 0x1.39dbcd485dd07p60 }, + { 0x1.45132973bb79bp56, 0x1.19f153b38a108p60 }, + { 0x1.23a85891dc72bp56, 0x1.fa7b9159fc471p59 }, + { 0x1.05a4dba466c4ep56, 0x1.c6de3429e31fap59 }, + { 0x1.d561964307dc4p55, 0x1.98769faac8a1bp59 }, + { 0x1.a4fa0f13737e8p55, 0x1.6ebf82977acfp59 }, + { 0x1.7984b636ad1bep55, 0x1.4940bc89fa5aap59 }, + { 0x1.5281628cb373ap55, 0x1.278e135bcf0a4p59 }, 
+ { 0x1.2f7cc38bc628dp55, 0x1.0946088b6f8edp59 }, + { 0x1.100f1aef8eaf5p55, 0x1.dc21972b9e9f4p58 }, + { 0x1.e7b62ce66acdep54, 0x1.ab3e8cfada51ap58 }, + { 0x1.b5198cf325114p54, 0x1.7f5483f729c27p58 }, + { 0x1.87b15da6677afp54, 0x1.57e33e2b1c6dap58 }, + { 0x1.5ef5de2e68985p54, 0x1.3477480d89e25p58 }, + { 0x1.3a6d00852a688p54, 0x1.14a8b54629fb2p58 }, + { 0x1.19a90b14f53afp54, 0x1.f033fa073d52p57 }, + { 0x1.f88eba04114cbp53, 0x1.bcede5acc0d4p57 }, + { 0x1.c3dea36b87937p53, 0x1.8ee7b29d0b081p57 }, + { 0x1.94a28136fa731p53, 0x1.659917bbb6632p57 }, + { 0x1.6a4b2c9663fa1p53, 0x1.40877b79cd868p57 }, + { 0x1.44580945b8452p53, 0x1.1f44979177348p57 }, + { 0x1.22558f1aa9f03p53, 0x1.016d3f035816p57 }, + { 0x1.03dbf8db89298p53, 0x1.cd508600d0ba8p56 }, + { 0x1.d11c2965639f6p52, 0x1.9d4ae77a21604p56 }, + { 0x1.a03065db54a4bp52, 0x1.723974e9529d8p56 }, + { 0x1.745e6013d8cf3p52, 0x1.4b9a944f57915p56 }, + { 0x1.4d1f2eb8531p52, 0x1.28f9c9b769ee3p56 }, + { 0x1.29f9b7c4f56dfp52, 0x1.09ee66b6e99e9p56 }, + { 0x1.0a814a1dfc5edp52, 0x1.dc34b6999ff72p55 }, + { 0x1.dca8b63e38fa9p51, 0x1.aa5249b4cca57p55 }, + { 0x1.aa36c9242f8bcp51, 0x1.7d9db080918bap55 }, + { 0x1.7d0fbfa6c3c19p51, 0x1.558e88e8945efp55 }, + { 0x1.54a6b679dd96fp51, 0x1.31aa564e92066p55 }, + { 0x1.307d4e71272d7p51, 0x1.11831a9c3763dp55 }, + { 0x1.1022313b11381p51, 0x1.e96c265c21fbfp54 }, + { 0x1.e65f78e13edcdp50, 0x1.b5d52c19374fep54 }, + { 0x1.b2959e487c93fp50, 0x1.87a2188252d5fp54 }, + { 0x1.84436cf62b6f8p50, 0x1.5e440cc8caaf9p54 }, + { 0x1.5ad66c67f3f63p50, 0x1.393ad199301dep54 }, + { 0x1.35cb549c616ebp50, 0x1.18135a0647102p54 }, + { 0x1.14ac7e9322a1ap50, 0x1.f4ccd98eab06bp53 }, + { 0x1.ee20fae75a2c5p49, 0x1.bfaedff2748c1p53 }, + { 0x1.b931b883c77f2p49, 0x1.9026a7e3c9538p53 }, + { 0x1.89e1f8e1d4be6p49, 0x1.659f3419269eep53 }, + { 0x1.5f9a24050e89fp49, 0x1.3f92e9472ca4cp53 }, + { 0x1.39d2746cbe57fp49, 0x1.1d89fb6602df9p53 }, + { 0x1.18115431b6c4ap49, 0x1.fe32077e095c4p52 }, + { 0x1.f3d3ca19edf64p48, 0x1.c7bf775863df5p52 }, + { 0x1.bdf55dd9bdcep48, 0x1.970fb0b5580dcp52 }, + { 0x1.8dd8e25d2255dp48, 0x1.6b88087e4af9fp52 }, + { 0x1.62e225ebca19p48, 0x1.449de67f2c6b2p52 }, + { 0x1.3c855ef212badp48, 0x1.21d51dc348d4dp52 }, + { 0x1.1a4576cd5cddcp48, 0x1.02be7023a443ep52 }, + { 0x1.f765035c713d8p47, 0x1.cdec7155697e1p51 }, + { 0x1.c0d0bdeb46ae2p47, 0x1.9c4671c1a6e3cp51 }, + { 0x1.901afbd3819bep47, 0x1.6feb0af26f865p51 }, + { 0x1.64a386137b955p47, 0x1.484b1e63b3be4p51 }, + { 0x1.3ddb15521ce49p47, 0x1.24e68a1458bd7p51 }, + { 0x1.1b418ba2217c6p47, 0x1.054a9a7c2f05ap51 }, + { 0x1.f8c8bad8e2a2p46, 0x1.d2214ad33ca5ep50 }, + { 0x1.c1ba4950b8f4fp46, 0x1.9fb9933adac68p50 }, + { 0x1.90a0b40dd690cp46, 0x1.72b99eccc462ep50 }, + { 0x1.64d860502b279p46, 0x1.4a8e4dbe3539cp50 }, + { 0x1.3dcf1aadc099dp46, 0x1.26b4018ef81f7p50 }, + { 0x1.1b02414a73357p46, 0x1.06b4fe82cc6aep50 }, + { 0x1.f7fa3e4bec2aep45, 0x1.d44feffb34893p49 }, + { 0x1.c0aee6d6b1406p45, 0x1.a15d86bb23572p49 }, + { 0x1.8f684065398bfp45, 0x1.73ea5ac0d71a9p49 }, + { 0x1.637ff9397e989p45, 0x1.4b5fdd0f567fap49 }, + { 0x1.3c618d3c706ebp45, 0x1.2737769828878p49 }, + { 0x1.1988625955723p45, 0x1.06f8da87263cep49 }, + { 0x1.f4fc2f6d50e41p44, 0x1.d4710a9e149edp48 }, + { 0x1.bdb204ff1cda3p44, 0x1.a12cc7b1bf616p48 }, + { 0x1.8c75a6fa17116p44, 0x1.73793d6253bd7p48 }, + { 0x1.609ec277b8703p44, 0x1.4abd0af44c7f8p48 }, + { 0x1.399725d96eb63p44, 0x1.266f2e981ccfbp48 }, + { 0x1.16d8d1241b86bp44, 0x1.06154a07d21a2p48 }, + { 0x1.efd875a51d28dp43, 0x1.d2842b40e25fp47 }, + { 0x1.b8cd873c4de72p43, 0x1.9f27fa465d061p47 }, + { 
0x1.87d2a89e5ac65p43, 0x1.7167c3937ded9p47 }, + { 0x1.5c3e42539c769p43, 0x1.48a7fb96552cap47 }, + { 0x1.35791e04cd29fp43, 0x1.245dcbaa25b1bp47 }, + { 0x1.12fc6cdafd10dp43, 0x1.040d4ab2de626p47 }, + { 0x1.e8a0077a1ed47p42, 0x1.ce8fcb8dadc2cp46 }, + { 0x1.b2118f75a4eb7p42, 0x1.9b55e7c11d9e6p46 }, + { 0x1.818e8b1c2616fp42, 0x1.6dbce02ec5c77p46 }, + { 0x1.566cdf4525ebp42, 0x1.4527acab6dfebp46 }, + { 0x1.3014fd204bc71p42, 0x1.210a3ddcb4706p46 }, + { 0x1.0dffe0bfc0c74p42, 0x1.00e7aba6527c9p46 }, + { 0x1.df6a8d5e14f11p41, 0x1.c8a12a152d814p45 }, + { 0x1.a9942579915cdp41, 0x1.95c35893651c9p45 }, + { 0x1.79bdc576e403ap41, 0x1.6884d52cc9914p45 }, + { 0x1.4f3d9114d799bp41, 0x1.4047ce663f641p45 }, + { 0x1.297c4e6eb62fcp41, 0x1.1c7f9c74f3e7cp45 }, + { 0x1.07f35ef1a4fcp41, 0x1.f95dcee779f74p44 }, + { 0x1.d455e0a3b0d94p40, 0x1.c0cc007cc808ep44 }, + { 0x1.9f70bf04a77cep40, 0x1.8e82cd2a6133cp44 }, + { 0x1.707990a8defefp40, 0x1.61d0ef76712e4p44 }, + { 0x1.46c779ebb14aep40, 0x1.3a1882865d26ep44 }, + { 0x1.21c4420bc9879p40, 0x1.16cce86450b2p44 }, + { 0x1.00ea48df1e7fbp40, 0x1.eee1d41e1e516p43 }, + { 0x1.c7856a7693627p39, 0x1.b72a1658393d4p43 }, + { 0x1.93c7abef59a2cp39, 0x1.85ac17b553c4fp43 }, + { 0x1.65df602b1e0ffp39, 0x1.59b72775450f3p43 }, + { 0x1.3d256a5ee461dp39, 0x1.32ae03812fcp43 }, + { 0x1.19053bac5f645p39, 0x1.1004b9cd4bae6p43 }, + { 0x1.f1f58fe66e142p38, 0x1.e27d88d5289bfp42 }, + { 0x1.b9216793da422p38, 0x1.abdab3fb224cep42 }, + { 0x1.86bd6adace04ep38, 0x1.7b5bd9f52a89ep42 }, + { 0x1.5a104640aeb74p38, 0x1.5051a941eb13p42 }, + { 0x1.32755417b50ddp38, 0x1.2a20366f6a0dep42 }, + { 0x1.0f5a5274f5c45p38, 0x1.083cdb1163405p42 }, + { 0x1.e07ab300dc4b9p37, 0x1.d458a013d18b4p41 }, + { 0x1.a956163a49613p37, 0x1.9f01f97b2e043p41 }, + { 0x1.7879eb52380edp37, 0x1.6fb2eaf7d8102p41 }, + { 0x1.4d30488394e18p37, 0x1.45be480207b14p41 }, + { 0x1.26d7af2869fc5p37, 0x1.208a2b041836ep41 }, + { 0x1.04e0c593552f5p37, 0x1.ff1ba8cbc9c8dp40 }, + { 0x1.cd98a274acae3p36, 0x1.c49f8a8ec4aebp40 }, + { 0x1.9852d44d7528bp36, 0x1.90c81ede57558p40 }, + { 0x1.6927c2c3e497p36, 0x1.62d5a948b6358p40 }, + { 0x1.3f65a98c177c9p36, 0x1.3a1de0952fd2bp40 }, + { 0x1.1a6ed66936eeap36, 0x1.16098d4b94692p40 }, + { 0x1.f36ed3084aa81p35, 0x1.ec24d6a8bc072p39 }, + { 0x1.b986ab7ebdd54p35, 0x1.b3828ebcc128bp39 }, + { 0x1.864933f3c0573p35, 0x1.8158a3038115ep39 }, + { 0x1.58f359f0c4e8fp35, 0x1.54eb3e9a3e72bp39 }, + { 0x1.30d82cb8a968cp35, 0x1.2d93b0174f61ap39 }, + { 0x1.0d5e5f59de7c1p35, 0x1.0abe0d45fd5c2p39 }, + { 0x1.dbfc240ab5f81p34, 0x1.d7ce33a39bd89p38 }, + { 0x1.a47db588b15cfp34, 0x1.a134d30d655e4p38 }, + { 0x1.736c0d0a31187p34, 0x1.70e16f315ef4p38 }, + { 0x1.480a1879e8f57p34, 0x1.461cda38e2783p38 }, + { 0x1.21b0591ce1cfdp34, 0x1.2044a2faebb7bp38 }, + { 0x1.ff94e3fca1752p33, 0x1.fd91813f8cc8cp37 }, + { 0x1.c3a9f9558ffap33, 0x1.c2530177987fep37 }, + { 0x1.8eb738c76b2f2p33, 0x1.8deb61106f334p37 }, + { 0x1.5fee91a43fef1p33, 0x1.5f91f55e86346p37 }, + { 0x1.3699940a6a811p33, 0x1.3694e7b13691bp37 }, + { 0x1.1216c07263dep33, 0x1.1256a18de488bp37 }, + { 0x1.e3ae49fef5535p32, 0x1.e49705a5ebd5fp36 }, + { 0x1.aab87fb8e4441p32, 0x1.abefb3186e784p36 }, + { 0x1.786c3dca158c4p32, 0x1.79dc285401b7dp36 }, + { 0x1.4c036b7451223p32, 0x1.4d9a4f359ba1ep36 }, + { 0x1.24cec8453db03p32, 0x1.267e46fd85893p36 }, + { 0x1.02334e92993b9p32, 0x1.03efdea0a0506p36 }, + { 0x1.c74fc41217dfbp31, 0x1.cad0afbb569b1p35 }, + { 0x1.9166837399532p31, 0x1.94e0d5e7a8744p35 }, + { 0x1.61d46c11dd916p31, 0x1.653d077d9eefp35 }, + { 0x1.37dbe7711fcd4p31, 0x1.3b2a639494566p35 }, + { 
0x1.12d55c1e73c65p31, 0x1.16038b4af0a0ep35 }, + { 0x1.e4594b115943bp30, 0x1.ea6c598920c48p34 }, + { 0x1.aabdabdb93484p30, 0x1.b081aaf25ade1p34 }, + { 0x1.77f073eb945dfp30, 0x1.7d62079a4e4a6p34 }, + { 0x1.4b252d0bc8bebp30, 0x1.5042e1a8664edp34 }, + { 0x1.23a7345c57ccap30, 0x1.287117d29a9e6p34 }, + { 0x1.00d6f8a57f06ep30, 0x1.054e44f8ee735p34 }, + { 0x1.c44f136cf3bd8p29, 0x1.cc9cbc5fe04a8p33 }, + { 0x1.8e38df2790b7ap29, 0x1.95eb2cb828067p33 }, + { 0x1.5e8f828661e21p29, 0x1.65acfefcd0029p33 }, + { 0x1.3490e7e2bc31cp29, 0x1.3b20c56ad84f5p33 }, + { 0x1.0f91b7ff9bb2ap29, 0x1.159b917beb87ap33 }, + { 0x1.ddf56913a541ep28, 0x1.e90cb5cac7057p32 }, + { 0x1.a48cc1b8a7bc7p28, 0x1.aeb7659e5f7efp32 }, + { 0x1.71fde01e2ca8cp28, 0x1.7b4b752e86e5fp32 }, + { 0x1.4578e0b906b32p28, 0x1.4df8ace15322ep32 }, + { 0x1.1e4659a2a2156p28, 0x1.26072a17961ap32 }, + { 0x1.f788fc218597bp27, 0x1.02d48c75e7d9bp32 }, + { 0x1.bac92daac0b9dp27, 0x1.c7a2ecd5f05ap31 }, + { 0x1.85518c3484796p27, 0x1.90feaede7f2aep31 }, + { 0x1.56441b55bfff1p27, 0x1.60dcef1cedc3ap31 }, + { 0x1.2cdd203ab43a1p27, 0x1.36787980e7387p31 }, + { 0x1.08700c199ad4fp27, 0x1.112346e13dd7ep31 }, + { 0x1.d0c9857c390f3p26, 0x1.e087915129a98p30 }, + { 0x1.986a650394095p26, 0x1.a6a5096da5b7dp30 }, + { 0x1.66d6688315ad6p26, 0x1.73aff07c7874ep30 }, + { 0x1.3b3d55ebd8547p26, 0x1.46d572e10e216p30 }, + { 0x1.14e7b714e7093p26, 0x1.1f5ba17e5a90bp30 }, + { 0x1.e667d9a8bcd9ep25, 0x1.f93d0d186fbcdp29 }, + { 0x1.ab2733e383ad8p25, 0x1.bc1b22cec72bp29 }, + { 0x1.7712b76c8c7f6p25, 0x1.86529e9df069cp29 }, + { 0x1.494d8e1d4fc61p25, 0x1.5702d052bf73ap29 }, + { 0x1.2115447c6627dp25, 0x1.2d65aee08874cp29 }, + { 0x1.fb7d503fc65c8p24, 0x1.08ccb49580d43p29 }, + { 0x1.bd660913b938cp24, 0x1.d13c32a98512bp28 }, + { 0x1.86db66e158524p24, 0x1.98a4bfd5a5fadp28 }, + { 0x1.56f3ed5aa4222p24, 0x1.66e459a7794f4p28 }, + { 0x1.2ce2265a96befp24, 0x1.3b28bbce3c1c6p28 }, + { 0x1.07f14a8d0c116p24, 0x1.14b8b6b67144ep28 }, + { 0x1.cf049ebedf60dp23, 0x1.e5e26dbef0e28p27 }, + { 0x1.96129ca292f7ep23, 0x1.aa854b5c4f131p27 }, + { 0x1.6416763f6b3bcp23, 0x1.765d329106241p27 }, + { 0x1.3837bf030f4a8p23, 0x1.488b9479ee1c4p27 }, + { 0x1.11b82880134f9p23, 0x1.204c8d940530bp27 }, + { 0x1.dfe0c1b8af1f3p22, 0x1.f9e77238e0031p26 }, + { 0x1.a49aa1651cfcap22, 0x1.bbd2c8fd7e193p26 }, + { 0x1.709b5a3a79128p22, 0x1.85502f16a0f8dp26 }, + { 0x1.42ffa7e9ace3fp22, 0x1.5574ceffe3945p26 }, + { 0x1.1affd2eccd616p22, 0x1.2b72182c97af5p26 }, + { 0x1.efd8be43ac9a9p21, 0x1.06925da53a0fcp26 }, + { 0x1.b2564005de7e5p21, 0x1.cc6bb6d71090dp25 }, + { 0x1.7c694cd2b4ffdp21, 0x1.93a02d0c97221p25 }, + { 0x1.4d23fa69bd814p21, 0x1.61cb1a027e057p25 }, + { 0x1.23b556e6e918ep21, 0x1.361358dd1f243p25 }, + { 0x1.fecbcf04dca9p20, 0x1.0fba0d2660d89p25 }, + { 0x1.bf29264dcdc82p20, 0x1.dc2ef387bd0ep24 }, + { 0x1.8767d7fc43eb6p20, 0x1.a130711aadcdap24 }, + { 0x1.568f9937abc79p20, 0x1.6d758e1ac9659p24 }, + { 0x1.2bc67d8c20136p20, 0x1.401abca024479p24 }, + { 0x1.064d4616b0094p20, 0x1.185819a7f8c6ap24 }, + { 0x1.caf8458ad2a12p19, 0x1.eafc2b00a99b1p23 }, + { 0x1.917faff93e54p19, 0x1.ade505ba61e89p23 }, + { 0x1.5f2e79283b1cap19, 0x1.785c00b5cb27ep23 }, + { 0x1.33220b1da4f59p19, 0x1.4973634932c1ap23 }, + { 0x1.0c93ac678b0ccp19, 0x1.205a7d78be568p23 }, + { 0x1.d5aa313452daep18, 0x1.f8b4440d68221p22 }, + { 0x1.9a9b05368c88bp18, 0x1.b9a31a7b9868cp22 }, + { 0x1.66ede7f0c2d55p18, 0x1.826756e1a42e2p22 }, + { 0x1.39b7fc18e5891p18, 0x1.5209676e4b424p22 }, + { 0x1.122b662569616p18, 0x1.27b019965e362p22 }, + { 0x1.df2779ceabfc8p17, 0x1.029ce648133fdp22 }, + { 
0x1.a2a5d2945d2b7p17, 0x1.c45161cd95fe8p21 }, + { 0x1.6dbccf848794ap17, 0x1.8b81d680cdfc5p21 }, + { 0x1.3f79bf21caa96p17, 0x1.59ca24a7521ddp21 }, + { 0x1.17080ae674896p17, 0x1.2e48f266999cfp21 }, + { 0x1.e75b024885f54p16, 0x1.0838b13324d03p21 }, + { 0x1.a98e26924c6c8p16, 0x1.cdd86b83e679dp20 }, + { 0x1.738bf4bc8d296p16, 0x1.93977456406ddp20 }, + { 0x1.445a6a9a273c6p16, 0x1.60a47aca18e96p20 }, + { 0x1.1b1eabeffc3a5p16, 0x1.341669953fe1cp20 }, + { 0x1.ee324e1fde417p15, 0x1.0d210b765b3d6p20 }, + { 0x1.af4465e9c5668p15, 0x1.d622fa53c02cep19 }, + { 0x1.784e3008fb46bp15, 0x1.9a961d6383ef7p19 }, + { 0x1.484eecd2f1383p15, 0x1.66890cd0bf55fp19 }, + { 0x1.1e65fd1ef2701p15, 0x1.390b73f2a4fbp19 }, + { 0x1.f39dc6baaccd7p14, 0x1.114ae59581395p19 }, + { 0x1.b3bb863d26278p14, 0x1.dd1e5296953a3p18 }, + { 0x1.7bf89f052b591p14, 0x1.a06dfa21b6c59p18 }, + { 0x1.4b4e35dbe0cddp14, 0x1.6b6a7a27c9005p18 }, + { 0x1.20d6781986167p14, 0x1.3d1cca3d4f6d8p18 }, + { 0x1.f790f6877f51ep13, 0x1.14acc164c64fep18 }, + { 0x1.b6e93fa7299b3p13, 0x1.e2ba80b9c3a1bp17 }, + { 0x1.7e82cde922833p13, 0x1.a511aa3827999p17 }, + { 0x1.4d515a14a6132p13, 0x1.6f3d9139319edp17 }, + { 0x1.226a790f97768p13, 0x1.404113d7d18e6p17 }, + { 0x1.fa02b8ac73416p12, 0x1.173ed60fcd6fap17 }, + { 0x1.b8c634233722p12, 0x1.e6ea95e92c624p16 }, + { 0x1.7fe6d7fbcef2cp12, 0x1.a8767775dd309p16 }, + { 0x1.4e53acc7531b1p12, 0x1.71f97a2983044p16 }, + { 0x1.231e547065724p12, 0x1.42710a88aab19p16 }, + { 0x1.faed5c4559717p11, 0x1.18fb2ded8ebb1p16 }, + { 0x1.b94e0bfb59934p11, 0x1.e9a4d9b21386ep15 }, + { 0x1.80217e57d8a3fp11, 0x1.aa947efe69879p15 }, + { 0x1.4e52d23cf50bp11, 0x1.7397d8e2bd385p15 }, + { 0x1.22f0652094ae6p11, 0x1.43a79684f6ef6p15 }, + { 0x1.fa4eba730bf6p10, 0x1.19ddbd8138a9p15 }, + { 0x1.b87f86a26fad7p10, 0x1.eae2ef93df996p14 }, + { 0x1.7f323487ff94ap10, 0x1.ab66cfccafb75p14 }, + { 0x1.4d4ec8ea8ee67p10, 0x1.7414e5b5ca43cp14 }, + { 0x1.21e112e39bf18p10, 0x1.43e1e22ebfdb4p14 }, + { 0x1.f8283ec45f117p9, 0x1.19e4732be2ffp14 }, + { 0x1.b65c7f9f1fbedp9, 0x1.eaa1efb3b003ep13 }, + { 0x1.7d1b22b6810f6p9, 0x1.aaeb7de6855e2p13 }, + { 0x1.4b49e984886ep9, 0x1.736f7c0d13f06p13 }, + { 0x1.1ff2d0d5a2649p9, 0x1.431f651be2ff4p13 }, + { 0x1.f47ee1cab73ddp8, 0x1.190f3f39e9af4p13 }, + { 0x1.b2e9e76c8d9f9p8, 0x1.e8e2722ca46cfp12 }, + { 0x1.79e11d635b9a7p8, 0x1.a923a9d8d5019p12 }, + { 0x1.4848ddf7dfffep8, 0x1.71a91ee04e82cp12 }, + { 0x1.1d2a13fdd2709p8, 0x1.4161e6298ed3ap12 }, + { 0x1.ef5b15f73200ap7, 0x1.176014201ab17p12 }, + { 0x1.ae2fb07705cc3p7, 0x1.e5a88cbf394e4p11 }, + { 0x1.758b92cdfdc64p7, 0x1.a6137c537bf6dp11 }, + { 0x1.44528f79b1b51p7, 0x1.6ec5f2d1367f4p11 }, + { 0x1.198d422be3f8cp7, 0x1.3ead7491061afp11 }, + { 0x1.e8c8a7276c93p6, 0x1.14dadee76975ap11 }, + { 0x1.a838b09afcf62p6, 0x1.e0fbc2ec572b9p10 }, + { 0x1.70246e766d2f3p6, 0x1.a1c215fcd0beap10 }, + { 0x1.3f700c0d99876p6, 0x1.6accae115453ep10 }, + { 0x1.1524997d01ap6, 0x1.3b08582357e32p10 }, + { 0x1.e0d68d9047f7ap5, 0x1.118577f06b2f2p10 }, + { 0x1.a11277ca2bd3fp5, 0x1.dae6e8d292a1ep9 }, + { 0x1.69b7f34ec048ep5, 0x1.9c3973d4c9b08p9 }, + { 0x1.39ac6410ceb63p5, 0x1.65c67e684d1e6p9 }, + { 0x1.0ffa110b113fp5, 0x1.367af901b137p9 }, + { 0x1.d796b4f7aaf7fp4, 0x1.0d678c614f535p9 }, + { 0x1.98cd1cb38dccp4, 0x1.d377f96b9fd62p8 }, + { 0x1.62548d6675835p4, 0x1.958648bd6035p8 }, + { 0x1.331480815e7cdp4, 0x1.5fbee5e7590f4p8 }, + { 0x1.0a19336cc73a1p4, 0x1.310fbf558eca2p8 }, + { 0x1.cd1db96a6c6efp3, 0x1.088a80b837328p8 }, + { 0x1.8f7b007e1de49p3, 0x1.cabfe10b3371ap7 }, + { 0x1.5a0a9c047e3c7p3, 0x1.8db7ccf7600f4p7 }, + { 
0x1.2bb6f2dd8e254p3, 0x1.58c38f07b7c3bp7 }, + { 0x1.038ef3cbdc1c7p3, 0x1.2ad2ebb6268bdp7 }, + { 0x1.c1829acfb62b3p2, 0x1.02f94d1fb1ba4p7 }, + { 0x1.85308ad209551p2, 0x1.c0d23d3daadadp6 }, + { 0x1.50ec3549a202dp2, 0x1.84df8496cc3aep6 }, + { 0x1.23a3bf963c1ebp2, 0x1.50e4191e1b76cp6 }, + { 0x1.f8d2fce0ebb41p1, 0x1.23d2690dc7344p6 }, + { 0x1.b4de68e608347p1, 0x1.f980a88588961p5 }, + { 0x1.7a03df8f9f479p1, 0x1.b5c5135a44acbp5 }, + { 0x1.470ce4924af72p1, 0x1.7b10fe1f0aeaap5 }, + { 0x1.1aec242758b4fp1, 0x1.4831de32e25bdp5 }, + { 0x1.e9700b697ec96p0, 0x1.1c1d98f1b1f71p5 }, + { 0x1.a74be9568f922p0, 0x1.ebda6af103d07p4 }, + { 0x1.6e0c8fadbb05p0, 0x1.a9b07f491a273p4 }, + { 0x1.3c8164e42f29cp0, 0x1.70618a9c019dap4 }, + { 0x1.11a259faba91ep0, 0x1.3ebfb36da371bp4 }, + { 0x1.d91518c2acaf6p-1, 0x1.13c51b7852ecp4 }, + { 0x1.98e739a118b5ep-1, 0x1.dd1d36683753bp3 }, + { 0x1.616346ca3be0ep-1, 0x1.9cae5c1f5de61p3 }, + { 0x1.315f58c13df9cp-1, 0x1.64e7f0a95542fp3 }, + { 0x1.07d957435b8c4p-1, 0x1.34a1a5595e9cbp3 }, + { 0x1.c7e35cf4db634p-2, 0x1.0ada93ac2688ep3 }, + { 0x1.89cd6ead31b71p-2, 0x1.cd680d6a376d2p2 }, + { 0x1.542176fe1c2b2p-2, 0x1.8ed9e84be9bacp2 }, + { 0x1.25bd00bd97eddp-2, 0x1.58bc1beb8e117p2 }, + { 0x1.fb491e02b7c15p-3, 0x1.29ecb15514182p2 }, + { 0x1.b5fcd30c7e1f6p-3, 0x1.017069c4b54cfp2 }, + { 0x1.7a1c33cc1922bp-3, 0x1.bcdb33f7b88f9p1 }, + { 0x1.46610483f2395p-3, 0x1.804f671a7a35cp1 }, + { 0x1.19b0f23241b88p-3, 0x1.4bf6ca87a4707p1 }, + { 0x1.e62f62b4555dcp-4, 0x1.1eb67d8a75351p1 }, + { 0x1.a383ca9f98a0fp-4, 0x1.ef3318a5788dep0 }, + { 0x1.69f16aeb3677p-4, 0x1.ab97c2106c4d2p0 }, + { 0x1.383bf2b37a037p-4, 0x1.712bc1550fb6ap0 }, + { 0x1.0d51cf5a16254p-4, 0x1.3eb13a24821e2p0 }, + { 0x1.d08cdac87dce6p-5, 0x1.131510c1da6adp0 }, + { 0x1.909a7c3ac6f99p-5, 0x1.dad26311e9efp-1 }, + { 0x1.596acfa0bcc8fp-5, 0x1.99bf36c7ef068p-1 }, + { 0x1.29cc13bfd53ap-5, 0x1.618c26c1169a6p-1 }, + { 0x1.00b60212cf113p-5, 0x1.3104d5f799552p-1 }, + { 0x1.ba886ae6e40ep-6, 0x1.071e8b6003b16p-1 }, + { 0x1.7d62a282a4851p-6, 0x1.c5e5338097f6bp-2 }, + { 0x1.48a59e9cb1eb1p-6, 0x1.87730de08c821p-2 }, + { 0x1.1b2abc895a771p-6, 0x1.518db221cf8bap-2 }, + { 0x1.e7e6f4c33ededp-7, 0x1.230ae74a714aap-2 }, + { 0x1.a4480db60fe17p-7, 0x1.f5d1c58fdc6acp-3 }, + { 0x1.69fd19aacb90ap-7, 0x1.b091a88a72f08p-3 }, + { 0x1.37be42e1159e7p-7, 0x1.74d459ba38afep-3 }, + { 0x1.0c707db025298p-7, 0x1.414d114bdcde1p-3 }, + { 0x1.ce3ee3757dbe5p-8, 0x1.14dc49cbc0c3p-3 }, + { 0x1.8df06bfb34f6dp-8, 0x1.dd13408401cdcp-4 }, + { 0x1.568986affafc5p-8, 0x1.9afd0eca1593dp-4 }, + { 0x1.26d009f5af049p-8, 0x1.6203633a6814ap-4 }, + { 0x1.fb69c5d6b524ep-9, 0x1.30e632b0008c9p-4 }, + { 0x1.b49c67cd1611fp-9, 0x1.069124dc6eaefp-4 }, + { 0x1.77a47ec4e9fa1p-9, 0x1.c42b48d5cfe42p-5 }, + { 0x1.43260788f0a1fp-9, 0x1.854b792c33d4ap-5 }, + { 0x1.15f4e018a09eep-9, 0x1.4f1f511f7b2d7p-5 }, + { 0x1.de1c72f739a49p-10, 0x1.2073f996519cp-5 }, + { 0x1.9b25dc6d6642ep-10, 0x1.f08155c194aadp-6 }, + { 0x1.61853cc8eddacp-10, 0x1.ab41e011814e5p-6 }, + { 0x1.2feeed430b87bp-10, 0x1.6f9f62ec4193ap-6 }, + { 0x1.05451535e8102p-10, 0x1.3c45d7f9e2fbp-6 }, + { 0x1.c122bcbda7f8ep-11, 0x1.100ffa10ff0f3p-6 }, + { 0x1.81ff0b26f3b6ap-11, 0x1.d401bee3a7787p-7 }, + { 0x1.4bb153d2d0728p-11, 0x1.927ce5fbbe352p-7 }, + { 0x1.1cfe80beb05a4p-11, 0x1.5a195c6e2a08ep-7 }, + { 0x1.e9ae566e02486p-12, 0x1.2992f3c7d2ce7p-7 }, + { 0x1.a4a3297375461p-12, 0x1.ffa47aef63bd2p-8 }, + { 0x1.6948e77b6c537p-12, 0x1.b7ccca35ce88ep-8 }, + { 0x1.3644eed5b1126p-12, 0x1.79ffc3cd6bc92p-8 }, + { 0x1.0a6cd27d913d7p-12, 0x1.44d7c3dca9cc8p-8 }, + { 
0x1.c97f5c053e775p-13, 0x1.1720abf01aa9bp-8 }, + { 0x1.88c0c973b68fcp-13, 0x1.dfa22008cf2c8p-9 }, + { 0x1.512157ee1d8bep-13, 0x1.9c08a63df00dcp-9 }, + { 0x1.215988e86b086p-13, 0x1.61eb258af5a93p-9 }, + { 0x1.f09f2b684fb31p-14, 0x1.2ff68a28f7dc4p-9 }, + { 0x1.aa222a98ba953p-14, 0x1.0506e21782262p-9 }, + { 0x1.6d9b06046eb66p-14, 0x1.c041afe3a1ad2p-10 }, + { 0x1.39a30e3030664p-14, 0x1.80d8271e40929p-10 }, + { 0x1.0d05cd2b64652p-14, 0x1.4a5cc1e67b046p-10 }, + { 0x1.cd740d2318d4dp-15, 0x1.1b8f04bdfa1bfp-10 }, + { 0x1.8bb7603d9828p-15, 0x1.e6b65816f0ff1p-11 }, + { 0x1.534d810db5377p-15, 0x1.a1a7ec86c94fbp-11 }, + { 0x1.22e56de90dc1ap-15, 0x1.665a9398034f1p-11 }, + { 0x1.f2bb06a7069e2p-16, 0x1.336f30c8d3345p-11 }, + { 0x1.ab79b6edb04e1p-16, 0x1.07b7cbf13abf4p-11 }, + { 0x1.6e5b33b150249p-16, 0x1.c461717dacbd8p-12 }, + { 0x1.39f005226a7dbp-16, 0x1.83f56253c12f1p-12 }, + { 0x1.0cfc8192e69bdp-16, 0x1.4cab82baddd6cp-12 }, + { 0x1.cce310b024fd4p-17, 0x1.1d39d04e50424p-12 }, + { 0x1.8acc81455f971p-17, 0x1.e9094beff3587p-13 }, + { 0x1.522570529739fp-17, 0x1.a3308036822dbp-13 }, + { 0x1.219685023e1bep-17, 0x1.67464f8a36affp-13 }, + { 0x1.eff1f945e7f7bp-18, 0x1.33e2c9c277148p-13 }, + { 0x1.a89fa515a2b44p-18, 0x1.07d0b7bb52fc7p-13 }, + { 0x1.6b83bb4ee4348p-18, 0x1.c40cfbd11fd1p-14 }, + { 0x1.372982e2fde1dp-18, 0x1.833ffa698fa8bp-14 }, + { 0x1.0a51297b20ab7p-18, 0x1.4bb29dadf3acp-14 }, + { 0x1.c7d093fb7e463p-19, 0x1.1c147957723bdp-14 }, + { 0x1.8607006600009p-19, 0x1.e6896f5762306p-15 }, + { 0x1.4db1c7b733812p-19, 0x1.a096cc3260668p-15 }, + { 0x1.1d76959a6b622p-19, 0x1.64a7647d3f88ap-15 }, + { 0x1.e858d8b3acc8p-20, 0x1.314deba7bab37p-15 }, + { 0x1.a1a94b14e3d7fp-20, 0x1.0550e92636252p-15 }, + { 0x1.6529df3d1cf1cp-20, 0x1.bf46cd0f972c3p-16 }, + { 0x1.316449a955429p-20, 0x1.7ebd49fbb30eep-16 }, + { 0x1.0517b9e1f89dep-20, 0x1.47796af08285bp-16 }, + { 0x1.be627dddb55d7p-21, 0x1.1827a73755ec7p-16 }, + { 0x1.7d8a7f2a8a2dp-21, 0x1.df49a10ccc568p-17 }, + { 0x1.4613bf000c71dp-21, 0x1.99ee7037b652bp-17 }, + { 0x1.16a45fcb7b882p-21, 0x1.5e9197017791dp-17 }, + { 0x1.dc283bcbe780fp-22, 0x1.2bc40c543e36bp-17 }, + { 0x1.96ca751cac37fp-22, 0x1.004b34180a4a9p-17 }, + { 0x1.5b7cd13179ddep-22, 0x1.b632d58444fadp-18 }, + { 0x1.28cb2cb8b4015p-22, 0x1.768f3e13d3bdcp-18 }, + { 0x1.faedd62dabd96p-23, 0x1.401fa7657909ep-18 }, + { 0x1.b0de982dbf111p-23, 0x1.1190d162109abp-18 }, + { 0x1.7195b2becea19p-23, 0x1.d3803e22a78e4p-19 }, + { 0x1.3b8387eea3f9dp-23, 0x1.8f694ad8ac632p-19 }, + { 0x1.0d521f8291cd6p-23, 0x1.55326d6aac6fap-19 }, + { 0x1.cbb9be9cbac1ep-24, 0x1.236e8d3a9e0e7p-19 }, + { 0x1.8852e54d26542p-24, 0x1.f1ca221c0b98bp-20 }, + { 0x1.4ec36b8fdf428p-24, 0x1.a914b62872bc3p-20 }, + { 0x1.1d9d0055d11dp-24, 0x1.6af2ae42db58p-20 }, + { 0x1.e74cb7ebdea0ap-25, 0x1.35dbe86ed95c7p-20 }, + { 0x1.9fa735b03463ap-25, 0x1.0880cfe68041ep-20 }, + { 0x1.627f6220ca6a9p-25, 0x1.c3847cbf78a3bp-21 }, + { 0x1.2e4d9d8b5b22fp-25, 0x1.81550cf271bfdp-21 }, + { 0x1.01c325e8bb3cp-25, 0x1.48cefa0aac509p-21 }, + { 0x1.b783bc148fcefp-26, 0x1.188ab9ce5fdddp-21 }, + { 0x1.76aa8791eba33p-26, 0x1.dea9996bf1c0fp-22 }, + { 0x1.3f58d390caeecp-26, 0x1.984c7bb9c53ffp-22 }, + { 0x1.10299f255a2cap-26, 0x1.5c3c6ce5f2f75p-22 }, + { 0x1.cfd7e08a13b2p-27, 0x1.28f8faa7c3202p-22 }, + { 0x1.8b368e0429dacp-27, 0x1.fa7304087353p-23 }, + { 0x1.50b2501707be6p-27, 0x1.afca3c464e1d5p-23 }, + { 0x1.1ecf2c897b782p-27, 0x1.701780b38d71ap-23 }, + { 0x1.e891642306feep-28, 0x1.39c08dab159ep-23 }, + { 0x1.a013c6709bdd5p-28, 0x1.0b66dac93672bp-23 }, + { 0x1.624c9a2f2f8fcp-28, 
0x1.c7bde43ebd873p-24 }, + { 0x1.2da83d59392f5p-28, 0x1.84520ec5eb55ap-24 }, + { 0x1.00ce3767b77a8p-28, 0x1.4ad54236cf6b4p-24 }, + { 0x1.b5312d520a3f4p-29, 0x1.19d258cf47194p-24 }, + { 0x1.74191dcab90bcp-29, 0x1.e015665e4efbdp-25 }, + { 0x1.3ca855a30dad5p-29, 0x1.98dc92b26aeap-25 }, + { 0x1.0d71d1069e44fp-29, 0x1.5c29c3e79c162p-25 }, + { 0x1.ca7c7b61a5357p-30, 0x1.28708aaed4d7p-25 }, + { 0x1.86083aaabaf73p-30, 0x1.f8bd2046619b5p-26 }, + { 0x1.4bc21b880f9dep-30, 0x1.ada636f165959p-26 }, + { 0x1.1a28183b0e32p-30, 0x1.6dafa60f704a1p-26 }, + { 0x1.dfe23a6ad4f8bp-31, 0x1.37351629c53cp-26 }, + { 0x1.980956bea8ccp-31, 0x1.08cff68f5874cp-26 }, + { 0x1.5ae767663002ep-31, 0x1.c29ce58c1fc1p-27 }, + { 0x1.26e4fd1165b76p-31, 0x1.7f5772973d16cp-27 }, + { 0x1.f54dde2ba8f56p-32, 0x1.4612c5674eed9p-27 }, + { 0x1.aa0af3e698b26p-32, 0x1.15539e864d70fp-27 }, + { 0x1.6a0956d7d1b63p-32, 0x1.d7ad5cdc3741ep-28 }, + { 0x1.339bd6e517d44p-32, 0x1.9110bc4b50f8cp-28 }, + { 0x1.0554f0943ba8cp-32, 0x1.54fb970dbe54ep-28 }, + { 0x1.bbfac9007ec07p-33, 0x1.21dd98bc7de87p-28 }, + { 0x1.791862715d02fp-33, 0x1.ecc34851c9763p-29 }, + { 0x1.403f77382e654p-33, 0x1.a2ca34863bfcbp-29 }, + { 0x1.0feff2a4fc49p-33, 0x1.63e0d12d4d288p-29 }, + { 0x1.cdc5de1ae8c09p-34, 0x1.2e615f0543e41p-29 }, + { 0x1.8804761a993c4p-34, 0x1.00e4ae934cb56p-29 }, + { 0x1.4cc23eb3b5ffap-34, 0x1.b471c42165f4ap-30 }, + { 0x1.1a6c6c06ea18bp-34, 0x1.72b316e47cc93p-30 }, + { 0x1.df58ab9ae4fcbp-35, 0x1.3ad1e7143aa75p-30 }, + { 0x1.96bd0bd6c9a31p-35, 0x1.0b54bd6a9e23fp-30 }, + { 0x1.59163428fb3a6p-35, 0x1.c5f4a785a88d1p-31 }, + { 0x1.24be8d0138113p-35, 0x1.8162809b8dff6p-31 }, + { 0x1.f09f3c1618809p-36, 0x1.4721b76389525p-31 }, + { 0x1.a53148c3fc482p-36, 0x1.15a6678e0082cp-31 }, + { 0x1.652d1d62b45e1p-36, 0x1.d73f8da963966p-32 }, + { 0x1.2eda549c16ee8p-36, 0x1.8fdeb6a9e8ebcp-32 }, + { 0x1.00c2a84aed164p-36, 0x1.5342fe16e83a5p-32 }, + { 0x1.b3501c0fdbbcfp-37, 0x1.1fcdfea216d16p-32 }, + { 0x1.70f8998ccf075p-37, 0x1.e83eb9bce31c4p-33 }, + { 0x1.38b3a7222dd33p-37, 0x1.9e170e2dbff8cp-33 }, + { 0x1.08fb437656229p-37, 0x1.5f27a9aa5f66p-33 }, + { 0x1.c1085f96d9feep-38, 0x1.29bfa42bc7b76p-33 }, + { 0x1.7c6a3cf1c9dcfp-38, 0x1.f8de2739c95a9p-34 }, + { 0x1.423e65b2a3a8cp-38, 0x1.abfaa7d4233fap-34 }, + { 0x1.10ef40de709bcp-38, 0x1.6ac1833360c58p-34 }, + { 0x1.ce48f9d9e5928p-39, 0x1.336f5ff042b88p-34 }, + { 0x1.8773adc5703cep-39, 0x1.0484d7ff5f6bdp-34 }, + { 0x1.4b6e86a5aa9d8p-39, 0x1.b978904649f57p-35 }, + { 0x1.189488e2e9743p-39, 0x1.760249f31a968p-35 }, + { 0x1.db0100ef385d3p-40, 0x1.3cd13761f1731p-35 }, + { 0x1.9206c1ae9fb29p-40, 0x1.0c569a0b1627cp-35 }, + { 0x1.54382e8081943p-40, 0x1.c67fe1e83e91p-36 }, + { 0x1.1fe13002859cap-40, 0x1.80dbcff1d72cfp-36 }, + { 0x1.e71fde0c5e218p-41, 0x1.45d945dc4844dp-36 }, + { 0x1.9c159bbc9900ap-41, 0x1.13da615eb6c5fp-36 }, + { 0x1.5c8fc931c6d94p-41, 0x1.d2ffe78d87996p-37 }, + { 0x1.26cb8c1920344p-41, 0x1.8b4017551e03bp-37 }, + { 0x1.f295714275bc3p-42, 0x1.4e7bd56b77338p-37 }, + { 0x1.a592ca70605e5p-42, 0x1.1b06621cfb60ep-37 }, + { 0x1.646a234bddd88p-42, 0x1.dee83fc205fc8p-38 }, + { 0x1.2d4a498c21371p-42, 0x1.9521701d324dap-38 }, + { 0x1.fd5235020e009p-43, 0x1.56ad77d8efe38p-38 }, + { 0x1.ae71657ff542ep-43, 0x1.21d11201bfbcfp-38 }, + { 0x1.6bbc82f12468ap-43, 0x1.ea290040397f4p-39 }, + { 0x1.3354802504d9ep-43, 0x1.9e7295f29cf91p-39 }, + { 0x1.03a3b07cf84bp-43, 0x1.5e631fb2a96dbp-39 }, + { 0x1.b6a52af7c7202p-44, 0x1.28313d62cbf4fp-39 }, + { 0x1.727cc024d462ap-44, 0x1.f4b2d92a8da6ap-40 }, + { 0x1.38e1c7590edafp-44, 0x1.a726cda9c5fc4p-40 }, + { 
0x1.083385f1e344cp-44, 0x1.6592390114765p-40 }, + { 0x1.be229b5ed10ebp-45, 0x1.2e1e1bdc1cff3p-40 }, + { 0x1.78a15c33bf0d1p-45, 0x1.fe77379b5869ap-41 }, + { 0x1.3dea49bdca04dp-45, 0x1.af3202215009fp-41 }, + { 0x1.0c5225e967ce3p-45, 0x1.6c30c15ee186bp-41 }, + { 0x1.c4df14833b32ep-46, 0x1.338f646703f05p-41 }, + { 0x1.7e2197e99732ep-46, 0x1.03b4338f71d3bp-41 }, + { 0x1.4266d76b7e9efp-46, 0x1.b688e02001605p-42 }, + { 0x1.0ff9aa4df55cbp-46, 0x1.72355f261c90fp-42 }, + { 0x1.cad0ea9847218p-47, 0x1.387d609c076c8p-42 }, + { 0x1.82f5884a3c4ffp-47, 0x1.07bcd8d61f54dp-42 }, + { 0x1.4650f71159187p-47, 0x1.bd20f0d88c869p-43 }, + { 0x1.1324c9f973607p-47, 0x1.77977767b819cp-43 }, + { 0x1.cfef7f529f1bfp-48, 0x1.3ce0fee10ae91p-43 }, + { 0x1.8716298a66d68p-48, 0x1.0b4fbeda58aa9p-43 }, + { 0x1.49a2f582864b8p-48, 0x1.c2f0b2bc85943p-44 }, + { 0x1.15cee56fb8f8p-48, 0x1.7c4f426570458p-44 }, + { 0x1.d43356b5d1bc3p-49, 0x1.40b3e347db73ap-44 }, + { 0x1.8a7d700826ce3p-49, 0x1.0e67b4f33d066p-44 }, + { 0x1.4c57f38808af9p-49, 0x1.c7efb04c36011p-45 }, + { 0x1.17f41219f6e6ep-49, 0x1.8055de49eb405p-45 }, + { 0x1.d796294cc09e7p-50, 0x1.43f076e4dac86p-45 }, + { 0x1.8d265709c8b81p-50, 0x1.11003322f9f2ap-45 }, + { 0x1.4e6bf1c869176p-50, 0x1.cc169496c493bp-46 }, + { 0x1.199123dce7f7cp-50, 0x1.83a55fe01c77fp-46 }, + { 0x1.da12f38ef6065p-51, 0x1.4691f56a0b9d1p-46 }, + { 0x1.8f0ced10d0db4p-51, 0x1.131565242338p-46 }, + { 0x1.4fdbda9c9106cp-51, 0x1.cf5f3d25346p-47 }, + { 0x1.1aa3b4e8f3caap-51, 0x1.8638e1112031dp-47 }, + { 0x1.dba6023e1257ap-52, 0x1.489478d82c425p-47 }, + { 0x1.902e5d96b5dc7p-52, 0x1.14a433d21a4e2p-47 }, + { 0x1.50a589affacc9p-52, 0x1.d1c4c912f9acbp-48 }, + { 0x1.1b2a2ba958505p-52, 0x1.880c8cf6ecf16p-48 }, + { 0x1.dc4cfb90a7ce5p-53, 0x1.49f5031dc194p-48 }, + { 0x1.9088f811b7254p-53, 0x1.15aa4ccc2f79bp-48 }, + { 0x1.50c7d151d73dp-53, 0x1.d343a5202c7c4p-49 }, + { 0x1.1b23bebdcda6dp-53, 0x1.891da95a3a6f5p-49 }, + { 0x1.dc06e50abd949p-54, 0x1.4ab18582d9df2p-49 }, + { 0x1.901c34297491p-54, 0x1.1626283914e64p-49 }, + { 0x1.50427d64b1c7dp-54, 0x1.d3d994938f3adp-50 }, + { 0x1.1a9076f0d2e24p-54, 0x1.896a9d7ab89b1p-50 }, + { 0x1.dad425efa38efp-55, 0x1.4ac8e5c7c8723p-50 }, + { 0x1.8ee8b30ca2586p-55, 0x1.16170c969f828p-50 }, + { 0x1.4f1653e256f41p-55, 0x1.d385b6cd88b32p-51 }, + { 0x1.19712f23cae3dp-55, 0x1.88f2f609fe4d3p-51 }, + { 0x1.d8b686448b5afp-56, 0x1.4a3b00e506616p-51 }, + { 0x1.8cf03de32b406p-56, 0x1.157d10888e2f3p-51 }, + { 0x1.4d4512f22a65dp-56, 0x1.d2488978a2f74p-52 }, + { 0x1.17c7923127a39p-56, 0x1.87b7664b4e00cp-52 }, + { 0x1.d5b12a674c804p-57, 0x1.4908ab62a09acp-52 }, + { 0x1.8a35c1621f2ccp-57, 0x1.14591aa0080cap-52 }, + { 0x1.4ad16c988b007p-57, 0x1.d023e74fea7e1p-53 }, + { 0x1.159616cbf8a0cp-57, 0x1.85b9c65443c51p-53 }, + { 0x1.d1c88b489c5c3p-58, 0x1.4733af4601fe1p-53 }, + { 0x1.86bd4690c0845p-58, 0x1.12acdf1c9738cp-53 }, + { 0x1.47bf000e37ae9p-58, 0x1.cd1b037f7490bp-54 }, + { 0x1.12dff96b26d81p-58, 0x1.82fd0e7486194p-54 }, + { 0x1.cd026b64a0ca8p-59, 0x1.44bec79d5416cp-54 }, + { 0x1.828be8d7b2e74p-59, 0x1.107adbae7661dp-54 }, + { 0x1.441250d6b8cc7p-59, 0x1.c93261af2cd0dp-55 }, + { 0x1.0fa934555eb5ap-59, 0x1.7f854fd47e7d3p-55 }, + { 0x1.c765c89feb632p-60, 0x1.41ad99b7fc9ebp-55 }, + { 0x1.7da7c97c8ea4bp-60, 0x1.0dc65148f57fcp-55 }, + { 0x1.3fd0bbb47d67cp-60, 0x1.c46fcad39a071p-56 }, + { 0x1.0bf675e9015a3p-60, 0x1.7b57aa64c1e42p-56 }, + { 0x1.c0facb396944ap-61, 0x1.3e04ac23c3f11p-56 }, + { 0x1.781800b4c5862p-61, 0x1.0a933c1a65e31p-56 }, + { 0x1.3b0069a07f02dp-61, 0x1.beda3eeb5f0a2p-57 }, + { 0x1.07cd15415698ap-61, 
0x1.767a404101f5ap-57 }, + { 0x1.b9cab20b7b4acp-62, 0x1.39c95b8dcd835p-57 }, + { 0x1.71e48c82b190ap-62, 0x1.06e649c54a11dp-57 }, + { 0x1.35a840f1bb9bfp-62, 0x1.b879e3daa485dp-58 }, + { 0x1.0333055f872d1p-62, 0x1.70f426b1f5c67p-58 }, + { 0x1.b1dfbc5f13465p-63, 0x1.3501cdad9df5bp-58 }, + { 0x1.6b163d96b3dd9p-63, 0x1.02c4cdfc5722cp-58 }, + { 0x1.2fcfd4e6913cap-63, 0x1.b157f19f267eap-59 }, + { 0x1.fc5d8e0519af3p-64, 0x1.6acd55017e4e2p-59 }, + { 0x1.a945119b38a65p-64, 0x1.2fb4e266d3e9fp-59 }, + { 0x1.63b6a2745bde1p-64, 0x1.fc696b5025168p-60 }, + { 0x1.297f53c6e927fp-64, 0x1.a97e9c202c067p-60 }, + { 0x1.f18eb2ba6357fp-65, 0x1.640e915b3f3eap-60 }, + { 0x1.a006a7219c6a4p-65, 0x1.29ea2353deb28p-60 }, + { 0x1.5bcff1208eb99p-65, 0x1.f278f182d5ccep-61 }, + { 0x1.22bf73da1838dp-65, 0x1.a0f8fae51588p-61 }, + { 0x1.e60853b8b4b65p-66, 0x1.5cc15bf9dbbbbp-61 }, + { 0x1.963124add21cp-66, 0x1.23a9b1f0c9515p-61 }, + { 0x1.536cefa1810b4p-66, 0x1.e7c6162103b4ep-62 }, + { 0x1.1b995f6e584afp-66, 0x1.97d2ef035140ap-62 }, + { 0x1.d9da06644bc9dp-67, 0x1.54efd8e5e8a15p-62 }, + { 0x1.8bd1c79049ec2p-67, 0x1.1cfc34a10ee47p-62 }, + { 0x1.4a98db9bff0e8p-67, 0x1.dc5f9803d5324p-63 }, + { 0x1.1416a031bacf2p-67, 0x1.8e1907994f8d3p-63 }, + { 0x1.cd13f7b7c3414p-68, 0x1.4ca4b88f6234cp-63 }, + { 0x1.80f645203dff7p-68, 0x1.15eac2ce52257p-63 }, + { 0x1.415f515af2672p-68, 0x1.d054eb8db2ad5p-64 }, + { 0x1.0c410a1d6b3cap-68, 0x1.83d8652f7235cp-64 }, + { 0x1.bfc6c8b2d1c95p-69, 0x1.43eb1f8cfdcf1p-64 }, + { 0x1.75acacc068ebep-69, 0x1.0e7ed05fb3af3p-64 }, + { 0x1.37cc328e513e5p-69, 0x1.c3b617ec3cfd6p-65 }, + { 0x1.0422a6340a512p-69, 0x1.791e9c59e2b42p-65 }, + { 0x1.b2036a988beadp-70, 0x1.3ace8dce03fbdp-65 }, + { 0x1.6a0349d192d1ap-70, 0x1.06c218ca5f25ap-65 }, + { 0x1.2deb8d0dae905p-70, 0x1.b69393c895b87p-66 }, + { 0x1.f78b3aa5bebbep-71, 0x1.6df997f6bab1bp-66 }, + { 0x1.a3dafb67a96cfp-71, 0x1.315ac58b7d6b7p-66 }, + { 0x1.5e0885ebd9cc3p-71, 0x1.fd7d13f78002dp-67 }, + { 0x1.23c981e88b022p-71, 0x1.a8fe21d205ebp-67 }, + { 0x1.e66846a73c925p-72, 0x1.62777b62fde0cp-67 }, + { 0x1.955ea2f392221p-72, 0x1.279bb2446baf4p-67 }, + { 0x1.51cacbb42476ep-72, 0x1.ecfc5eb955129p-68 }, + { 0x1.19722d0b598a4p-72, 0x1.9b06ad8cbcafbp-68 }, + { 0x1.d4f0c5733dbc9p-73, 0x1.56a684fe99fcap-68 }, + { 0x1.869f70ffc1fcbp-73, 0x1.1d9d500e92622p-68 }, + { 0x1.45586a9e82938p-73, 0x1.dc163a555fefbp-69 }, + { 0x1.0ef18dbc017ffp-73, 0x1.8cbe28ca7c426p-69 }, + { 0x1.c338d2435fb4bp-74, 0x1.4a94f1540c9eap-69 }, + { 0x1.77ae3cb88b469p-74, 0x1.136b93820fc76p-69 }, + { 0x1.38bf7be87e681p-74, 0x1.cadeb8c3bba05p-70 }, + { 0x1.0453702b9a5bbp-74, 0x1.7e356a2db5e15p-70 }, + { 0x1.b154294e891dap-75, 0x1.3e50df3387f95p-70 }, + { 0x1.689b85dc875b1p-75, 0x1.09125281c373ap-70 }, + { 0x1.2c0dc90fab5bap-75, 0x1.b969aedac7779p-71 }, + { 0x1.f346b0aa94647p-76, 0x1.6f7d0d10edd84p-71 }, + { 0x1.9f5604d9610bp-76, 0x1.31e8350b95daep-71 }, + { 0x1.597757e14e4e8p-76, 0x1.fd3a5c3ac18bbp-72 }, + { 0x1.1f50b401397f7p-76, 0x1.a7ca8fa24018p-72 }, + { 0x1.ddd8dcb76e388p-77, 0x1.60a5532471804p-72 }, + { 0x1.8d50fcdd2a012p-77, 0x1.256887c26e498p-72 }, + { 0x1.4a512f5483d32p-77, 0x1.e82efb884fa7p-73 }, + { 0x1.129521372a709p-77, 0x1.961449f1f5f93p-73 }, + { 0x1.c872d91eff745p-78, 0x1.51be080b9d49dp-73 }, + { 0x1.7b56e9895b756p-78, 0x1.18df034ba2c47p-73 }, + { 0x1.3b37e1b01d1bdp-78, 0x1.d31877f1753bap-74 }, + { 0x1.05e763ef1c6e1p-78, 0x1.845928aac023dp-74 }, + { 0x1.b3291e83a6ddap-79, 0x1.42d6673958cf7p-74 }, + { 0x1.6978c8d7d61b8p-79, 0x1.0c58552d896bdp-74 }, + { 0x1.2c3987ce2b431p-79, 0x1.be0be95f0126ep-75 }, 
+ { 0x1.f2a6593b4ee39p-80, 0x1.72aab5cc51918p-75 }, + { 0x1.9e0f0cfd57ab4p-80, 0x1.33fd04413c4e8p-75 }, + { 0x1.57c6a75ebbd36p-80, 0x1.ffc132424c87ap-76 }, + { 0x1.1d636b1da2b46p-80, 0x1.a91d6af35687bp-76 }, + { 0x1.d9c6f3705063cp-81, 0x1.6119a09e14fe5p-76 }, + { 0x1.8936d384f421ap-81, 0x1.253fb5c838ba6p-76 }, + { 0x1.464f8c7e074fcp-81, 0x1.e7068fdcaeb4ep-77 }, + { 0x1.0ec1f5aebc21fp-81, 0x1.945fff2eb1b17p-77 }, + { 0x1.c14515cb6f8fp-82, 0x1.4fb5a7146299ap-77 }, + { 0x1.74b15b6eeceb1p-82, 0x1.16ab8334ccb0ap-77 }, + { 0x1.352169fa33216p-82, 0x1.ce965139dad89p-78 }, + { 0x1.0060a522d6818p-82, 0x1.7fe578074e0c8p-78 }, + { 0x1.a933ad3e37ea3p-83, 0x1.3e8d828e807b4p-78 }, + { 0x1.608e37fe916b7p-83, 0x1.084c9533fea9dp-78 }, + { 0x1.24490f08ca22dp-83, 0x1.b68488148e38cp-79 }, + { 0x1.e4940102c0a26p-84, 0x1.6bbe630bdc58cp-79 }, + { 0x1.91a40479b1837p-84, 0x1.2daed7fd23569p-79 }, + { 0x1.4cdb9a0d20ef7p-84, 0x1.f45c523b5ec4ep-80 }, + { 0x1.13d21ec7ce7a5p-84, 0x1.9ee3b5d440d2p-80 }, + { 0x1.c90f21d2d475fp-85, 0x1.57f9f997e1f52p-80 }, + { 0x1.7aa5b8d4b4359p-85, 0x1.1d262b74c69e4p-80 }, + { 0x1.39a647b21bed6p-85, 0x1.d8b50e711660ap-81 }, + { 0x1.03c70a0dadb1dp-85, 0x1.87c4bc616ed3dp-81 }, + { 0x1.ae43ba1c85bb1p-86, 0x1.44a615135e868p-81 }, + { 0x1.6446b3db12c58p-86, 0x1.0cfed72363bb7p-81 }, + { 0x1.26f997cdc041dp-86, 0x1.bdb5f7a82d0f4p-82 }, + { 0x1.e86218ea3e6acp-87, 0x1.7136d3b897e11p-82 }, + { 0x1.9440cec9f5e3ap-87, 0x1.31cf2729ac24dp-82 }, + { 0x1.4e93295651e9bp-87, 0x1.fa860b2bf75f8p-83 }, + { 0x1.14df714b2cc27p-87, 0x1.a36fa64c5b19fp-83 }, + { 0x1.ca3058fde005fp-88, 0x1.5b478418ed951p-83 }, + { 0x1.7b135dc219792p-88, 0x1.1f8035d726d41p-83 }, + { 0x1.3995999427ba7p-88, 0x1.dbf75e60682c2p-84 }, + { 0x1.03604de581436p-88, 0x1.89f0afa1deecap-84 }, + { 0x1.ad067d36fa2c8p-89, 0x1.4602a49df0a52p-84 }, + { 0x1.62c6642f5d4b9p-89, 0x1.0dc2db21eaf21p-84 }, + { 0x1.2556d7a42568ap-89, 0x1.be61355e30a98p-85 }, + { 0x1.e5068065139bep-90, 0x1.7145a7dd1cf8cp-85 }, + { 0x1.90efd5cd13c3p-90, 0x1.31725e0702649p-85 }, + { 0x1.4b62e9374c452p-90, 0x1.f93e90900fd6bp-86 }, + { 0x1.11de133cc6916p-90, 0x1.a1d0c10ff74dfp-86 }, + { 0x1.c49bf95c5f745p-91, 0x1.597928f3e0c7p-86 }, + { 0x1.75f56ab48bd89p-91, 0x1.1d9f316556fccp-86 }, + { 0x1.34f00cbd8ea42p-91, 0x1.d8389849eaf01p-87 }, + { 0x1.fe61cbe17950dp-92, 0x1.8650e1db268ebp-87 }, + { 0x1.a589caf82618cp-92, 0x1.4293ddcb013c1p-87 }, + { 0x1.5c1e107375834p-92, 0x1.0a90025fd130cp-87 }, + { 0x1.1f7319c565581p-92, 0x1.b87eb911fc5efp-88 }, + { 0x1.daa6c6af5c17fp-93, 0x1.6bea387f6b0ap-88 }, + { 0x1.87d63120a742cp-93, 0x1.2c9c915a28ddap-88 }, + { 0x1.436e80df031fp-93, 0x1.f094496a5e827p-89 }, + { 0x1.0aef9bffa708dp-93, 0x1.9a19446f657ccp-89 }, + { 0x1.b890579385cdcp-94, 0x1.52a33b4b8094cp-89 }, + { 0x1.6b84ffdb5d885p-94, 0x1.179841589cdp-89 }, + { 0x1.2be9773700384p-94, 0x1.cda2d93f291abp-90 }, + { 0x1.eecef0206652cp-95, 0x1.7d0e0e7cac5bp-90 }, + { 0x1.9821029662ccfp-95, 0x1.3a804f20fd2f4p-90 }, + { 0x1.5097c74b3d08ep-95, 0x1.038a34010e13fp-90 }, + { 0x1.158fcf12f6c8ep-95, 0x1.ac508371be502p-91 }, + { 0x1.c9b60c296975dp-96, 0x1.61608ea10db83p-91 }, + { 0x1.7958bc88e6006p-96, 0x1.2383e3bce375p-91 }, + { 0x1.370dfa8e149d1p-96, 0x1.e0e820ef7463p-92 }, + { 0x1.0060a594f59c7p-96, 0x1.8c9f67fa9c048p-92 }, + { 0x1.a6925bee98d74p-97, 0x1.471203b047e85p-92 }, + { 0x1.5c351b499632p-97, 0x1.0dae92b93887p-92 }, + { 0x1.1ee518d278c58p-97, 0x1.bcabf2ba981bfp-93 }, + { 0x1.d8b2f8b0b2924p-98, 0x1.6e8f25135d13fp-93 }, + { 0x1.855f0a34582a6p-98, 0x1.2e219acb023aep-93 }, + { 0x1.40b1881e58e3p-98, 
0x1.f1fe817902cebp-94 }, + { 0x1.0818d80634105p-98, 0x1.9a5d5233d8e13p-94 }, + { 0x1.b2ecbb2e8d76cp-99, 0x1.521d0766f8b85p-94 }, + { 0x1.6614d9da549fbp-99, 0x1.168c985c93c95p-94 }, + { 0x1.26c7736a63e7fp-99, 0x1.cae6809d7d445p-95 }, + { 0x1.e546a107b57d5p-100, 0x1.79f71edd3cb51p-95 }, + { 0x1.8f64020effd9cp-100, 0x1.37443c37e4835p-95 }, + { 0x1.48aa64075b15p-100, 0x1.004e8297ce819p-95 }, + { 0x1.0e6e891142764p-100, 0x1.a60ceba01346ap-96 }, + { 0x1.bcfa525d16889p-101, 0x1.5b71dfbe662f9p-96 }, + { 0x1.6e0be1ed4e4ccp-101, 0x1.1dfe04c5b884ap-96 }, + { 0x1.2d14568fa3103p-101, 0x1.d6c299b6b03dep-97 }, + { 0x1.ef39c9c67da7p-102, 0x1.8366f8264d161p-97 }, + { 0x1.973b86e9a718fp-102, 0x1.3ec401194be5fp-97 }, + { 0x1.4ed55e6d4d5dfp-102, 0x1.0641ea45be131p-97 }, + { 0x1.1345b1de4a541p-102, 0x1.af7b06dd7c2fap-98 }, + { 0x1.c48e8cf8e20edp-103, 0x1.62e7924beab28p-98 }, + { 0x1.73f6cd7db5a56p-103, 0x1.23e2123cac1dcp-98 }, + { 0x1.31afb2e91937bp-103, 0x1.e00be39adba8fp-99 }, + { 0x1.f6600b76754fcp-104, 0x1.8ab4ee2717624p-99 }, + { 0x1.9cc2881babafp-104, 0x1.447fa5b4e25fep-99 }, + { 0x1.5316d5b010b17p-104, 0x1.0abf02c055867p-99 }, + { 0x1.1688993cfebe3p-104, 0x1.b67d9f35f4de8p-100 }, + { 0x1.c98758b0a4ebap-105, 0x1.685ccfe1e2ab5p-100 }, + { 0x1.77baf72da4868p-105, 0x1.281e65593d67p-100 }, + { 0x1.3484c1e2418cbp-105, 0x1.e698bd1000fd2p-101 }, + { 0x1.fa991c211034p-106, 0x1.8fc0326c87b11p-101 }, + { 0x1.9fe006460b912p-106, 0x1.485d5ed97243ep-101 }, + { 0x1.555b844a27ecdp-106, 0x1.0db191585c5a2p-101 }, + { 0x1.182875c9f3984p-106, 0x1.baf50ff65044dp-102 }, + { 0x1.cbce2423a80acp-107, 0x1.6bb8ebe73c54ap-102 }, + { 0x1.794741d4d28c6p-107, 0x1.2a9fd1221e357p-102 }, + { 0x1.3586a18110b0ep-107, 0x1.ea4b746dbeae3p-103 }, + { 0x1.fbd1c1dcb3991p-108, 0x1.9271dfe5687e7p-103 }, + { 0x1.a085cf5d6c87ep-108, 0x1.4a4b9ae2c857dp-103 }, + { 0x1.559911f8b7812p-108, 0x1.0f0c2d578f06ap-103 }, + { 0x1.181ddd71c27fbp-108, 0x1.bccd0201398bap-104 }, + { 0x1.cb5889458c00ep-109, 0x1.6cec95dfef21ap-104 }, + { 0x1.789499da6bff1p-109, 0x1.2b5ae7721763fp-104 }, + { 0x1.34b0b5ddf82c6p-109, 0x1.eb1327842cc63p-105 }, + { 0x1.fa04646636ebep-110, 0x1.92bda7bca05b7p-105 }, + { 0x1.9eb0ea42d451ep-110, 0x1.4a4186866270ap-105 }, + { 0x1.53ce6234f7db7p-110, 0x1.0ec8a57831ec5p-105 }, + { 0x1.1668fdbb007d5p-110, 0x1.bbfd05e1b64f3p-106 }, + { 0x1.c8289c5fd0187p-111, 0x1.6bf24d893426cp-106 }, + { 0x1.75a62b0407aefp-111, 0x1.2a4c4fb42b862p-106 }, + { 0x1.3206cc37b0e4ap-111, 0x1.e8ec43d273fbap-107 }, + { 0x1.f53937c26236ep-112, 0x1.90a22ee0d506ep-107 }, + { 0x1.9a69ad7793258p-112, 0x1.483f4fee6553cp-107 }, + { 0x1.50039cbf56e41p-112, 0x1.0ce82f0139653p-107 }, + { 0x1.13119a81ee824p-112, 0x1.b888d3fea2a71p-108 }, + { 0x1.c24cdc6a6909bp-113, 0x1.68ce8cbb7eaebp-108 }, + { 0x1.7089487e1182ep-113, 0x1.2778e05f0f826p-108 }, + { 0x1.2d94fe2dcd5a4p-113, 0x1.e3e0a1bcb7b9p-109 }, + { 0x1.ed85fe218f015p-114, 0x1.8c29185861611p-109 }, + { 0x1.93c37ffa2be3p-114, 0x1.444e2559eb861p-109 }, + { 0x1.4a49efe08b764p-114, 0x1.09735c9244f77p-109 }, + { 0x1.0e26d33274acdp-114, 0x1.b28030446d467p-110 }, + { 0x1.b9dfc560135fp-115, 0x1.638fa554a9791p-110 }, + { 0x1.6955081ac80b2p-115, 0x1.22ed7a20d2031p-110 }, + { 0x1.276f565251c73p-115, 0x1.dc07399fb9ebdp-111 }, + { 0x1.e30d639687648p-116, 0x1.8566bbf3afdccp-111 }, + { 0x1.8adc46e842374p-116, 0x1.3e7fef514c8f7p-111 }, + { 0x1.42bb0eedd3fb2p-116, 0x1.0479dd0162987p-111 }, + { 0x1.07beb0edff1b8p-116, 0x1.a9fe7272a642bp-112 }, + { 0x1.af070915be74ep-117, 0x1.5c4d5495043b3p-112 }, + { 0x1.602994f04daa5p-117, 0x1.1cbea64272b5fp-112 
},
+ { 0x1.1fb139d7ad13p-117, 0x1.d18375dee0b86p-113 },
+ { 0x1.d5fdfa65dd70dp-118, 0x1.7c798c690caf6p-113 },
+ { 0x1.7fdb85ec65bd4p-118, 0x1.36eec953c25e3p-113 },
+ { 0x1.39787263ebbcap-118, 0x1.fc2409fc1812ep-114 },
+ { 0x1.ffeb0495cc103p-119, 0x1.9f29b80329143p-114 },
+ { 0x1.a1f276c1aeb71p-119, 0x1.5328106ecc8f8p-114 },
+ { 0x1.552f40714fe54p-119, 0x1.1507fc4d2f4bap-114 },
+ { 0x1.167c9d827337cp-119, 0x1.c484291d11ffp-115 },
+ { 0x1.c690e28b6a9bfp-120, 0x1.7189333483e3bp-115 },
+ { 0x1.72f13b97db104p-120, 0x1.2dbc3e931f24dp-115 },
+ { 0x1.2eaa616a9b21cp-120, 0x1.ecb050b3055ap-116 },
+ { 0x1.edda16b7edc87p-121, 0x1.9231c8255bcdbp-116 },
+ { 0x1.92da9c960076ap-121, 0x1.4848161f4e509p-116 },
+ { 0x1.48955baf138afp-121, 0x1.0beb55467080ap-116 },
+ { 0x1.0bf90e157d9dap-121, 0x1.b542338309321p-117 },
+ { 0x1.b5082a5d8de09p-122, 0x1.64c56b8fb3cecp-117 },
+ { 0x1.6454856772fedp-122, 0x1.231052b5f7dd6p-117 },
+ { 0x1.227ecea87251dp-122, 0x1.dadb937ed07ebp-118 },
+ { 0x1.d99724acabf71p-123, 0x1.834eb55a1d18ep-118 },
+ { 0x1.81ff31715569ap-123, 0x1.3bdc43dd8955fp-118 },
+ { 0x1.3a90e48619574p-123, 0x1.018fd4cd15479p-118 },
+ { 0x1.005296113b586p-123, 0x1.a3fee5158c03fp-119 },
+ { 0x1.a1acf8c750894p-124, 0x1.5664a8518a142p-119 },
+ { 0x1.54421936100c1p-124, 0x1.171860917e7c8p-119 },
+ { 0x1.152813e135602p-124, 0x1.c6f152728fb8fp-120 },
+ { 0x1.c375a4cba7b23p-125, 0x1.72bf4ab4db677p-120 },
+ { 0x1.6fa5568fa20f3p-125, 0x1.2e18c95c4bfb1p-120 },
+ { 0x1.2b5b13ef0805cp-125, 0x1.ec41a3d4cf576p-121 },
+ { 0x1.e77117811a7d2p-126, 0x1.91022d83bf8f5p-121 },
+ { 0x1.8ccd934db2cbp-126, 0x1.46a292659269ep-121 },
+ { 0x1.42faa33070d2ap-126, 0x1.0a05da41d6048p-121 },
+ { 0x1.06db98d7f6125p-126, 0x1.b14375f322de2p-122 },
+ { 0x1.abcdbdfcc9f7cp-127, 0x1.60c75486158bp-122 },
+ { 0x1.5c15c23fbb403p-127, 0x1.1f35bc35fb59fp-122 },
+ { 0x1.1b2fdb7cab6dfp-127, 0x1.d39954e0a9d3dp-123 },
+ { 0x1.ccb8a64624f6cp-128, 0x1.7c98ab66270f5p-123 },
+ { 0x1.76bb52e82b59ap-128, 0x1.35be6eb898758p-123 },
+ { 0x1.30c117f001ac3p-128, 0x1.f819edd38db9cp-124 },
+ { 0x1.efa0e49e3feccp-129, 0x1.9a2821242ebdp-124 },
+ { 0x1.92fa046d58d4ep-129, 0x1.4dadd528d6ea9p-124 },
+ { 0x1.479ae4e865feep-129, 0x1.0f6d9e092345cp-124 },
+ { 0x1.0a4c603089f16p-129, 0x1.b987187720ae4p-125 },
+ { 0x1.b0e03e96a5485p-130, 0x1.6711ad9310ce1p-125 },
+ { 0x1.5fc89a9e03199p-130, 0x1.23f97aea9f29fp-125 },
+ { 0x1.1dd90a3522c75p-130, 0x1.dac6b554960ffp-126 },
+ { 0x1.d07c0b8b30398p-131, 0x1.81f77dc55f2bdp-126 },
+ { 0x1.795540ea5dda7p-131, 0x1.39bb36d1a51dap-126 },
+ { 0x1.327f191dd6247p-131, 0x1.fdf7c425dfb89p-127 },
+ { 0x1.f1db008e061d6p-132, 0x1.9e6c7f42ee3ap-127 },
+ { 0x1.944b7c8850269p-132, 0x1.50bd38f4b0e14p-127 },
+ { 0x1.4846e1e475567p-132, 0x1.11954fcd9d596p-127 },
+ { 0x1.0a8512d6deebp-132, 0x1.bc7d8a23288e1p-128 },
+ { 0x1.b0b57b848dfd5p-133, 0x1.69099571fea27p-128 },
+ { 0x1.5f385601a1095p-133, 0x1.25378a982372p-128 },
+ { 0x1.1d0aee3f21eaep-133, 0x1.dc36feecfa2bap-129 },
+ { 0x1.ce9ce0f1b56b8p-134, 0x1.82a9fb7ad076bp-129 },
+ { 0x1.775af322a6fb6p-134, 0x1.39ea243c7bf71p-129 },
+ { 0x1.3084e2fb958e5p-134, 0x1.fda4af81b306ap-130 },
+ { 0x1.ee0aaff5c7275p-135, 0x1.9da7a2c5ab52cp-130 },
+ { 0x1.90b5b261712acp-135, 0x1.4fb44aa933f5cp-130 },
+ { 0x1.44f853ca3d2a1p-135, 0x1.1068e39733d5fp-130 },
+ { 0x1.07839b24e2329p-135, 0x1.ba0b385a9673fp-131 },
+ { 0x1.ab4ef712ea53cp-136, 0x1.669cb88b98bb4p-131 },
+ { 0x1.5a6a27edc2aafp-136, 0x1.22e458ff074e2p-131 },
+ { 0x1.18ccfb2383c0dp-136, 0x1.d7dccacf16bdfp-132 },
+ { 0x1.c72c7d427b5c7p-137, 0x1.7ea9a57d9c3fdp-132 },
+ { 0x1.70debd3477d7cp-137, 0x1.364981b4fcaccp-132 },
+ { 0x1.2ae4c8505c4dcp-137, 0x1.f723b60a4c45ap-133 },
+ { 0x1.e45347f37826dp-138, 0x1.97e0b5db827a8p-133 },
+ { 0x1.8859d9d834871p-138, 0x1.4a9cae44d02aap-133 },
+ { 0x1.3dcdd6f53a761p-138, 0x1.0bf347561e06fp-133 },
+ { 0x1.0163c7a1b8ce3p-138, 0x1.b246ea577dcd5p-134 },
+ { 0x1.a0de9e4d0326ap-139, 0x1.5fe1a8f2ffd47p-134 },
+ { 0x1.518a7407eb90ep-139, 0x1.1d15869af1a46p-134 },
+ { 0x1.1146574533e59p-139, 0x1.cde08f63664fdp-135 },
+ { 0x1.ba6f77161f191p-140, 0x1.761ba88bf6eedp-135 },
+ { 0x1.661c59f17faep-140, 0x1.2efafc89163c3p-135 },
+ { 0x1.21d2894bdd4c7p-140, 0x1.eab12c8aa7e5p-136 },
+ { 0x1.d50e0eba3e44dp-141, 0x1.8d4d432dee077p-136 },
+ { 0x1.7b84a5753cf1fp-141, 0x1.41a589d11cb19p-136 },
+ { 0x1.33091416396dbp-141, 0x1.045db9ec2ba81p-136 },
+ { 0x1.f0bb3ff173143p-142, 0x1.a57861242277fp-137 },
+ { 0x1.91c3cacc75aaap-142, 0x1.551681b8d361p-137 },
+ { 0x1.44ea256a84bbp-142, 0x1.140098b38820cp-137 },
+ { 0x1.06bb841410434p-142, 0x1.be9e2feb561ep-138 },
+ { 0x1.a8d98b0d5771p-143, 0x1.694e9fdcb7be5p-138 },
+ { 0x1.57755a2313bdfp-143, 0x1.24419d9ce37ffp-138 },
+ { 0x1.15a03d39bca43p-143, 0x1.d8bf1578b3aacp-139 },
+ { 0x1.c0c4e9f387792p-144, 0x1.7e4dfe2cee6a2p-139 },
+ { 0x1.6aa9b63079411p-144, 0x1.3520b0bf08a51p-139 },
+ { 0x1.250ad98a67e4fp-144, 0x1.f3daa3dd37f3ap-140 },
+ { 0x1.d9842421f4af1p-145, 0x1.94140b3abb78ep-140 },
+ { 0x1.7e859d0226582p-145, 0x1.469d2facc66f7p-140 },
+ { 0x1.34f9e5d4c96d3p-145, 0x1.07f7c6b04c092p-140 },
+ { 0x1.f314a5f5af6d7p-146, 0x1.aa9f80ec12e52p-141 },
+ { 0x1.9306ca687d568p-146, 0x1.58b5e63278412p-141 },
+ { 0x1.456b681315dafp-146, 0x1.167dcc97a0fd3p-141 },
+ { 0x1.06b98180e66fp-146, 0x1.c1ee5bab4ede7p-142 },
+ { 0x1.a82a4c036e3f3p-147, 0x1.6b69077bfc3c7p-142 },
+ { 0x1.565cda5d05a6ap-147, 0x1.257dcc5bc2717p-142 },
+ { 0x1.144d77262f022p-147, 0x1.d9fdd2296338fp-143 },
+ { 0x1.bdec7b50a66cp-148, 0x1.7eb427b4ddd71p-143 },
+ { 0x1.67cb265d8483ap-148, 0x1.34f5aee91217p-143 },
+ { 0x1.224399b226996p-148, 0x1.f2ca4dc8ff69fp-144 },
+ { 0x1.d448f86c23d12p-149, 0x1.92943634830d2p-144 },
+ { 0x1.79b2a15ae0faap-149, 0x1.44e2d8e947442p-144 },
+ { 0x1.3098d833c2dap-149, 0x1.0627b1e47c261p-144 },
+ { 0x1.eb3aa595948f3p-150, 0x1.a705784809825p-145 },
+ { 0x1.8c0f08dff4e68p-150, 0x1.554226cd542efp-145 },
+ { 0x1.3f49a8880f6adp-150, 0x1.1343e7a202e9p-145 },
+ { 0x1.015dd1c62a082p-150, 0x1.bc0384ab3550dp-146 },
+ { 0x1.9edb80143a705p-151, 0x1.660fe966c4e28p-146 },
+ { 0x1.4e52056f2dec4p-151, 0x1.20b6b60dae611p-146 },
+ { 0x1.0d62a769875ep-151, 0x1.d1893fc15ba16p-147 },
+ { 0x1.b2128dd015485p-152, 0x1.7747e31ddd25cp-147 },
+ { 0x1.5dad6d3a16694p-152, 0x1.2e7c997078049p-147 },
+ { 0x1.19a81ef58dfc6p-152, 0x1.e790d89e8e564p-148 },
+ { 0x1.c5ae1b79c4ee8p-153, 0x1.88e545d12ba57p-148 },
+ { 0x1.6d56e11abc8a7p-153, 0x1.3c919aea9787p-148 },
+ { 0x1.262a204b39df1p-153, 0x1.fe13c6f07b6aep-149 },
+ { 0x1.d9a774b67b183p-154, 0x1.9ae2b16a9550ap-149 },
+ { 0x1.7d48e51f6d6edp-154, 0x1.4af14f857334ep-149 },
+ { 0x1.32e43016e50e4p-154, 0x1.0a8564eab8ff5p-149 },
+ { 0x1.edf747f9f14f1p-155, 0x1.ad3a33350402p-150 },
+ { 0x1.8d7d80e14b91p-155, 0x1.5996d7e13f467p-150 },
+ { 0x1.3fd1708b687cbp-155, 0x1.1636f3d76858ap-150 },
+ { 0x1.014ad3fec9ec4p-155, 0x1.bfe545fce7a55p-151 },
+ { 0x1.9dee40ecc2982p-156, 0x1.687ce08618977p-151 },
+ { 0x1.4ceca2b27454p-156, 0x1.221a377d62eb4p-151 },
+ { 0x1.0bbd071377b87p-156, 0x1.d2dcd30499eb7p-152 },
+ { 0x1.ae9438e9a5c0bp-157, 0x1.779da2df7a30cp-152 },
+ { 0x1.5a30285652adp-157, 0x1.2e2a7c1fe1c5fp-152 },
+ { 0x1.164daef1c2b15p-157, 0x1.e61933d473856p-153 },
+ { 0x1.bf6806876a635p-158, 0x1.86f2e6e7e582ap-153 },
+ { 0x1.67960688424efp-158, 0x1.3a62b4892ce6ep-153 },
+ { 0x1.20f7f47f404a7p-158, 0x1.f99234ed0089ep-154 },
+ { 0x1.d061d530972c5p-159, 0x1.9676058974913p-154 },
+ { 0x1.7517e8c57f622p-159, 0x1.46bd7c1e28efp-154 },
+ { 0x1.2bb6ba79809edp-159, 0x1.069f8cb02119fp-154 },
+ { 0x1.e17962871247p-160, 0x1.a61febb6d574dp-155 },
+ { 0x1.82af24bbe81ddp-160, 0x1.53351984f5d61p-155 },
+ { 0x1.3684a09debb18p-160, 0x1.108b4faaa8971p-155 },
+ { 0x1.f2a603a977e7cp-161, 0x1.b5e91e3ee196dp-156 },
+ { 0x1.9054beadf5a51p-161, 0x1.5fc381e001854p-156 },
+ { 0x1.415c074fc9065p-161, 0x1.1a8782bc000bep-156 },
+ { 0x1.01ef55a0092e3p-161, 0x1.c5c9be5ba37d4p-157 },
+ { 0x1.9e016e74801cbp-162, 0x1.6c625c9dd5c05p-157 },
+ { 0x1.4c3713bae315dp-162, 0x1.248f08aa2a9f5p-157 },
+ { 0x1.0a8cf82738469p-162, 0x1.d5b98efc2e8d5p-158 },
+ { 0x1.abada51b7b47ep-163, 0x1.790b07dcc17ddp-158 },
+ { 0x1.570fb47030aa8p-163, 0x1.2e9c8b4dec3dep-158 },
+ { 0x1.13270ae279a57p-163, 0x1.e5affac730013p-159 },
+ { 0x1.b951931589ad6p-164, 0x1.85b69d604d483p-159 },
+ { 0x1.61dfa678e3296p-164, 0x1.38aa7fa8655e3p-159 },
+ { 0x1.1bb88966006c4p-164, 0x1.f5a41ad29abd6p-160 },
+ { 0x1.c6e52f00f28e6p-165, 0x1.925df815332e1p-160 },
+ { 0x1.6ca07adb2cabep-165, 0x1.42b32a68b6433p-160 },
+ { 0x1.243c4de072741p-165, 0x1.02c65f05a223cp-160 },
+ { 0x1.d4603cf73627ep-166, 0x1.9ef9ba1f58105p-161 },
+ { 0x1.774b9c8b0652p-166, 0x1.4cb0a4ddc2264p-161 },
+ { 0x1.2cad15ed5f00dp-166, 0x1.0ab038a2ddd17p-161 },
+ { 0x1.e1ba565f2f2dap-167, 0x1.ab82536c08c11p-162 },
+ { 0x1.81da56c03901cp-167, 0x1.569ce24f30cadp-162 },
+ { 0x1.350587b61e2e7p-167, 0x1.128ac3f80b9acp-162 },
+ { 0x1.eeeaf2386ba73p-168, 0x1.b7f008c184953p-163 },
+ { 0x1.8c45dba9ebaffp-168, 0x1.6071b5b7d5f0bp-163 },
+ { 0x1.3d40375ab2fc9p-168, 0x1.1a5112ad78884p-163 },
+ { 0x1.fbe96dd52dd2ap-169, 0x1.c43afb43abf3ap-164 },
+ { 0x1.96874b77050b3p-169, 0x1.6a28d7dab475p-164 },
+ { 0x1.4557ac9b8a4ffp-169, 0x1.21fe234726979p-164 },
+ { 0x1.04568afbad70bp-169, 0x1.d05b30647f5b6p-165 },
+ { 0x1.a097bba9c5bbap-170, 0x1.73bbedaae952fp-165 },
+ { 0x1.4d4668bc3c638p-170, 0x1.298ce64edbc52p-165 },
+ { 0x1.0a969821c25d4p-170, 0x1.dc489a35fd89p-166 },
+ { 0x1.aa703eac27071p-171, 0x1.7d248efdebaf1p-166 },
+ { 0x1.5506ec96ce1d8p-171, 0x1.30f843b6c62b7p-166 },
+ { 0x1.10b0827e1c59fp-171, 0x1.e7fb2011e1175p-167 },
+ { 0x1.b409eb99c2287p-172, 0x1.865c4d7ebd336p-167 },
+ { 0x1.5c93bed6568e9p-172, 0x1.383b206d0bb99p-167 },
+ { 0x1.169ff47b694c6p-172, 0x1.f36aa78ac249dp-168 },
+ { 0x1.bd5de633517f7p-173, 0x1.8f5cbbd7e3bd9p-168 },
+ { 0x1.63e7724f64774p-173, 0x1.3f5064180659dp-168 },
+ { 0x1.1c60a3dd2224ep-173, 0x1.fe8f1d993bb19p-169 },
+ { 0x1.c66566ef40333p-174, 0x1.981f750955121p-169 },
+ { 0x1.6afcac6c09d1ap-174, 0x1.4632fef2669ecp-169 },
+ { 0x1.21ee56dbc8c6ap-174, 0x1.04b03ffb7174ap-169 },
+ { 0x1.cf19c31a391acp-175, 0x1.a09e23dee12dbp-170 },
+ { 0x1.71ce2ba111a68p-175, 0x1.4cddefbe00daep-170 },
+ { 0x1.2744e94597dfp-175, 0x1.09eb734c1a314p-170 },
+ { 0x1.d77474fa3c96fp-176, 0x1.a8d28a7b21f9ep-171 },
+ { 0x1.7856cde19858bp-176, 0x1.534c49c3a48ap-171 },
+ { 0x1.2c60519b06073p-176, 0x1.0ef5469afe541p-171 },
+ { 0x1.df6f23e67822ep-177, 0x1.b0b689ea896fp-172 },
+ { 0x1.7e9197060941ap-177, 0x1.59793ad60d8abp-172 },
+ { 0x1.313ca61e59763p-177, 0x1.13c9ee6b2a529p-172 },
+ { 0x1.e703ac45eb1a5p-178, 0x1.b84429b1d33d8p-173 },
+ { 0x1.8479b71b66ff2p-178, 0x1.5f60114dc317ap-173 },
+ { 0x1.35d621cd7892fp-178, 0x1.1865baa279b03p-173 },
+ { 0x1.ee2c2766d39aep-179, 0x1.bf759f4ae6481p-174 },
+ { 0x1.8a0a908fbee34p-179, 0x1.64fc41f392bcdp-174 },
+ { 0x1.3a29293d26666p-179, 0x1.1cc51b3533d1bp-174 },
+ { 0x1.f4e2f320ed2f5p-180, 0x1.c645558315ad7p-175 },
+ { 0x1.8f3fbe30bc1d8p-180, 0x1.6a496dcf4682p-175 },
+ { 0x1.3e324f4cf0981p-180, 0x1.20e4a4b8e031ep-175 },
+ { 0x1.fb22b934b993p-181, 0x1.ccadf3adb1afp-176 },
+ { 0x1.941518f17ca26p-181, 0x1.6f4367d03dbd8p-176 },
+ { 0x1.41ee59ab3f625p-181, 0x1.24c114d62226p-176 },
+ { 0x1.00733b2d2d2a7p-181, 0x1.d2aa649df6e65p-177 },
+ { 0x1.9886bd6d1085bp-182, 0x1.73e63a45afd4dp-177 },
+ { 0x1.455a452136a6p-182, 0x1.285756918be22p-177 },
+ { 0x1.0314c07978175p-182, 0x1.d835dd5ba6335p-178 },
+ { 0x1.9c91111b6c15fp-183, 0x1.782e2c1c97a81p-178 },
+ { 0x1.4873499e69a71p-183, 0x1.2ba486638ab1ep-178 },
+ { 0x1.0573c7a800f18p-183, 0x1.dd4be385e972p-179 },
+ { 0x1.a030c72f0cf33p-184, 0x1.7c17c5d99552cp-179 },
+ { 0x1.4b36ddfcc8743p-184, 0x1.2ea5f617d321fp-179 },
+ { 0x1.078e5ec28bafdp-184, 0x1.e1e853589fe15p-180 },
+ { 0x1.a362e51221b9fp-185, 0x1.7f9fd64579e1ap-180 },
+ { 0x1.4da2bb75a5c65p-185, 0x1.3159306d0abdp-180 },
+ { 0x1.0962c95c3eb5p-185, 0x1.e6076548c0765p-181 },
+ { 0x1.a624c67aa97dfp-186, 0x1.82c376c3acddfp-181 },
+ { 0x1.4fb4e0c13d49p-186, 0x1.33bbfc6dd55a6p-181 },
+ { 0x1.0aef82f484486p-186, 0x1.e9a5b32d2ef52p-182 },
+ { 0x1.a874210dbadcfp-187, 0x1.85800f4a2d262p-182 },
+ { 0x1.516b94dabb86dp-187, 0x1.35cc607ce4fd8p-182 },
+ { 0x1.0c33410fd4c56p-187, 0x1.ecc03cea2935dp-183 },
+ { 0x1.aa4f078af0321p-188, 0x1.87d359f39448ep-183 },
+ { 0x1.52c5696370c9dp-188, 0x1.3788a50e33e44p-183 },
+ { 0x1.0d2cf5025ba2dp-188, 0x1.ef546c9652b0ap-184 },
+ { 0x1.abb3ec79d594dp-189, 0x1.89bb66243bfd5p-184 },
+ { 0x1.53c13ca08d951p-189, 0x1.38ef570827673p-184 },
+ { 0x1.0ddbcd68fc943p-189, 0x1.f1601a115b514p-185 },
+ { 0x1.aca1a45423b35p-190, 0x1.8b369b3c6ec4fp-185 },
+ { 0x1.545e3b0f8838ap-190, 0x1.39ff49c7fe5e8p-185 },
+ { 0x1.0e3f374dd9d68p-190, 0x1.f2e18e05495b4p-186 },
+ { 0x1.ad1767288e013p-191, 0x1.8c43bad265564p-186 },
+ { 0x1.549be08e15927p-191, 0x1.3ab798c59d4c2p-186 },
+ { 0x1.0e56def61fbc4p-191, 0x1.f3d7844c8a592p-187 },
+ { 0x1.ad14d1b2f0b5fp-192, 0x1.8ce1e26fb8214p-187 },
+ { 0x1.5479f9137160bp-192, 0x1.3b17a8d383f04p-187 },
+ { 0x1.0e22b05782284p-192, 0x1.f4412db819edfp-188 },
+ { 0x1.ac99e5e7b9269p-193, 0x1.8d108ccedcd75p-188 },
+ { 0x1.53f8a0f98a8b8p-193, 0x1.3b1f28f8795cap-188 },
+ { 0x1.0da2d734853ffp-193, 0x1.f41e3132440dap-189 },
+ { 0x1.aba70af1767bp-194, 0x1.8ccf9296410aep-189 },
+ { 0x1.531844d58365ep-194, 0x1.3ace12e143377p-189 },
+ { 0x1.0cd7bedf59779p-194, 0x1.f36eac3bc78c2p-190 },
+ { 0x1.aa3d0ca096eedp-195, 0x1.8c1f2a8f92477p-190 },
+ { 0x1.51d9a0dfd2e93p-195, 0x1.3a24aae988ae7p-190 },
+ { 0x1.0bc211a3c2859p-195, 0x1.f23332c263066p-191 },
+ { 0x1.a85d1a4e6bedcp-196, 0x1.8affe95ac6f2ap-191 },
+ { 0x1.503dbfed30324p-196, 0x1.39237fbbcfa18p-191 },
+ { 0x1.0a62b7d92f095p-196, 0x1.f06cce511da3ep-192 },
+ { 0x1.a608c535a2ba1p-197, 0x1.8972c09d7f45cp-192 },
+ { 0x1.4e45f9fa4adffp-197, 0x1.37cb698950bdap-192 },
+ { 0x1.08bad69ed20a4p-197, 0x1.ee1cfc9be3df9p-193 },
+ { 0x1.a341fe436d2d7p-198, 0x1.8778fdb058321p-193 },
+ { 0x1.4bf3f24d273a5p-198, 0x1.361d88db2b95bp-193 },
+ { 0x1.06cbce44363ecp-198, 0x1.eb45ad695330ap-194 },
+ { 0x1.a00b13659be7cp-199, 0x1.851447ccc879bp-194 },
+ { 0x1.4949952fc2371p-199, 0x1.341b44ff4c3c6p-194 },
+ { 0x1.0497386163a39p-199, 0x1.e7e93fdecaep-195 },
+ { 0x1.9c66ac5ae65b3p-200, 0x1.82469dbf1833ep-195 },
+ { 0x1.464915486577bp-200, 0x1.31c64a141680ep-195 },
+ { 0x1.021ee5a248c7fp-200, 0x1.e40a7f340982ap-196 },
+ { 0x1.9857c70b8b2bcp-201, 0x1.7f125320f1e94p-196 },
+ { 0x1.42f4e894cc71ap-201, 0x1.2f2086b6a5cf4p-196 },
+ { 0x1.fec9b69351b7p-202, 0x1.dfac9ed4c27cep-197 },
+ { 0x1.93e1b371520a1p-202, 0x1.7b7a0d21f0262p-197 },
+ { 0x1.3f4fc50de840ap-202, 0x1.2c2c295822108p-197 },
+ { 0x1.f8d6a0e0a9508p-203, 0x1.dad335f7aacdbp-198 },
+ { 0x1.8f080f16c57cp-203, 0x1.7780bee4609a1p-198 },
+ { 0x1.3b5c9cfaada16p-203, 0x1.28eb9d3f5000ap-198 },
+ { 0x1.f269560bdbf92p-204, 0x1.d5823ab37d92ep-199 },
+ { 0x1.89cec0363502dp-204, 0x1.7329a5753ca24p-199 },
+ { 0x1.371e9af8e6ccfp-204, 0x1.2561873c1cc7ap-199 },
+ { 0x1.eb86f931c309dp-205, 0x1.cfbdfc9b64d6ep-200 },
+ { 0x1.8439f081b525ap-205, 0x1.6e7843670c8d2p-200 },
+ { 0x1.32991dc38028ep-205, 0x1.2190c2136fc76p-200 },
+ { 0x1.e434fdd743954p-206, 0x1.c98b1eed08258p-201 },
+ { 0x1.7e4e079de1a2ep-206, 0x1.69705c180d6c1p-201 },
+ { 0x1.2dcfb3be31ebdp-206, 0x1.1d7c5aaa0949p-201 },
+ { 0x1.dc7920bafc5dcp-207, 0x1.c2ee925b3e3f6p-202 },
+ { 0x1.780fa5599d558p-207, 0x1.6415eeac7f744p-202 },
+ { 0x1.28c6164ec1235p-207, 0x1.19278bf59ff34p-202 },
+ { 0x1.d459605b63623p-208, 0x1.bbed8e8100752p-203 },
+ { 0x1.71839bad6a45bp-208, 0x1.5e6d30c67b96bp-203 },
+ { 0x1.2380250c57526p-208, 0x1.1495babbc8d8ep-203 },
+ { 0x1.cbdbf53eed588p-209, 0x1.b48d8b08c37b5p-204 },
+ { 0x1.6aaee88d3a5e6p-209, 0x1.587a8905112ebp-204 },
+ { 0x1.1e01e0cda0c0ep-209, 0x1.0fca71267dd26p-204 },
+ { 0x1.c3074a0c1c67dp-210, 0x1.acd43894c1f06p-205 },
+ { 0x1.6396af97c5f7fp-210, 0x1.52428954b7c2fp-205 },
+ { 0x1.184f669e7e645p-210, 0x1.0ac95a364b406p-205 },
+ { 0x1.b9e1f37f768c9p-211, 0x1.a4c779750fb77p-206 },
+ { 0x1.5c4033ae88d94p-211, 0x1.4bc9e91b546a8p-206 },
+ { 0x1.126ceaa621095p-211, 0x1.05963d1a5105bp-206 },
+ { 0x1.b072a84d6770bp-212, 0x1.9c6d5a387a6d7p-207 },
+ { 0x1.54b0d08180ac6p-212, 0x1.45157f4a2e598p-207 },
+ { 0x1.0c5eb30658611p-212, 0x1.0034f87652744p-207 },
+ { 0x1.a6c038fdf5aedp-213, 0x1.93cc0a254a9f5p-208 },
+ { 0x1.4cedf419a9b38p-213, 0x1.3e2a3c60327aap-208 },
+ { 0x1.062912bcc23f9p-213, 0x1.f552fb3e1c70bp-209 },
+ { 0x1.9cd187cff951cp-214, 0x1.8ae9d3a6eb66fp-209 },
+ { 0x1.44fd186d008c2p-214, 0x1.370d2466d3327p-209 },
+ { 0x1.ffa0c91caab55p-215, 0x1.e9ef97aa04b46p-210 },
+ { 0x1.92ad80b12a09bp-215, 0x1.81cd14bd535bbp-210 },
+ { 0x1.3ce3bd0683046p-215, 0x1.2fc348f3a8121p-210 },
+ { 0x1.f2b20c0b002abp-216, 0x1.de47d70b3398cp-211 },
+ { 0x1.885b1157e885cp-216, 0x1.787c377ac34cdp-211 },
+ { 0x1.34a760cc47acap-216, 0x1.2851c338b22e4p-211 },
+ { 0x1.e58ea51580badp-217, 0x1.d263d33512bb6p-212 },
+ { 0x1.7de1218b19542p-217, 0x1.6efdaa9c0e45ep-212 },
+ { 0x1.2c4d7bed4d522p-217, 0x1.20bdae2cd61c6p-212 },
+ { 0x1.d83f3d3e6d15p-218, 0x1.c64ba5bdb46dep-213 },
+ { 0x1.73468ba3c29b8p-218, 0x1.6557da47246f7p-213 },
+ { 0x1.23db7a001a935p-218, 0x1.190c20d5b5808p-213 },
+ { 0x1.cacc668087b83p-219, 0x1.ba075f0192b6p-214 },
+ { 0x1.689215536317fp-219, 0x1.5b9128fb09361p-214 },
+ { 0x1.1b56b45aac06fp-219, 0x1.114228bb99133p-214 },
+ { 0x1.bd3e92f58e3aep-220, 0x1.ad9efd6e7e35p-215 },
+ { 0x1.5dca68b92a62fp-220, 0x1.51afe8bbb6b6cp-215 },
+ { 0x1.12c46cab86e91p-220, 0x1.0964c48f92b05p-215 },
+ { 0x1.af9e0c680145ap-221, 0x1.a11a652260dp-216 },
+ { 0x1.52f60dcf5b39p-221, 0x1.47ba5483b6e8fp-216 },
+ { 0x1.0a29c7db10f7p-221, 0x1.0178df0b67157p-216 },
+ { 0x1.a1f2ec5b27de2p-222, 0x1.948157e97fbd7p-217 },
+ { 0x1.481b643932becp-222, 0x1.3db68a0470a4fp-217 },
+ { 0x1.018bc93b8e2e5p-222, 0x1.f306942454ae6p-218 },
+ { 0x1.9445149305037p-223, 0x1.87db6da6dd3cap-218 },
+ { 0x1.3d409d78b6819p-223, 0x1.33aa83bd4deabp-218 },
+ { 0x1.f1de9c1ab95aap-224, 0x1.e311742f9561bp-219 },
+ { 0x1.869c2824b4b6bp-224, 0x1.7b300d303ed2cp-219 },
+ { 0x1.326bb792c8c5bp-224, 0x1.299c1370fc2d1p-219 },
+ { 0x1.e0b212b870715p-225, 0x1.d31b83aa1a53bp-220 },
+ { 0x1.78ff85165ac91p-225, 0x1.6e8665a634affp-220 },
+ { 0x1.27a27826da7a5p-225, 0x1.1f90dcff1976ep-220 },
+ { 0x1.cf9b0072f8176p-226, 0x1.c32d9c998168ap-221 },
+ { 0x1.6b763e947db08p-226, 0x1.61e5684f4d137p-221 },
+ { 0x1.1cea67fe8699cp-226, 0x1.158e51a7ac97ep-221 },
+ { 0x1.bea20cad09b1fp-227, 0x1.b350464c51c99p-222 },
+ { 0x1.5e0717c155a1cp-227, 0x1.5553c2fc66728p-222 },
+ { 0x1.1248cf18568a2p-227, 0x1.0b99abbccdbb1p-222 },
+ { 0x1.adcf760300963p-228, 0x1.a38baebfb68e4p-223 },
+ { 0x1.50b87f214792dp-228, 0x1.48d7dafad7ffep-223 },
+ { 0x1.07c2b12fe4dbap-228, 0x1.01b7eac5ea688p-223 },
+ { 0x1.9d2b0d0c4a0b1p-229, 0x1.93e7a4bb0743p-224 },
+ { 0x1.43908aa677d25p-229, 0x1.3c77c897ed254p-224 },
+ { 0x1.fab995891c153p-230, 0x1.efdba02e2ceffp-225 },
+ { 0x1.8cbc2fe600108p-230, 0x1.846b92a47c343p-225 },
+ { 0x1.3694f45c1b92fp-230, 0x1.30395337f89bbp-225 },
+ { 0x1.e6371d3dc0233p-231, 0x1.dc7fb7bbca8adp-226 },
+ { 0x1.7c89c6867890ep-231, 0x1.751e7a10e8264p-226 },
+ { 0x1.29cb17b0f706bp-231, 0x1.2421ee0211f87p-226 },
+ { 0x1.d20647a807a0cp-232, 0x1.c9649548abac7p-227 },
+ { 0x1.6c9a3fd812077p-232, 0x1.6606f00ed6d5dp-227 },
+ { 0x1.1d37ef5f490cdp-232, 0x1.1836b52067807p-227 },
+ { 0x1.be2ec88ae1479p-233, 0x1.b6922692e74d4p-228 },
+ { 0x1.5cf38f9818abfp-233, 0x1.572b1a2c0293ap-228 },
+ { 0x1.10e013ef486f7p-233, 0x1.0c7c6b93f06a1p-228 },
+ { 0x1.aab7b734b99f6p-234, 0x1.a40fcadcdd133p-229 },
+ { 0x1.4d9b2cf546b09p-234, 0x1.4890ac32b69b5p-229 },
+ { 0x1.04c7bad04b57cp-234, 0x1.00f779993bbc1p-229 },
+ { 0x1.97a78d5f1c6dbp-235, 0x1.91e450ac30542p-230 },
+ { 0x1.3e9611e8218p-235, 0x1.3a3ce69b6a143p-230 },
+ { 0x1.f1e56c0773bb7p-236, 0x1.eb57d7362f984p-231 },
+ { 0x1.850426f2df55dp-236, 0x1.8015f467ddd4p-231 },
+ { 0x1.2fe8bb3e4f4d8p-236, 0x1.2c3495adab7d8p-231 },
+ { 0x1.dac8e8a813f1fp-237, 0x1.d53ae35dbfa26p-232 },
+ { 0x1.72d2c2a7422abp-237, 0x1.6eaa5fce4af3ap-232 },
+ { 0x1.21972950f570dp-237, 0x1.1e7c114a57a33p-232 },
+ { 0x1.c44004226dc17p-238, 0x1.bf9ebf2ac34cfp-233 },
+ { 0x1.6118037139874p-238, 0x1.5da6aa3adb7a3p-233 },
+ { 0x1.13a4e15d42467p-238, 0x1.11173d5813f4dp-233 },
+ { 0x1.ae501496e23f2p-239, 0x1.aa895a750e0f6p-234 },
+ { 0x1.4fd7f2b705e64p-239, 0x1.4d0f59b16ac32p-234 },
+ { 0x1.0614ef7575b09p-239, 0x1.04098aca1b898p-234 },
+ { 0x1.98fdb1084fd1cp-240, 0x1.95ffef5a788b3p-235 },
+ { 0x1.3f16033b4da17p-240, 0x1.3ce864a4f75bbp-235 },
+ { 0x1.f1d3d20014dd3p-241, 0x1.eeabf27142ccbp-236 },
+ { 0x1.844cb59a101a9p-241, 0x1.82070510e6e91p-236 },
+ { 0x1.2ed514b22b68bp-241, 0x1.2d35346de60f3p-236 },
+ { 0x1.d84bdf7421499p-242, 0x1.d5fe3202b4d44p-237 },
+ { 0x1.7040489842ad7p-242, 0x1.6ea2738b3dbebp-237 },
+ { 0x1.1f1777f205012p-242, 0x1.1df8a8637ba9cp-237 },
+ { 0x1.bf956a62adf73p-243, 0x1.be0e1bcc5bf2bp-238 },
+ { 0x1.5cdae0381ff94p-243, 0x1.5bd567e120a1cp-238 },
+ { 0x1.0fdef3b187063p-243, 0x1.0f35198b8b7f7p-238 },
+ { 0x1.a7b2fd5556b6ap-244, 0x1.a6df243f2c6f4p-239 },
+ { 0x1.4a1e48fd99b8ep-244, 0x1.49a26968a8fd1p-239 },
+ { 0x1.012cc9c3d142ap-244, 0x1.00ec5ed2dbe3ep-239 },
+ { 0x1.90a652d08b6ecp-245, 0x1.9073f3afbdfebp-240 },
+ { 0x1.380bacb3471d9p-245, 0x1.380b5f70c487dp-240 },
+ { 0x1.e603798765b0ap-246, 0x1.e63fa380d130bp-241 },
+ { 0x1.7a705e88ab4c8p-246, 0x1.7ace6e086aab7p-241 },
+ { 0x1.26a399e180e7cp-246, 0x1.2711978a97cf7p-241 },
+ { 0x1.cabc2c3d98d7cp-247, 0x1.cba0a72ae9c08p-242 },
+ { 0x1.651157275ac6fp-247, 0x1.65efbb20adf2dp-242 },
+ { 0x1.15e60bb1a2bacp-247, 0x1.16b5cc5019368p-242 },
+ { 0x1.b08358e30e1b1p-248, 0x1.b1fca598944c3p-243 },
+ { 0x1.5088c08941b89p-248, 0x1.51d84fa353951p-243 },
+ { 0x1.05d2722aa0abep-248, 0x1.06f82c9619b9p-243 },
+ { 0x1.9757d44a0d5d1p-249, 0x1.9953a1cf16aadp-244 },
+ { 0x1.3cd5765cc7b51p-249, 0x1.3e87f66d27bbp-244 },
+ { 0x1.eccf7568ff3afp-250, 0x1.efb0c5f0312cdp-245 },
+ { 0x1.7f37a88128933p-250, 0x1.81a4d1085cfd1p-245 },
+ { 0x1.29f5b70afae6ep-250, 0x1.2bfdda4e2b20cp-245 },
+ { 0x1.cf48b1a182cb9p-251, 0x1.d2ab3b59164a6p-246 },
+ { 0x1.682022c0d8296p-251, 0x1.6aeea740e7e26p-246 },
+ { 0x1.17e72ed48d1c2p-251, 0x1.1a389017ca93cp-246 },
+ { 0x1.b30c9decefa86p-252, 0x1.b6dd2d215fccfp-247 },
+ { 0x1.520de188c8ff4p-252, 0x1.552ee415230cdp-247 },
+ { 0x1.06a7030db71fbp-252, 0x1.093620e33d9f9p-247 },
+ { 0x1.98166f02e00aap-253, 0x1.9c4336b720df7p-248 },
+ { 0x1.3cfce2d301755p-253, 0x1.40629fd47fda6p-248 },
+ { 0x1.ec63bac9af50ap-254, 0x1.f1e828f7f1e6ep-249 },
+ { 0x1.7e609b497d4bfp-254, 0x1.82d92bd0fbc5bp-249 },
+ { 0x1.28e89244647b5p-254, 0x1.2c8658b1c7fabp-249 },
+ { 0x1.cd07ee41894f6p-255, 0x1.d2def7b6139fbp-250 },
+ { 0x1.65e4eca3c47cep-255, 0x1.6a9a29142865ap-250 },
+ { 0x1.15cbd7439af48p-255, 0x1.1995fff959855p-250 },
+ { 0x1.af324889fe32ep-256, 0x1.b549f742691f7p-251 },
+ { 0x1.4e9c920d5db05p-256, 0x1.5380a4af4c2e9p-251 },
+ { 0x1.03a122e1077b7p-256, 0x1.078d07375b0bp-251 },
+ { 0x1.92d9bd168c63p-257, 0x1.9921acfd99f39p-252 },
+ { 0x1.388030ea8589cp-257, 0x1.3d867ecfb60a5p-252 },
+ { 0x1.e4c4faf832008p-258, 0x1.ecccda72dba49p-253 },
+ { 0x1.77f4a046c515ep-258, 0x1.7e5deef2de87bp-253 },
+ { 0x1.2387f5f4b712ep-258, 0x1.28a511d87ce7dp-253 },
+ { 0x1.c413282821079p-259, 0x1.cc3995b1e2c4p-254 },
+ { 0x1.5e78bc56d0fbbp-259, 0x1.64f5f80200f46p-254 },
+ { 0x1.0faba5af01355p-259, 0x1.14d5424501d7ep-254 },
+ { 0x1.a51f8a6830159p-260, 0x1.ad54bef9112dp-255 },
+ { 0x1.465b65a83bdbbp-260, 0x1.4ce07b8d50856p-255 },
+ { 0x1.f9c5589e7201fp-261, 0x1.020f8e226943ep-255 },
+ { 0x1.87dc5ad8af9ecp-261, 0x1.90123a8271991p-256 },
+ { 0x1.2f918e4d3f95cp-261, 0x1.3613b89391a8fp-256 },
+ { 0x1.d6485a170413ap-262, 0x1.e098381b76cd3p-257 },
+ { 0x1.6c3b66970be3dp-262, 0x1.7465697a54c64p-257 },
+ { 0x1.1a0fd8c3a4e6fp-262, 0x1.20858c20a1795p-257 },
+ { 0x1.b4ce217bd5e55p-263, 0x1.bf05934cfa1ccp-258 },
+ { 0x1.522e259c7017ap-263, 0x1.5a41409f84e49p-258 },
+ { 0x1.05caa9cf257c4p-263, 0x1.0c2b83023243dp-258 },
+ { 0x1.954427a430b11p-264, 0x1.9f5672cf62a4fp-259 },
+ { 0x1.39a5d07601e71p-264, 0x1.41985de8f7a14p-259 },
+ { 0x1.e56c72cc01fccp-265, 0x1.f1f5d5615d783p-260 },
+ { 0x1.7797a6e64ddc9p-265, 0x1.8179bfb69c631p-260 },
+ { 0x1.229374c83806p-265, 0x1.2a5d1d1f1ae5cp-260 },
+ { 0x1.c18d454a503aep-266, 0x1.cdd1c2bddbb9ep-261 },
+ { 0x1.5bb5b3e414ad3p-266, 0x1.655e203c78adp-261 },
+ { 0x1.0ce808921de57p-266, 0x1.1481ab5a1469ap-261 },
+ { 0x1.9fdfe587f056ap-267, 0x1.abd4ca4bd8884p-262 },
+ { 0x1.418b54bd6a895p-267, 0x1.4af20f59f283dp-262 },
+ { 0x1.f128f851039d9p-268, 0x1.fff032b2dbde7p-263 },
+ { 0x1.804c6e03f60cbp-268, 0x1.8be8c488684b4p-263 },
+ { 0x1.290596a08a94fp-268, 0x1.3223f2e5be0fp-263 },
+ { 0x1.cb1395c8187f6p-269, 0x1.d964d959533d1p-264 },
+ { 0x1.62bb1316ec5fcp-269, 0x1.6df780d5ecc43p-264 },
+ { 0x1.1211a1b47d3aep-269, 0x1.1ae2302fd4bcdp-264 },
+ { 0x1.a772150026811p-270, 0x1.b5455f4e2ce45p-265 },
+ { 0x1.47143aa78b5fep-270, 0x1.51eade2a24279p-265 },
+ { 0x1.f93996ba5e93dp-271, 0x1.051b3f15282e5p-265 },
+ { 0x1.8626f2553e204p-271, 0x1.93760037df87ap-266 },
+ { 0x1.2d4091cd12adcp-271, 0x1.37ace1ccc1a8dp-266 },
+ { 0x1.d1294db79df79p-272, 0x1.e17b7713cf17fp-267 },
+ { 0x1.6715149108678p-272, 0x1.73db39c4b278bp-267 },
+ { 0x1.1529206516167p-272, 0x1.1f27cc2724f9p-267 },
+ { 0x1.abce28a1f17f2p-273, 0x1.bb70eb3792a1cp-268 },
+ { 0x1.4a1fe3e55f964p-273, 0x1.5659e4463ddd1p-268 },
+ { 0x1.fd6eb54be7326p-274, 0x1.08462ba9624dbp-268 },
+ { 0x1.89049c51b8388p-274, 0x1.97f4ffe1284a1p-269 },
+ { 0x1.2f2b5e6789756p-274, 0x1.3ad748e88c53fp-269 },
+ { 0x1.d3aa617478594p-275, 0x1.e5e5db98318a5p-270 },
+ { 0x1.68a9e9f7b2f9ap-275, 0x1.76e6798f53e9ap-270 },
+ { 0x1.161c2a1de488ep-275, 0x1.21393590da64bp-270 },
+ { 0x1.acda38e82463bp-276, 0x1.be32dc731f12cp-271 },
+ { 0x1.4a9c33e05809ap-276, 0x1.5824d30f3fce1p-271 },
+ { 0x1.fdaf4969fc45p-277, 0x1.09660e736b8bdp-271 },
+ { 0x1.88d45a53c41c5p-277, 0x1.994b0856743cbp-272 },
+ { 0x1.2eba8f55fe897p-277, 0x1.3b9051c5e7679p-272 },
+ { 0x1.d287e1e77c85ap-278, 0x1.e689bae600601p-273 },
+ { 0x1.6770239fc87e6p-278, 0x1.77071c1633b26p-273 },
+ { 0x1.14e513c1b20dcp-278, 0x1.210a174166fcdp-273 },
+ { 0x1.aa90041143186p-279, 0x1.bd7abebe480e6p-274 },
+ { 0x1.488642c71cfa6p-279, 0x1.5740f6d4ed277p-274 },
+ { 0x1.f9f9ce5a157bbp-280, 0x1.0874302ee34fdp-274 },
+ { 0x1.85974997b931fp-280, 0x1.97701e51a6bfep-275 },
+ { 0x1.2bf0c37efc00bp-280, 0x1.39d3aac239fe2p-275 },
+ { 0x1.cdc89092e43c3p-281, 0x1.e36341a88ea0cp-276 },
+ { 0x1.636f0e2785c54p-281, 0x1.743c5e4db43f9p-276 },
+ { 0x1.118b19def65f8p-281, 0x1.1e9b8ad36fd99p-276 },
+ { 0x1.a4fd2c459c71p-282, 0x1.b94cde5e4fc3p-277 },
+ { 0x1.43ea7a73d5cfp-282, 0x1.53b3a109a94aep-277 },
+ { 0x1.f26454740b953p-283, 0x1.057635a1ed1dfp-277 },
+ { 0x1.7f60ab495565cp-283, 0x1.926f55b776f91p-278 },
+ { 0x1.26de8be09d876p-283, 0x1.35abb1f1cadefp-278 },
+ { 0x1.c5889cb51dbb9p-284, 0x1.dc853b381e5ap-279 },
+ { 0x1.5cbe6a335189cp-284, 0x1.6e96e5d005f5dp-279 },
+ { 0x1.0c22190c33c65p-284, 0x1.19fc0dba0e848p-279 },
+ { 0x1.9c42b0a7816acp-285, 0x1.b1c21d6e11086p-280 },
+ { 0x1.3ce41b9a97542p-285, 0x1.4d91f3701143cp-280 },
+ { 0x1.e71ba6efe048bp-286, 0x1.007de792cfd6ep-280 },
+ { 0x1.76552635a3b27p-286, 0x1.8a6663a0ececbp-281 },
+ { 0x1.1fa1c7f04e719p-286, 0x1.2f310e41037d6p-281 },
+ { 0x1.b9f88d1e59fb3p-287, 0x1.d2185735c5ad9p-282 },
+ { 0x1.538582347c59ep-287, 0x1.66381bdd98a02p-282 },
+ { 0x1.04c9ca3c242adp-287, 0x1.1346f1ba5a69ap-282 },
+ { 0x1.9093a8968bba5p-288, 0x1.a706fd9470fb8p-283 },
+ { 0x1.339c31e0d51b7p-288, 0x1.45000f1eec014p-283 },
+ { 0x1.d8619415342d3p-289, 0x1.f3510620184eap-284 },
+ { 0x1.6aa95f63dd017p-289, 0x1.7f84791f6fdbbp-284 },
+ { 0x1.16648113f6ec6p-289, 0x1.2689bc620188bp-284 },
+ { 0x1.ab5b65b277be7p-290, 0x1.c45998d7521aep-285 },
+ { 0x1.47f9aad3382fep-290, 0x1.5b50e4b7d6356p-285 },
+ { 0x1.f7591b1b1c875p-291, 0x1.0aa3508d5dbp-285 },
+ { 0x1.82335294ba26p-291, 0x1.9959eb6f64db6p-286 },
+ { 0x1.2848053b7dfb1p-291, 0x1.3a2fb2a16d1ccp-286 },
+ { 0x1.c68a6f5a8ef62p-292, 0x1.e23b370697cbbp-287 },
+ { 0x1.5c9ffcce7e5fdp-292, 0x1.720876851d9fbp-287 },
+ { 0x1.0b5b54d487d35p-292, 0x1.1be79c992aff6p-287 },
+ { 0x1.9a0421e5c5d71p-293, 0x1.b3980569c43a5p-288 },
+ { 0x1.3a5c4268d4e27p-293, 0x1.4e1fc4f822568p-288 },
+ { 0x1.e1fba80d34a41p-294, 0x1.0042910b94342p-288 },
+ { 0x1.7172912ec21f8p-294, 0x1.8908e30f7a1b3p-289 },
+ { 0x1.1b271db151968p-294, 0x1.2d5e5a1b8288ep-289 },
+ { 0x1.b1f9ef2d6b135p-295, 0x1.ce1b3b9ea6267p-290 },
+ { 0x1.4c872d1af92bcp-295, 0x1.623e8fb994f23p-290 },
+ { 0x1.fd87064e02a6fp-296, 0x1.0f8695160ca38p-290 },
+ { 0x1.8652a61cdcd3bp-296, 0x1.a031b186be289p-291 },
+ { 0x1.2af84a660968dp-296, 0x1.3eee8e04dc3ap-291 },
+ { 0x1.c9f07af149226p-297, 0x1.e8bd23cc416fp-292 },
+ { 0x1.5eacf76fffc0cp-297, 0x1.766e8d5583265p-292 },
+ { 0x1.0c80f3efbbf3fp-297, 0x1.1ed2fab014c43p-292 },
+ { 0x1.9b1f8ffd8f3c8p-298, 0x1.b76010ebb6c6ap-293 },
+ { 0x1.3ab5d5023fe4ap-298, 0x1.507d813502ab7p-293 },
+ { 0x1.e1c174ea2aaa6p-299, 0x1.01aa61c90eaccp-293 },
+ { 0x1.70b05029068dap-299, 0x1.8a90544ab274dp-294 },
+ { 0x1.1a1fba21de5fp-299, 0x1.2e0fb0911dd84p-294 },
+ { 0x1.afb70654af059p-300, 0x1.ce6f24739f7c7p-295 },
+ { 0x1.4a458b53b2a84p-300, 0x1.61eefc532711fp-295 },
+ { 0x1.f944d95c81983p-301, 0x1.0edb77098a96p-295 },
+ { 0x1.8272ab43f7156p-301, 0x1.9e82e04d9025fp-296 },
+ { 0x1.278886c5a4d73p-301, 0x1.3d237a2e0f859p-296 },
+ { 0x1.c3f57b512a1f2p-302, 0x1.e5385c7d0efep-297 },
+ { 0x1.598c52c5d1746p-302, 0x1.73258d0b919ebp-297 },
+ { 0x1.0828ad1da0983p-302, 0x1.1bdb57d01ceccp-297 },
+ { 0x1.93d4935512f54p-303, 0x1.b223e5e67d24ap-298 },
+ { 0x1.34a3670d3cd59p-303, 0x1.4bf43098a2ef1p-298 },
+ { 0x1.d7b67cefff216p-304, 0x1.fb93db1e39a21p-299 },
+ { 0x1.686e7356020d2p-304, 0x1.8402d3eada60ap-299 },
+ { 0x1.135e695d6d4f8p-304, 0x1.2892e3159736p-299 },
+ { 0x1.a4b6028e1ae52p-305, 0x1.c5502f868f04bp-300 },
+ { 0x1.415808da66669p-305, 0x1.5a670a5d83e0ep-300 },
+ { 0x1.ead51e60a821dp-306, 0x1.08ac71830fd4ep-300 },
+ { 0x1.76cfe88ffbfa7p-306, 0x1.9467d9d3bce7dp-301 },
+ { 0x1.1e2e61d740a91p-306, 0x1.34ea92731d6fp-301 },
+ { 0x1.b4f6c22875415p-307, 0x1.d7e402cf49a21p-302 },
+ { 0x1.4d8e03e448998p-307, 0x1.6860e96265ba8p-302 },
+ { 0x1.fd2c6816f010bp-308, 0x1.132f279000564p-302 },
+ { 0x1.8494b75728df1p-308, 0x1.a4356bd52863ep-303 },
+ { 0x1.28836b62851b4p-308, 0x1.40cac092d16a6p-303 },
+ { 0x1.c476ceb4ce0a6p-309, 0x1.e9bb8c8c45eaap-304 },
+ { 0x1.592d26553a529p-309, 0x1.75c6ad9777c96p-304 },
+ { 0x1.074be65f60432p-309, 0x1.1d3d889242361p-304 },
+ { 0x1.91a14719373e5p-310, 0x1.b34c7bf3e0108p-305 },
+ { 0x1.3248b33f78dd9p-310, 0x1.4c1bf325b5886p-305 },
+ { 0x1.d316bfa6ecf07p-311, 0x1.fab351a6d7271p-306 },
+ { 0x1.641dc398561efp-311, 0x1.827d8b273a859p-306 },
+ { 0x1.0f79d08c027e2p-311, 0x1.26c35a8453a6ep-306 },
+ { 0x1.9ddabce45ff88p-312, 0x1.c18e854f7a653p-307 },
+ { 0x1.3b6a0443345f1p-312, 0x1.56c727238c10ep-307 },
+ { 0x1.e0b830517633fp-313, 0x1.05545196af9e3p-307 },
+ { 0x1.6e4903f595976p-313, 0x1.8e6b62ae03487p-308 },
+ { 0x1.170eca4e7a4cap-313, 0x1.2facf384d3a3bp-308 },
+ { 0x1.a92756c27d93ap-314, 0x1.ceddf1e753b81p-309 },
+ { 0x1.43d40bf74392dp-314, 0x1.60b61e0028436p-309 },
+ { 0x1.ed3e286c4c0dep-315, 0x1.0cbd09b1e5e1p-309 },
+ { 0x1.77993389df313p-315, 0x1.997719e8b73a8p-310 },
+ { 0x1.1dfa945eaae99p-315, 0x1.37e77cf85ca37p-310 },
+ { 0x1.b36ec5aa0588p-316, 0x1.db1e802a6c81fp-311 },
+ { 0x1.4b749e64b35f5p-316, 0x1.69d3aa6fccfd9p-311 },
+ { 0x1.f88d823260c9ep-317, 0x1.1383f4dd09079p-311 },
+ { 0x1.7ffa0f1fabb65p-317, 0x1.a388f33976b7bp-312 },
+ { 0x1.242e12375b352p-317, 0x1.3f613589599c6p-312 },
+ { 0x1.bc9a844ffd2b5p-318, 0x1.e635a66e3ebe7p-313 },
+ { 0x1.523af73f84783p-318, 0x1.720bfb4a981d7p-313 },
+ { 0x1.0146a610e0588p-318, 0x1.199a49bcc51p-313 },
+ { 0x1.87590d6d36008p-319, 0x1.ac8ae259e160cp-314 },
+ { 0x1.299b80ea6bb7fp-319, 0x1.4609b0c4183cap-314 },
+ { 0x1.c496292aa266bp-320, 0x1.f00af26520f9dp-315 },
+ { 0x1.5817f72c95e4cp-320, 0x1.794ce31e24c7bp-315 },
+ { 0x1.059392396d038p-320, 0x1.1ef2877dbfcadp-315 },
+ { 0x1.8da5a346cbb3fp-321, 0x1.b468dc95cb829p-316 },
+ { 0x1.2e36a9eb80d32p-321, 0x1.4bd213115ac94p-316 },
+ { 0x1.cb4fb203e18ap-322, 0x1.f88862b544527p-317 },
+ { 0x1.5cfe5be9615c7p-322, 0x1.7f861b04cbe3ap-317 },
+ { 0x1.0923c6394f695p-322, 0x1.2380a7a548a2fp-317 },
+ { 0x1.92d18166ccd51p-323, 0x1.bb1122f6e5762p-318 },
+ { 0x1.31f510cb3f507p-323, 0x1.50ad48dd9b3a6p-318 },
+ { 0x1.d0b7c794af438p-324, 0x1.ff9ab8e5d6631p-319 },
+ { 0x1.60e2f23228dedp-324, 0x1.84a97f6b3e853p-319 },
+ { 0x1.0bef1906dac58p-324, 0x1.273a4b16ba84fp-319 },
+ { 0x1.96d0ca88e4fcp-325, 0x1.c07484e1da469p-320 },
+ { 0x1.34ce1af3c1b6p-325, 0x1.549037ceef1fep-320 },
+ { 0x1.d4c1f7c67dd18p-326, 0x1.0298e0fc06037p-320 },
+ { 0x1.63bcc0600e3b1p-326, 0x1.88ab45875f419p-321 },
+ { 0x1.0def17046c37ep-326, 0x1.2a16e161fa35fp-321 },
+ { 0x1.999a40ba75f42p-327, 0x1.c48699c75f345p-322 },
+ { 0x1.36bb3093bcf7fp-327, 0x1.5771e906a9978p-322 },
+ { 0x1.d764e5657aa2p-328, 0x1.04a04a1699caap-322 },
+ { 0x1.658528dc53bd5p-328, 0x1.8b822865b44e6p-323 },
+ { 0x1.0f1f1acd583cp-328, 0x1.2c0fc98ac934cp-323 },
+ { 0x1.9b2768ee2e28p-329, 0x1.c73df0b6d4334p-324 },
+ { 0x1.37b7d60833afbp-329, 0x1.594bab8ddacb1p-324 },
+ { 0x1.d89a6c43f4c1p-330, 0x1.05dee05833b3cp-324 },
+ { 0x1.663803afd90e2p-330, 0x1.8d278c9cbfc58p-325 },
+ { 0x1.0f7c5f2e4265p-330, 0x1.2d206b997c2ccp-325 },
+ { 0x1.9b74a41343d69p-331, 0x1.c89434d36542fp-326 },
+ { 0x1.37c1bd3bb9cfep-331, 0x1.5a192e33cf627p-326 },
+ { 0x1.d85fb90bdf218p-332, 0x1.0651bc0c61b2p-326 },
+ { 0x1.65d3aea4b609ep-332, 0x1.8d9799e5f2521p-327 },
+ { 0x1.0f0609e7aa674p-332, 0x1.2d464a6b30dc2p-327 },
+ { 0x1.9a813d2878f74p-333, 0x1.c88645e6c88eep-328 },
+ { 0x1.36d8ce9d2217bp-333, 0x1.59d89052b0525p-328 },
+ { 0x1.d6b5543d3c94p-334, 0x1.05f7d07f3fb02p-328 },
+ { 0x1.645913a262a36p-334, 0x1.8cd14a1185c8dp-329 },
+ { 0x1.0dbd2f003b6a5p-334, 0x1.2c810d60e767ep-329 },
+ { 0x1.984f6bfe6778p-335, 0x1.c714448c370a6p-330 },
+ { 0x1.34ff297cd534dp-335, 0x1.588a691f2cd1fp-330 },
+ { 0x1.d39f201da2255p-336, 0x1.04d1f01416963p-330 },
+ { 0x1.61cba521cabb4p-336, 0x1.8ad66d03eba59p-331 },
+ { 0x1.0ba4cc94c45b3p-336, 0x1.2ad281b8cc2ap-331 },
+ { 0x1.94e44c9a075e7p-337, 0x1.c44191b160ec2p-332 },
+ { 0x1.32391bcecdc03p-337, 0x1.5631c55b5d22cp-332 },
+ { 0x1.cf2449a3fda4bp-338, 0x1.02e2c911c7929p-332 },
+ { 0x1.5e3150cc8eda4p-338, 0x1.87aba1a7120bfp-333 },
+ { 0x1.08c1bf3c985fap-338, 0x1.283e938a586f7p-333 },
+ { 0x1.9047cb663bb8cp-339, 0x1.c014c17012593p-334 },
+ { 0x1.2e8d117dfdd44p-339, 0x1.52d41b7968429p-334 },
+ { 0x1.c94f2cb2815a8p-340, 0x1.002edb3674f27p-334 },
+ { 0x1.599268900e7bcp-340, 0x1.835843f5f0b0cp-335 },
+ { 0x1.051aaf415041dp-340, 0x1.24cb3e8b7d756p-335 },
+ { 0x1.8a84869fc8267p-341, 0x1.ba9781881c8a9p-336 },
+ { 0x1.2a037bab743e1p-341, 0x1.4e79366e7a47p-336 },
+ { 0x1.c22d2c350e306p-342, 0x1.f978cc962d426p-337 },
+ { 0x1.53f982a03a248p-342, 0x1.7de65083f0e21p-337 },
+ { 0x1.00b7f70f68972p-342, 0x1.208076f18ea3p-337 },
+ { 0x1.83a7a5a0b9d4dp-343, 0x1.b3d6740403453p-338 },
+ { 0x1.24a6b05eb3edap-343, 0x1.492b17a8d9ad4p-338 },
+ { 0x1.b9ce7efad864cp-344, 0x1.f126a42ab2a64p-339 },
+ { 0x1.4d7351162fad8p-344, 0x1.77623e1a3ca2fp-339 },
+ { 0x1.f74706d1f613cp-345, 0x1.1b680aeae0c3cp-339 },
+ { 0x1.7bc0a6e57fbc5p-345, 0x1.abe0fed214bcap-340 },
+ { 0x1.1e82c35430e3dp-345, 0x1.42f5d0cb0afebp-340 },
+ { 0x1.b045f25c98b4bp-346, 0x1.e77a20528f8f5p-341 },
+ { 0x1.460e7202036c7p-346, 0x1.6fdace394b03cp-341 },
+ { 0x1.ebd15c07c2acdp-347, 0x1.158d7d54f1681p-341 },
+ { 0x1.72e125d540295p-347, 0x1.a2c9115542385p-342 },
+ { 0x1.17a558b9c184fp-347, 0x1.3be755f8b210cp-342 },
+ { 0x1.a5a8a3f3de092p-348, 0x1.dc88f077bd369p-343 },
+ { 0x1.3ddb38ecb5b52p-348, 0x1.6760d57bb9982p-343 },
+ { 0x1.df2826b036578p-349, 0x1.0efdda755dbb3p-343 },
+ { 0x1.691c997f37f0ep-349, 0x1.98a2e123c782ep-344 },
+ { 0x1.101d72c627ff7p-349, 0x1.340f49a72211p-344 },
+ { 0x1.9a0db3d2b8dacp-350, 0x1.d06b3f65f6fdp-345 },
+ { 0x1.34eb72e63e592p-350, 0x1.5e06fcff790f4p-345 },
+ { 0x1.d166c8f34fca4p-351, 0x1.07c787991a68p-345 },
+ { 0x1.5e880d9f1fe43p-351, 0x1.8d849f54265f7p-346 },
+ { 0x1.07fb3b2ff1602p-351, 0x1.2b7ec30262d2bp-346 },
+ { 0x1.8d8df0cbffd52p-352, 0x1.c33b5a8ad639fp-347 },
+ { 0x1.2b52265317648p-352, 0x1.53e17e1a8afadp-347 },
+ { 0x1.c2aa6bd34f17bp-353, 0x1.fff41d2913dabp-348 },
+ { 0x1.5339d751ff2a1p-353, 0x1.818627da2e9e4p-348 },
+ { 0x1.fe9f93308c405p-354, 0x1.2248100f21115p-348 },
+ { 0x1.80438073219dep-354, 0x1.b515531d535ebp-349 },
+ { 0x1.21234fbc4a127p-354, 0x1.4905d9b84e0cbp-349 },
+ { 0x1.b31198aa5f8abp-355, 0x1.ef4bcc5f71a72p-350 },
+ { 0x1.474946f304456p-355, 0x1.74c0ac8d03b2bp-350 },
+ { 0x1.ec59d00f3fe38p-356, 0x1.187e74c209a91p-350 },
+ { 0x1.7249848679fa9p-356, 0x1.a6169b09c4411p-351 },
+ { 0x1.16739cec78bd4p-356, 0x1.3d8a8ccb26cd9p-351 },
+ { 0x1.a2bbd0795adeep-357, 0x1.ddb87127c2076p-352 },
+ { 0x1.3ace589cd3352p-357, 0x1.674e5d7be735cp-352 },
+ { 0x1.d949ad392f075p-358, 0x1.0e35e84d33d3fp-352 },
+ { 0x1.63bbbf78651ccp-358, 0x1.965d9f895d99cp-353 },
+ { 0x1.0b5827a3ba382p-358, 0x1.3186c3440696p-353 },
+ { 0x1.91c922f9ee4cp-359, 0x1.cb5d51a48d7d4p-354 },
+ { 0x1.2de164c74e725p-359, 0x1.594a1039f0199p-354 },
+ { 0x1.c5941f108d9d1p-360, 0x1.0382d1e479246p-354 },
+ { 0x1.54b639c219649p-360, 0x1.8609634a384ccp-355 },
+ { 0x1.ffcc62473097ap-361, 0x1.25120afe02122p-355 },
+ { 0x1.8059c757355aep-361, 0x1.b85e31314f4b4p-356 },
+ { 0x1.209ad26ca18d9p-361, 0x1.4acee7c0fcbafp-356 },
+ { 0x1.b15e18d0d2d12p-362, 0x1.f0f38c6449ad9p-357 },
+ { 0x1.4554e9983b016p-362, 0x1.753919ff4b182p-357 },
+ { 0x1.e865bf893f8f4p-363, 0x1.1844080030d76p-357 },
+ { 0x1.6e8db855aac9ap-363, 0x1.a4dede3a3eb93p-358 },
+ { 0x1.1312cc0ae5d04p-363, 0x1.3bf7fe7aa33ap-358 },
+ { 0x1.9ccc1bfbf7ecbp-364, 0x1.da5e8d4d639edp-359 },
+ { 0x1.35b35e7d0088ep-364, 0x1.640bc7176cda7p-359 },
+ { 0x1.d0a5ff60b92cfp-365, 0x1.0b342b640cc13p-359 },
+ { 0x1.5c84558f35d95p-365, 0x1.9102c47629cb9p-360 },
+ { 0x1.0560f8bafb2c7p-365, 0x1.2ce013e375d0fp-360 },
+ { 0x1.8801ce509ea26p-366, 0x1.c36f07720a932p-361 },
+ { 0x1.25ec7207b3c64p-366, 0x1.529fe13854ed9p-361 },
+ { 0x1.b8b58f7c67c36p-367, 0x1.fbf2dc269c35dp-362 },
+ { 0x1.4a5c0b3b7424dp-367, 0x1.7cec854a40ddcp-362 },
+ { 0x1.ef3874e46141bp-368, 0x1.1da13f1aaaee6p-362 },
+ { 0x1.732197e24d857p-368, 0x1.ac4c46230c45cp-363 },
+ { 0x1.1619ff0ea7ec6p-368, 0x1.4112fbeff8a1fp-363 },
+ { 0x1.a0bb46a0a2c53p-369, 0x1.e15420dda8758p-364 },
+ { 0x1.383201c8ba71ap-369, 0x1.68bd97eb5b05dp-364 },
+ { 0x1.d3b4e4b894768p-370, 0x1.0e54a78756b6bp-364 },
+ { 0x1.5e4c4aaef013p-370, 0x1.951c14f527745p-365 },
+ { 0x1.0654a030d3e7p-370, 0x1.2f8178dd14a04p-365 },
+ { 0x1.88dc03d1ca801p-371, 0x1.c6b6bf9361ee4p-366 },
+ { 0x1.2621d65152a67p-371, 0x1.5495f2949c65ep-366 },
+ { 0x1.b860981f4834ap-372, 0x1.fe24891c8ca0cp-367 },
+ { 0x1.49a0d4c97c281p-372, 0x1.7e02609a87253p-367 },
+ { 0x1.ed66ed1143993p-373, 0x1.1e064158c947bp-367 },
+ { 0x1.713a5a10cc9bp-373, 0x1.ac4304f253262p-368 },
+ { 0x1.14455cbbff469p-373, 0x1.4093bdea6e36fp-368 },
+ { 0x1.9d62205df47a6p-374, 0x1.dfe14a435c3c2p-369 },
+ { 0x1.353bfdeb15aa4p-374, 0x1.6720e3d624fdcp-369 },
+ { 0x1.ce97f23783a55p-375, 0x1.0cba8970a9d66p-369 },
+ { 0x1.59f649793ea9ap-375, 0x1.921e961b81171p-370 },
+ { 0x1.02b46c188f22dp-375, 0x1.2cd3135c626d1p-370 },
+ { 0x1.82dcfdba2d59cp-376, 0x1.c2097f7f7c953p-371 },
+ { 0x1.213830f44d648p-376, 0x1.5096e15b063dbp-371 },
+ { 0x1.b0639acae41c7p-377, 0x1.f76b39886a20dp-372 },
+ { 0x1.432d063e4cc5ap-377, 0x1.786c2636e4e2ap-372 },
+ { 0x1.e3096b161ade1p-378, 0x1.196dc712e8651p-372 },
+ { 0x1.68f1646f450ccp-378, 0x1.a4c39680abb0bp-373 },
+ { 0x1.0dad51a121c5fp-378, 0x1.3a80eb1934625p-373 },
+ { 0x1.92ed52465cf13p-379, 0x1.d6196b3830612p-374 },
+ { 0x1.2cf8cdb32b26dp-379, 0x1.5f4b3b930a91ap-374 },
+ { 0x1.c1934bb7035c1p-380, 0x1.067b3db09279ep-374 },
+ { 0x1.4fbc11c19c0b7p-380, 0x1.8832413bcb6f5p-375 },
+ { 0x1.f5613cdc1ad52p-381, 0x1.24f8b72bbd6eep-375 },
+ { 0x1.76547ab0f816ap-381, 0x1.b5a5bcacf14ddp-376 },
+ { 0x1.1770c93ef3136p-381, 0x1.46d8046ba690cp-376 },
+ { 0x1.a128a30d837ebp-382, 0x1.e8209bd7c6d4dp-377 },
+ { 0x1.375630e92b79p-382, 0x1.6c744b66f6406p-377 },
+ { 0x1.d0a93cd8add1ep-383, 0x1.1015024fefc8dp-377 },
+ { 0x1.5ab4549d6cf15p-383, 0x1.9631ba1694964p-378 },
+ { 0x1.02a8fed4a1944p-383, 0x1.2f2b3b1ae197dp-378 },
+ { 0x1.81e6d5efc2ecep-384, 0x1.c47e5b8f9de0cp-379 },
+ { 0x1.1fd54f3e20bfcp-384, 0x1.51a481761d265p-379 },
+ { 0x1.ad523512d80aep-385, 0x1.f7d2ff106229cp-380 },
+ { 0x1.4023f854f9c86p-385, 0x1.77da522f79ec5p-380 },
+ { 0x1.dd649c8fad0d5p-386, 0x1.185a192bd02b4p-380 },
+ { 0x1.63e684c4d4572p-386, 0x1.a22ed5ef67f83p-381 },
+ { 0x1.094b5ecc6e29p-386, 0x1.37d9a85948033p-381 },
+ { 0x1.8b7643330549ep-387, 0x1.d10da89b8212ap-382 },
+ { 0x1.26b65f14cd4dap-387, 0x1.5ab7d4224f7e2p-382 },
+ { 0x1.b734f53e57228p-388, 0x1.0276587fa1c2p-382 },
+ { 0x1.473b9d1931175p-388, 0x1.814bdb918424dp-383 },
+ { 0x1.e78d8c6e84fddp-389, 0x1.1f2684f2af658p-383 },
+ { 0x1.6b2a2c93cd65ap-389, 0x1.abf540fb4e1a1p-384 },
+ { 0x1.0e7a7b055d281p-389, 0x1.3eddfeeed0dd2p-384 },
+ { 0x1.92d87cacce695p-390, 0x1.db1c82f79707dp-385 },
+ { 0x1.2bf57b6e0d98dp-390, 0x1.61ea0b7eb4c3cp-385 },
+ { 0x1.bea4f9488e121p-391, 0x1.0799f1fb897d8p-385 },
+ { 0x1.4c7d8bf7bdc41p-391, 0x1.889f21fdb1d69p-386 },
+ { 0x1.eef6b8bfa9225p-392, 0x1.245c20ba28a39p-386 },
+ { 0x1.705ed2bbfd521p-392, 0x1.b3598a0d5984p-387 },
+ { 0x1.121f1b69882ebp-392, 0x1.4418fde75923ep-387 },
+ { 0x1.97ec608197c79p-393, 0x1.e27e05b6c31f9p-388 },
+ { 0x1.2f7b0edc74f1cp-393, 0x1.671af7f5d8858p-388 },
+ { 0x1.c380c41f7503p-394, 0x1.0b3d4442eda68p-388 },
+ { 0x1.4fd20f15083b3p-394, 0x1.8db341e4d4306p-389 },
+ { 0x1.f37ea8d01e9c5p-395, 0x1.27e37e3bc73c9p-389 },
+ { 0x1.736cebb19a201p-395, 0x1.b83a639f29a8p-390 },
+ { 0x1.1428c012e2c57p-395, 0x1.47730acf38edcp-390 },
+ { 0x1.9a9ae80c06018p-396, 0x1.e710d5155d028p-391 },
+ { 0x1.31371c2b63b8p-396, 0x1.6a331ab64b688p-391 },
+ { 0x1.c5b240b14f4d6p-397, 0x1.0d4fd25f7f52ep-391 },
+ { 0x1.5129ffd17a136p-397, 0x1.90712f4e38e37p-392 },
+ { 0x1.f510ba62354a5p-398, 0x1.29ac951c1e60bp-392 },
+ { 0x1.74468acd1611cp-398, 0x1.ba819d5f14678p-393 },
+ { 0x1.148e1d96c299ep-398, 0x1.48dce2dc3ecd5p-393 },
+ { 0x1.9ad7d58aaba44p-399, 0x1.e8c0193d16d55p-394 },
+ { 0x1.3121b71d77179p-399, 0x1.6b2456938b866p-394 },
+ { 0x1.c52f68dd90e64p-400, 0x1.0dc826696c76cp-394 },
+ { 0x1.507f397188496p-400, 0x1.90cc63cdbf2a2p-395 },
+ { 0x1.f3a5bdf92c388p-401, 0x1.29af3c144f8cp-395 },
+ { 0x1.72e7cbdbb95dbp-401, 0x1.ba24cc0f4c8e2p-396 },
+ { 0x1.134d638b07143p-401, 0x1.48500e815d897p-396 },
+ { 0x1.98a2111174d79p-402, 0x1.e7841c45926dp-397 },
+ { 0x1.2f3b409e1b7b6p-402, 0x1.69ea5b1b71301p-397 },
+ { 0x1.c1fa91a869695p-403, 0x1.0ca4195cda6d3p-397 },
+ { 0x1.4dd4c7d7ec9fap-403, 0x1.8ec33daf13649p-398 },
+ { 0x1.ef442d8796795p-404, 0x1.27eb66fea5e85p-398 },
+ { 0x1.6f56f0c0f22b9p-404, 0x1.b72598c77c448p-399 },
+ { 0x1.106c4a594a047p-404, 0x1.45cf12a60cb9ap-399 },
+ { 0x1.9403b0e4bd1b9p-405, 0x1.e36284e81b5ffp-400 },
+ { 0x1.2b8c63e7468c1p-405, 0x1.668ac570f2fc8p-400 },
+ { 0x1.bc22598793379p-406, 0x1.09e8e37ef2488p-400 },
+ { 0x1.4936d06178106p-406, 0x1.8a5f0c63b5c24p-401 },
+ { 0x1.e7fffb3b16a7dp-407, 0x1.2469273320bdap-401 },
+ { 0x1.69a431ed205ap-407, 0x1.b191b44e70edfp-402 },
+ { 0x1.0bf7e7cce4d07p-407, 0x1.41655d7606103p-402 },
+ { 0x1.8d11ace4d8996p-408, 0x1.dc6e2b76185d5p-403 },
+ { 0x1.2625d4b960a47p-408, 0x1.6114f58eab906p-403 },
+ { 0x1.b3c139841a735p-409, 0x1.05a2f4a403a4dp-403 },
+ { 0x1.42ba35d81be5cp-409, 0x1.83b3c9af7ee45p-404 },
+ { 0x1.ddf9fa6fc513ap-410, 0x1.1f386e3013e68p-404 },
+ { 0x1.61e943a26f542p-410, 0x1.a9826f127d04dp-405 },
+ { 0x1.06044c28d2704p-410, 0x1.3b26ef9596f74p-405 },
+ { 0x1.83eb403668f94p-411, 0x1.d2c68adc24dd3p-406 },
+ { 0x1.1f1fd15ed30fep-411, 0x1.59a199b7c8167p-406 },
+ { 0x1.a8fcbdc7eab51p-412, 0x1.ffcb2bfa5b8dap-407 },
+ { 0x1.3a7bfb4be9962p-412, 0x1.7adf828472cfdp-407 },
+ { 0x1.d15ee90987618p-413, 0x1.1870951a86a79p-407 },
+ { 0x1.584895194492p-413, 0x1.9f1bfa110cbbap-408 },
+ { 0x1.fd57d7b45b3cap-414, 0x1.332fc55367264p-408 },
+ { 0x1.78b8ffae32bfp-414, 0x1.c696d39db75f3p-409 },
+ { 0x1.16996dab0cd1ep-414, 0x1.5051f4ea04fdfp-409 },
+ { 0x1.9c046dcaa75a4p-415, 0x1.f194b2a4cb97p-410 },
+ { 0x1.30a06c462f23ep-415, 0x1.700975cbb46aap-410 },
+ { 0x1.c2662350ce7fap-416, 0x1.102fae0ec7794p-410 },
+ { 0x1.4cec5169fb931p-416, 0x1.928c588cfb6d9p-411 },
+ { 0x1.ec1db7d8e44b5p-417, 0x1.29a3060c44f3ap-411 },
+ { 0x1.6babae8929706p-417, 0x1.b814aa869e0e4p-412 },
+ { 0x1.0cb7ae5506e7ep-417, 0x1.454ee7edd0063p-412 },
+ { 0x1.8d106f7f4047ep-418, 0x1.e0e0b72e6ef2ep-413 },
+ { 0x1.255213192c405p-418, 0x1.6360f251c2f1fp-413 },
+ { 0x1.b1500fc71b69ap-419, 0x1.0699a6631f93fp-413 },
+ { 0x1.40052c8ba04b4p-419, 0x1.840a0d97bb129p-414 },
+ { 0x1.d8a3d24511c07p-420, 0x1.1eaa023d58a69p-414 },
+ { 0x1.5cfadd7b9716p-420, 0x1.a77ea01d8b821p-415 },
+ { 0x1.01a47ddad3ea8p-420, 0x1.38c7c7057a652p-415 },
+ { 0x1.7c5ff3799c35bp-421, 0x1.cdf6c504a93e5p-416 },
+ { 0x1.18c087e86a1f3p-421, 0x1.551bff88c1175p-416 },
+ { 0x1.9e64530b957f4p-422, 0x1.f7ae8590bb8p-417 },
+ { 0x1.31c908986e1a8p-422, 0x1.73d293026bc2ap-417 },
+ { 0x1.c33b25da2082ep-423, 0x1.12730a9790f69p-417 },
+ { 0x1.4ce362055227ep-423, 0x1.951a7082f394ap-418 },
+ { 0x1.eb1b0ae0a386ap-424, 0x1.2af1081b22794p-418 },
+ { 0x1.6a3779e1ff3bp-424, 0x1.b925bc48353ep-419 },
+ { 0x1.0b1f245435eeap-424, 0x1.4575deb5305a2p-419 },
+ { 0x1.89efddb97fd18p-425, 0x1.e029ff0fc8645p-420 },
+ { 0x1.227180cb0a8cap-425, 0x1.6228a92a17423p-420 },
+ { 0x1.ac39e8a7de062p-426, 0x1.05302bb5e3a1ap-420 },
+ { 0x1.3ba5b5279aa24p-426, 0x1.81331d3a2cc81p-421 },
+ { 0x1.d145ea8ff6403p-427, 0x1.1c02d69097c72p-421 },
+ { 0x1.56df011e743b9p-427, 0x1.a2c1b0ae83a64p-422 },
+ { 0x1.f94750d0f9308p-428, 0x1.34ad734ae6135p-422 },
+ { 0x1.7442e7172840ap-428, 0x1.c703bfdc748cdp-423 },
+ { 0x1.123a683e9b9d5p-428, 0x1.4f5290291de6ep-423 },
+ { 0x1.93f94a8e393e5p-429, 0x1.ee2bb5a2a447p-424 },
+ { 0x1.298449094a08p-429, 0x1.6c16f34d9525ep-424 },
+ { 0x1.b62c8f87855a8p-430, 0x1.0c379a70923bcp-424 },
+ { 0x1.42a02f59d51efp-430, 0x1.8b21b8919710fp-425 },
+ { 0x1.db09bb0ffb21fp-431, 0x1.2303a1b68b2dep-425 },
+ { 0x1.5daee76f997a8p-431, 0x1.ac9c706a79cfcp-426 },
+ { 0x1.01604a662bf4cp-431, 0x1.3b983b3f72fb5p-426 },
+ { 0x1.7ad33d50dacdp-432, 0x1.d0b33fd9b6e85p-427 },
+ { 0x1.16c1e4c8c451ap-432, 0x1.5615904c6373ap-427 },
+ { 0x1.9a32159dea0d8p-433, 0x1.f7950165d693dp-428 },
+ { 0x1.2dc48781056c9p-433, 0x1.729dc070c926ap-428 },
+ { 0x1.bbf2871addffbp-434, 0x1.10b9b38c6e833p-428 },
+ { 0x1.4684a4152d4ep-434, 0x1.9154f9f73ee5fp-429 },
+ { 0x1.e03df4eb2c204p-435, 0x1.27418ebfd96bep-429 },
+ { 0x1.6120558a89b12p-435, 0x1.b26192fa2f36ep-430 },
+ { 0x1.03a014bcb5352p-435, 0x1.3f7df7d25b3e6p-430 },
+ { 0x1.7db773a6f6623p-436, 0x1.d5ec232ba3385p-431 },
+ { 0x1.1893b9023690dp-436, 0x1.598c75ff21ea4p-431 },
+ { 0x1.9c6ba6a49465ap-437, 0x1.fc1f9e46a53e2p-432 },
+ { 0x1.2f125d64e7642p-437, 0x1.758c452444076p-432 },
+ { 0x1.bd607b51aff83p-438, 0x1.1294b791c6529p-432 },
+ { 0x1.4735d5e25dd32p-438, 0x1.939e692035be7p-433 },
+ { 0x1.e0bb7795ebab2p-439, 0x1.289cc9b3b4107p-433 },
+ { 0x1.611962fb4b008p-439, 0x1.b3e5c199dc217p-434 },
+ { 0x1.035217aa6e0adp-439, 0x1.40415be2c6028p-434 },
+ { 0x1.7cd9c096da3b3p-440, 0x1.d6871e2c76342p-435 },
+ { 0x1.17a22cd2a508fp-440, 0x1.599d2a64857abp-435 },
+ { 0x1.9a95351e8c9f1p-441, 0x1.fba952efabe51p-436 },
+ { 0x1.2d63f329a8bcbp-441, 0x1.74cc660d4897ap-436 },
+ { 0x1.ba6ba0cb47e2bp-442, 0x1.11baa6a990cd8p-436 },
+ { 0x1.44ae89d144108p-442, 0x1.91ecc31adec4ep-437 },
+ { 0x1.dc7e8d1b8f556p-443, 0x1.270b14a1f9816p-437 },
+ { 0x1.5d9a42222275cp-443, 0x1.b11d883fd3ec1p-438 },
+ { 0x1.00789e350bd1ap-443, 0x1.3ddca348b8e79p-438 },
+ { 0x1.7840aaba80c98p-444, 0x1.d27f9dd765764p-439 },
+ { 0x1.13f45ccd8c935p-444, 0x1.56472f42babf3p-439 },
+ { 0x1.94bc9a9955f26p-445, 0x1.f6359d3980ea5p-440 },
+ { 0x1.28c5f3eaf8eddp-445, 0x1.7063ccd1b83c6p-440 },
+ { 0x1.b32a3c3e46a35p-446, 0x1.0e31f012ad2b3p-440 },
+ { 0x1.3f01c91fe7f47p-446, 0x1.8c4cd2c02ec2dp-441 },
+ { 0x1.d3a718c61d154p-447, 0x1.2298481c2ca0dp-441 },
+ { 0x1.56bd3dd5a05c1p-447, 0x1.aa1de55237abcp-442 },
+ { 0x1.f65222fadfcp-448, 0x1.3861db33230bp-442 },
+ { 0x1.700eb717cfb77p-448, 0x1.c9f401331dbf6p-443 },
+ { 0x1.0da5e12700c8dp-448, 0x1.4fa3a533642f6p-443 },
+ { 0x1.8b0da54d3c71fp-449, 0x1.ebed8656f1a7bp-444 },
+ { 0x1.215aeed941b43p-449, 0x1.6873a105b43c2p-444 },
+ { 0x1.a7d28bd609e5p-450, 0x1.081521636047p-444 },
+ { 0x1.3659f3261d19p-450, 0x1.82e8d038330cap-445 },
+ { 0x1.c6770887b13f6p-451, 0x1.1b65bea6b7e6ap-445 },
+ { 0x1.4cb570f463d9dp-451, 0x1.9f1b427ce89a2p-446 },
+ { 0x1.e715dafe5cd6p-452, 0x1.2ff9fffd4f5f9p-446 },
+ { 0x1.6480ba9b1723cp-452, 0x1.bd241d06b6757p-447 },
+ { 0x1.04e575dd6f2ebp-452, 0x1.45e411382662bp-447 },
+ { 0x1.7dcff6d521467p-453, 0x1.dd1da1bc7ec85p-448 },
+ { 0x1.1759a98201ff3p-453, 0x1.5d36e9f7af39cp-448 },
+ { 0x1.98b82586ccf2dp-454, 0x1.ff233639de02ap-449 },
+ { 0x1.2af6afc0ce651p-454, 0x1.7606528b3cf28p-449 },
+ { 0x1.b54f244df93dfp-455, 0x1.11a8b54a30c34p-449 },
+ { 0x1.3fcc4e4385b18p-455, 0x1.9066e8a3084adp-450 },
+ { 0x1.d3abb2d5b9282p-456, 0x1.24e2ffedd9f78p-450 },
+ { 0x1.55eaec016b2b5p-456, 0x1.ac6e23cde6ac9p-451 },
+ { 0x1.f3e576e5bfb2cp-457, 0x1.394ff72563c26p-451 },
+ { 0x1.6d6394041cb01p-457, 0x1.ca3259bb8013ep-452 },
+ { 0x1.0b0a8012d71fbp-457, 0x1.4effb58fcce2p-452 },
+ { 0x1.8647f7f3a91dep-458, 0x1.e9cac23b8427ep-453 },
+ { 0x1.1d29e5c60946bp-458, 0x1.6602f707600f3p-453 },
+ { 0x1.a0aa72640fd47p-459, 0x1.05a7bd790a4bcp-453 },
+ { 0x1.305e23384e58ap-459, 0x1.7e6b1b23c38f4p-454 },
+ { 0x1.bc9e08de1532fp-460, 0x1.176cc55ca9b8p-454 },
+ { 0x1.44b4e89c6a35fp-460, 0x1.984a277e8539ap-455 },
+ { 0x1.da366d9d2b975p-461, 0x1.2a417253e014bp-455 },
+ { 0x1.5a3c60cb2c6b1p-461, 0x1.b3b2c9b4277c6p-456 },
+ { 0x1.f98800fc076dbp-462, 0x1.3e333559670c8p-456 },
+ { 0x1.71033226bf0afp-462, 0x1.d0b8591b88278p-457 },
+ { 0x1.0d53e944a7e18p-462, 0x1.534ff7f271b4dp-457 },
+ { 0x1.89187f3d75a14p-463, 0x1.ef6ed82d51675p-458 },
+ { 0x1.1ed5d0deddfb7p-463, 0x1.69a61d0edc9d2p-458 },
+ { 0x1.a28be72757b85p-464, 0x1.07f57aca805f1p-458 },
+ { 0x1.3154ef266983dp-464, 0x1.814481a9f253cp-459 },
+ { 0x1.bd6d859990532p-465, 0x1.1921067277b5dp-459 },
+ { 0x1.44dcd404b4fcdp-465, 0x1.9a3a7d2712f82p-460 },
+ { 0x1.d9cdf2aadd6a6p-466, 0x1.2b45137355f77p-460 },
+ { 0x1.5979672b76b96p-466, 0x1.b497e1657b91bp-461 },
+ { 0x1.f7be424410479p-467, 0x1.3e6cfcc06ed27p-461 },
+ { 0x1.6f36e7903ba4fp-467, 0x1.d06cfa865bc4ep-462 },
+ { 0x1.0ba8019bd4e86p-467, 0x1.52a47395ed2aep-462 },
+ { 0x1.8621eaa755f34p-468, 0x1.edca8e605e67ap-463 },
+ { 0x1.1c4a9efdce654p-468, 0x1.67f77ef705254p-463 },
+ { 0x1.9e475b5aaea97p-469, 0x1.0660edcde1e02p-463 },
+ { 0x1.2dd03980220acp-469, 0x1.7e727aec99554p-464 },
+ { 0x1.b7b478b8fda1cp-470, 0x1.16b24c391593bp-464 },
+ { 0x1.40424c4fd21f7p-470, 0x1.96221780dfe95p-465 },
+ { 0x1.d276d459f43c7p-471, 0x1.27e2788696d86p-465 },
+ { 0x1.53aa8c500f5dp-471, 0x1.af1357749947cp-466 },
+ { 0x1.ee9c5073f397ep-472, 0x1.39fac2bf7a531p-466 },
+ { 0x1.6812e6a2e8fcp-472, 0x1.c9538eaa71fbp-467 },
+ { 0x1.06198ecffc0ep-472, 0x1.4d04b3a802aeep-467 },
+ { 0x1.7d857ef6fe55ap-473, 0x1.e4f0604536408p-468 },
+ { 0x1.15a4dc243cc5fp-473, 0x1.610a0b4ec8401p-468 },
+ { 0x1.940cad97ee071p-474, 0x1.00fbde3ac71c6p-468 },
+ { 0x1.25f772e00c70ap-474, 0x1.7614bf61d6bfap-469 },
+ { 0x1.abb2fd3f529efp-475, 0x1.103beefa0765p-469 },
+ { 0x1.3718d87e8a0afp-475, 0x1.8c2ef94786008p-470 },
+ { 0x1.c48328a4346ebp-476, 0x1.203fa39242793p-470 },
+ { 0x1.4910b37b4de72p-476, 0x1.a36313f8e64ecp-471 },
+ { 0x1.de8817c6f33b9p-477, 0x1.310e5f6fbfd44p-471 },
+ { 0x1.5be6c950a7e6fp-477, 0x1.bbbb999bb060ap-472 },
+ { 0x1.f9ccdcf7c94fep-478, 0x1.42afa66f9fdc1p-472 },
+ { 0x1.6fa2fc442a9d3p-478, 0x1.d54340d9c375dp-473 },
+ { 0x1.0b2e58cb15f5cp-478, 0x1.552b1ae6aeaa2p-473 },
+ { 0x1.844d490056942p-479, 0x1.f004e9f45a94bp-474 },
+ { 0x1.1a217943b9ac7p-479, 0x1.68887b7750462p-474 },
+ { 0x1.99edc3fa555f4p-480, 0x1.0605cdc8a1e5ep-474 },
+ { 0x1.29c58e31af831p-480, 0x1.7ccfa0b55e3f7p-475 },
+ { 0x1.b08c96a2d341cp-481, 0x1.14b13fa04509fp-475 },
+ { 0x1.3a2063aa9bfc9p-481, 0x1.92087a96ea8f4p-476 },
+ { 0x1.c831fc61280f7p-482, 0x1.240a6edc95f53p-476 },
+ { 0x1.4b37d15842e1dp-482, 0x1.a83b0db0fa5b6p-477 },
+ { 0x1.e0e63f582488bp-483, 0x1.34170d65d2fe5p-477 },
+ { 0x1.5d11b81c3fea7p-483, 0x1.bf6f703f6c8b1p-478 },
+ { 0x1.fab1b4f400c2ep-484, 0x1.44dcd884a52dcp-478 },
+ { 0x1.6fb3ff8ccf41cp-484, 0x1.d7adc6f76430fp-479 },
+ { 0x1.0ace5d20891a2p-484, 0x1.5661968fc8c68p-479 },
+ { 0x1.8324934a763f4p-485, 0x1.f0fe41a3b588bp-480 },
+ { 0x1.18d7d8058e531p-485, 0x1.68ab147365bffp-480 },
+ { 0x1.9769602e7d2c4p-486, 0x1.05b48bc57ed71p-480 },
+ { 0x1.27797b62a04a4p-486, 0x1.7bbf2311e9661p-481 },
+ { 0x1.ac8851524d431p-487, 0x1.137b41cf9c9a4p-481 },
+ { 0x1.36b7751d5da7fp-487, 0x1.8fa3947e525d9p-482 },
+ { 0x1.c2874cefea298p-488, 0x1.21d7603b6e2ccp-482 },
+ { 0x1.4695ee8470b66p-488, 0x1.a45e3910021acp-483 },
+ { 0x1.d96c311be3eb3p-489, 0x1.30cd0207d04edp-483 },
+ { 0x1.571909f179506p-489, 0x1.b9f4dc504a668p-484 },
+ { 0x1.f13cd05945d89p-490, 0x1.40603dadb780ap-484 },
+ { 0x1.6844e0504f766p-490, 0x1.d06d41c212c13p-485 },
+ { 0x1.04ff770417c7ep-490, 0x1.509522cc01f2fp-485 },
+ { 0x1.7a1d7e8c27e5p-491, 0x1.e7cd2184183ebp-486 },
+ { 0x1.11dc1d57f7df8p-491, 0x1.616fb7b910c11p-486 },
+ { 0x1.8ca6e2e342651p-492, 0x1.000d1267395e3p-486 },
+ { 0x1.1f372812d1e14p-492, 0x1.72f3f6faafe57p-487 },
+ { 0x1.9fe4fa21e8c98p-493, 0x1.0cacf12619fe1p-487 },
+ { 0x1.2d1356c845fd1p-493, 0x1.8525cca4f244dp-488 },
+ { 0x1.b3db9cc5a58f3p-494, 0x1.19c8ed29100e2p-488 },
+ { 0x1.3b7359a6b9391p-494, 0x1.980913a0c5f1ep-489 },
+ { 0x1.c88e8c09b9bb2p-495, 0x1.2763b979d57b5p-489 },
+ { 0x1.4a59cf5958098p-495, 0x1.aba192db244fdp-490 },
+ { 0x1.de016eddfacadp-496, 0x1.357ff9fbc97f4p-490 },
+ { 0x1.59c942db45eaep-496, 0x1.bff2fa5de1e9dp-491 },
+ { 0x1.f437cec9632b8p-497, 0x1.44204156d00fcp-491 },
+ { 0x1.69c4293cefa3fp-497, 0x1.d500e0534289dp-492 },
+ { 0x1.059a8a5ce0ce7p-497, 0x1.53470ed39dd97p-492 },
+ { 0x1.7a4cdf5c8de47p-498, 0x1.eacebdf5973c2p-493 },
+ { 0x1.117e42e10afc5p-498, 0x1.62f6cc2a62dbdp-493 },
+ { 0x1.8b65a792fe14p-499, 0x1.00aff63626acfp-493 },
+ { 0x1.1dc89fe4a5f8ap-499, 0x1.7331cb44dd6ecp-494 },
+ { 0x1.9d10a7562f377p-500, 0x1.0c5bd0cbfba3p-494 },
+ { 0x1.2a7b1b1593291p-500, 0x1.83fa43f4f73d5p-495 },
+ { 0x1.af4fe4d278bf9p-501, 0x1.186c76677c8f7p-495 },
+ { 0x1.37971726a776ep-501, 0x1.955251a12574cp-496 },
+ { 0x1.c225447c48b85p-502, 0x1.24e359c6528bbp-496 },
+ { 0x1.451dde15504ecp-502, 0x1.a73bf0e7dcf7bp-497 },
+ { 0x1.d592869bae136p-503, 0x1.31c1d70a5a26cp-497 },
+ { 0x1.53109f6b70a02p-503, 0x1.b9b8fd3b82acep-498 },
+ { 0x1.e99944d35a898p-504, 0x1.3f09320694d4p-498 },
+ { 0x1.61706e7ea0b42p-504, 0x1.cccb2e7856e93p-499 },
+ { 0x1.fe3aefa4cdaa2p-505, 0x1.4cba948866255p-499 },
+ { 0x1.703e40ae0b133p-505, 0x1.e0741675f15a5p-500 },
+ { 0x1.09bc65f9b8064p-505, 0x1.5ad70c9e433d4p-500 },
+ { 0x1.7f7aeba02f7efp-506, 0x1.f4b51e95f89d5p-501 },
+ { 0x1.14a9f8443d058p-506, 0x1.695f8add0a062p-501 },
+ { 0x1.8f272381e3222p-507, 0x1.04c7c2a8ead79p-501 },
+ { 0x1.1fe6a1ccca721p-507, 0x1.7854e0a5444cfp-502 },
+ { 0x1.9f437947f2743p-508, 0x1.0f822de49bc54p-502 },
+ { 0x1.2b72bc2a1bb29p-508, 0x1.87b7be69a8c26p-503 },
+ { 0x1.afd058f4d5cb9p-509, 0x1.1a8a41a9a734p-503 },
+ { 0x1.374e8637e822fp-509, 0x1.9788b1f83908ep-504 },
+ { 0x1.c0ce07e3f5247p-510, 0x1.25e0558a5c077p-504 },
+ { 0x1.437a22e46ffc9p-510, 0x1.a7c824c7683f1p-505 },
+ { 0x1.d23ca31c0220cp-511, 0x1.3184a6ce13b46p-505 },
+ { 0x1.4ff5980398e02p-511, 0x1.b8765a48c0cf1p-506 },
+ { 0x1.e41c1da9f8a5fp-512, 0x1.3d775743f06aep-506 },
+ { 0x1.5cc0cd28b81e5p-512, 0x1.c9936e428a9d9p-507 },
+ { 0x1.f66c3f065ea05p-513, 0x1.49b86c1b194cep-507 },
+ { 0x1.69db8a882e29p-513, 0x1.db1f5331fbe71p-508 },
+ { 0x1.049650c331274p-513, 0x1.5647ccc18e717p-508 },
+ { 0x1.774577e1faf4fp-514, 0x1.ed19d0b78718cp-509 },
+ { 0x1.0e2e586d3df5cp-514, 0x1.632541cab3acp-509 },
+ { 0x1.84fe1b767669bp-515, 0x1.ff82820edeaabp-510 },
+ { 0x1.17fdd44e1dc6cp-515, 0x1.705073deb552ap-510 },
+ { 0x1.9304d9065a4b9p-516, 0x1.092c6a4a26abfp-510 },
+ { 0x1.220449767742ap-516, 0x1.7dc8eab3ed87ap-511 },
+ { 0x1.a158f0df4c356p-517, 0x1.12ce032c827cep-511 },
+ { 0x1.2c4123936432bp-517, 0x1.8b8e0c1372c25p-512 },
+ { 0x1.aff97ef6163edp-518, 0x1.1ca5926404568p-512 },
+ { 0x1.36b3b4511d82bp-518, 0x1.999f1ae9f978bp-513 },
+ { 0x1.bee57a0fbbbdcp-519, 0x1.26b285aeabdbep-513 },
+ { 0x1.415b32c89327cp-519, 0x1.a7fb366632c72p-514 },
+ { 0x1.ce1bb2fa9523ep-520, 0x1.30f431387ee69p-514 },
+ { 0x1.4c36baf8c2285p-520, 0x1.b6a15925d0c25p-515 },
+ { 0x1.dd9ad3d89a4a5p-521, 0x1.3b69cf0bd5608p-515 },
+ { 0x1.57454d4c97f21p-521, 0x1.c590587256b75p-516 },
+ { 0x1.ed615f7bfd7d2p-522, 0x1.46127e8d37ba7p-516 },
+ { 0x1.6285ce2e2e29bp-522, 0x1.d4c6e38ed7f06p-517 },
+ { 0x1.fd6db0d73348ep-523, 0x1.50ed44039bd53p-517 },
+ { 0x1.6df705a8252f7p-523, 0x1.e4438317c2a1ep-518 },
+ { 0x1.06defd40bdb09p-523, 0x1.5bf9082dc8412p-518 },
+ { 0x1.79979f15ddb0dp-524, 0x1.f4049875ce63p-519 },
+ { 0x1.0f2823287afb6p-524, 0x1.673497e5a0d03p-519 },
+ { 0x1.856628e34ac2cp-525, 0x1.02042eb28efefp-519 },
+ { 0x1.17913a85a33a7p-525, 0x1.729ea3d219a53p-520 },
+ { 0x1.9161145d0e326p-526, 0x1.0a2671c8cdbeep-520 },
+ { 0x1.20191f16dc709p-526, 0x1.7e35c0288722ep-521 },
+ { 0x1.9d86b59187f4ep-527, 0x1.12680a24c58f5p-521 },
+ { 0x1.28be97e6e9065p-527, 0x1.89f8647df9662p-522 },
+ { 0x1.a9d5434377e7bp-528, 0x1.1ac7d823a316cp-522 },
+ { 0x1.31805749922c3p-528, 0x1.95e4eba9494cap-523 },
+ { 0x1.b64ad6eec66d3p-529, 0x1.2344a7c981006p-523 },
+ { 0x1.3a5cfae5998ecp-529, 0x1.a1f993b67371dp-524 },
+ { 0x1.c2e56cdffce02p-530, 0x1.2bdd30bebc795p-524 },
+ { 0x1.43530bcc0ee3ap-530, 0x1.ae347debd307p-525 },
+ { 0x1.cfa2e45eea63dp-531, 0x1.3490165a1de5p-525 },
+ { 0x1.4c60fe9d5cbc1p-531, 0x1.ba93aee1c301fp-526 },
+ { 0x1.dc80ffece4451p-532, 0x1.3d5be7b8309a9p-526 },
+ { 0x1.558533bc564e3p-532, 0x1.c7150ead1fd0ep-527 },
+ { 0x1.e97d659702f92p-533, 0x1.463f1fe01b7dap-527 },
+ { 0x1.5ebdf78f85a03p-533, 0x1.d3b6691d169e3p-528 },
+ { 0x1.f6959f5cadd73p-534, 0x1.4f3825f642bp-528 },
+ { 0x1.680982d0eea8ap-534, 0x1.e0756e0ca137bp-529 },
+ { 0x1.01e38dd55bfc7p-534, 0x1.58454d7cf072p-529 },
+ { 0x1.7165faec70a1p-535, 0x1.ed4fb1c7fef16p-530 },
+ { 0x1.088796f5a026p-535, 0x1.6164d6a338985p-530 },
+ { 0x1.7ad1726ce2f3cp-536, 0x1.fa42ad866b6p-531 },
+ { 0x1.0f3587953aeb5p-536, 0x1.6a94eea23ecd2p-531 },
+ { 0x1.8449e977fef01p-537, 0x1.03a5dffc21d0dp-531 },
+ { 0x1.15ebef6827c9dp-537, 0x1.73d3b028fc2cfp-532 },
+ { 0x1.8dcd4e591ac76p-538, 0x1.0a3416f4dd0f1p-532 },
+ { 0x1.1ca951b79a938p-538, 0x1.7d1f23d694b62p-533 },
+ { 0x1.97597e1aad586p-539, 0x1.10ca917d13a59p-533 },
+ { 0x1.236c25d3c18a2p-539, 0x1.867540c340902p-534 },
+ { 0x1.a0ec452e85047p-540, 0x1.1767d933fa0f7p-534 },
+ { 0x1.2a32d78fe110fp-540, 0x1.8fd3ed17c059fp-535 },
+ { 0x1.aa8360248e3edp-541, 0x1.1e0a6bf884441p-535 },
+ { 0x1.30fbc7c8ab284p-541, 0x1.9938feb3469d1p-536 },
+ { 0x1.b41c7c6ff8cc6p-542, 0x1.24b0bc63cac6bp-536 },
+ { 0x1.37c54cf4ab1fcp-542, 0x1.a2a23bdfb3241p-537 },
+ { 0x1.bdb5393a7ccd2p-543, 0x1.2b59324d7fd9bp-537 },
+ { 0x1.3e8db3be9418cp-543, 0x1.ac0d5c13ef72ap-538 },
+ { 0x1.c74b284572b4cp-544, 0x1.32022b5a4d882p-538 },
+ { 0x1.45533fa93710cp-544, 0x1.b57808c42df0bp-539 },
+ { 0x1.d0dbced86364cp-545, 0x1.38a9fb93eb86p-539 },
+ { 0x1.4c142bbcdb51bp-545, 0x1.bedfde3fbf9f1p-540 },
+ { 0x1.da64a6bca7adp-546, 0x1.3f4eee0ab230dp-540 },
+ { 0x1.52ceab3daa53bp-546, 0x1.c8426c9c266d4p-541 },
+ { 0x1.e3e31f45a0a96p-547, 0x1.45ef458066425p-541 },
+ { 0x1.5980ea6ad6692p-547, 0x1.d19d38acfc932p-542 },
+ { 0x1.ed549e6504cf2p-548, 0x1.4c893d1bef1fep-542 },
+ { 0x1.60290f4619f98p-548, 0x1.daedbd083bb8ep-543 },
+ { 0x1.f6b681cab013bp-549, 0x1.531b0925a021ep-543 },
+ { 0x1.66c53a6323b06p-549, 0x1.e4316b16614afp-544 },
+ { 0x1.00031007ac3e3p-549, 0x1.59a2d7cbb3c39p-544 },
+ { 0x1.6d5387be7adf6p-550, 0x1.ed65ac2de0264p-545 },
+ { 0x1.04a064f4bdd38p-550, 0x1.601ed1ee8e719p-545 },
+ { 0x1.73d20f9b5e73bp-551, 0x1.f687e2b942e41p-546 },
+ { 0x1.0931e5b5e6c43p-551, 0x1.668d1bf455ad8p-546 },
+ { 0x1.7a3ee7681856fp-552, 0x1.ff956b675583bp-547 },
+ { 0x1.0db636a632668p-552, 0x1.6cebd6a35f863p-547 },
+ { 0x1.809822a836e1fp-553, 0x1.0445cf3250898p-547 },
+ { 0x1.122bfb19eafe7p-553, 0x1.73392002f5fc2p-548 },
+ { 0x1.86dbd3e416493p-554, 0x1.08b3e84ebc2b9p-548 },
+ { 0x1.1691d609b1ec9p-554, 0x1.79731441e1e21p-549 },
+ { 0x1.8d080d9d1c96dp-555, 0x1.0d13aa83e4b01p-549 },
+ { 0x1.1ae66ac0b0b6ap-555, 0x1.7f97cea22928bp-550 },
+ { 0x1.931ae34603f62p-556, 0x1.1163bef9eebc1p-550 },
+ { 0x1.1f285d8d6c817p-556, 0x1.85a56a6965552p-551 },
+ { 0x1.99126a3e88ca5p-557, 0x1.15a2cf3193875p-551 },
+ { 0x1.23565474c154ep-557, 0x1.8b9a03d510324p-552 },
+ { 0x1.9eecbad1cb519p-558, 0x1.19cf85b21a11fp-552 },
+ { 0x1.276ef7e686addp-558, 0x1.9173b9121e9f7p-553 },
+ { 0x1.a4a7f136af77ep-559, 0x1.1de88eb969b39p-553 },
+ { 0x1.2b70f3735b79fp-559, 0x1.9730ab373bc61p-554 },
+ { 0x1.aa422e918100dp-560, 0x1.21ec98edb9593p-554 },
+ { 0x1.2f5af68314ac2p-560, 0x1.9cceff40f1fb1p-555 },
+ { 0x1.afb999f61e5d4p-561, 0x1.25da56105b758p-555 },
+ { 0x1.332bb50b471fbp-561, 0x1.a24cdf0f0a2e7p-556 },
+ { 0x1.b50c6169e961bp-562, 0x1.29b07bb123c75p-556 },
+ { 0x1.36e1e845638bbp-562, 0x1.a7a87a6267113p-557 },
+ { 0x1.ba38bae4baa67p-563, 0x1.2d6dc3e1e1b47p-557 },
+ { 0x1.3a7c4f63d9d53p-563, 0x1.ace007da9e0c8p-558 },
+ { 0x1.bf3ce55012ad1p-564, 0x1.3110ede9680cep-558 },
+ { 0x1.3df9b045b81fcp-564, 0x1.b1f1c5f28dcc9p-559 },
+ { 0x1.c4172983c2f7ep-565, 0x1.3498bef599a58p-559 },
+ { 0x1.4158d828399aep-565, 0x1.b6dbfbfb30836p-560 },
+ { 0x1.c8c5db3f49157p-566, 0x1.380402cbf1542p-560 },
+ { 0x1.44989c55b9312p-566, 0x1.bb9cfb13e7262p-561 },
+ { 0x1.cd475a1f163eep-567, 0x1.3b518c77fb7d2p-561 },
+ { 0x1.47b7dad17cf31p-567, 0x1.c0331f1f7ac71p-562 },
+ { 0x1.d19a128cff8a4p-568, 0x1.3e8036f737914p-562 },
+ { 0x1.4ab57affd05a9p-568, 0x1.c49ccfb511d2cp-563 },
+ { 0x1.d5bc7eab14dfbp-569, 0x1.418ee5e1d890ep-563 },
+ { 0x1.4d906e49e5535p-569, 0x1.c8d8810c585d4p-564 },
+ { 0x1.d9ad27381fd3dp-570, 0x1.447c860fdcf2cp-564 },
+ { 0x1.5047b0bcf6527p-570, 0x1.cce4b4e41cdcap-565 },
+ { 0x1.dd6aa46d0f45cp-571, 0x1.47480e39f8181p-565 },
+ { 0x1.52da49a426b16p-571, 0x1.d0bffb62a59f5p-566 },
+ { 0x1.e0f39ed2991f9p-572, 0x1.49f07f95c9d66p-566 },
+ { 0x1.55474c1ca1f2bp-572, 0x1.d468f3ef07049p-567 },
+ { 0x1.e446d00e60d84p-573, 0x1.4c74e66ce3841p-567 },
+ { 0x1.578dd7a37e92bp-573, 0x1.d7de4e02c6f6fp-568 },
+ { 0x1.e76303a6f7572p-574, 0x1.4ed45aae1d60cp-568 },
+ { 0x1.59ad189ced845p-574, 0x1.db1ec9f31f5e1p-569 },
+ { 0x1.ea4717be0f8c8p-575, 0x1.510e0078c325ep-569 },
+ { 0x1.5ba448d444792p-575, 0x1.de2939b1372f7p-570 },
+ { 0x1.ecf1fdc04a7dbp-576, 0x1.532108a122ff3p-570 },
+ { 0x1.5d72aff4768dap-576, 0x1.e0fc8180b06b8p-571 },
+ { 0x1.ef62bb0a0594ap-577, 0x1.550cb12e0f1dbp-571 },
+ { 0x1.5f17a3f894e1dp-577, 0x1.e39798a3f0a89p-572 },
+ { 0x1.f19869809eb8ap-578, 0x1.56d045cee7811p-572 },
+ { 0x1.60928993f7077p-578, 0x1.e5f989fd91cadp-573 },
+ { 0x1.f392381fab056p-579, 0x1.586b2049c7737p-573 },
+ { 0x1.61e2d491b1f68p-579, 0x1.e82174a67122fp-574 },
+ { 0x1.f54f6b79a6d5fp-580, 0x1.59dca8e17880fp-574 },
+ { 0x1.6308082b0b65cp-580, 0x1.ea0e8c77dc629p-575 },
+ { 0x1.f6cf5e2bb03dcp-581, 0x1.5b2456b2d3672p-575 },
+ { 0x1.6401b7549eebbp-581, 0x1.ebc01a8965943p-576 },
+ { 0x1.f8118143e7ebp-582, 0x1.5c41b0093e8e9p-576 },
+ { 0x1.64cf8501f223bp-582, 0x1.ed357da1f18bap-577 },
+ { 0x1.f9155c9a1fbd1p-583, 0x1.5d344aaa010f1p-577 },
+ { 0x1.6571245f3d39ap-583, 0x1.ee6e2a9b9efdp-578 },
+ { 0x1.f9da8f1a8a0ccp-584, 0x1.5dfbcc1628fd2p-578 },
+ { 0x1.65e6590135ap-584, 0x1.ef69acba2f951p-579 },
+ { 0x1.fa60cf0228aadp-585, 0x1.5e97e9c2cbc7fp-579 },
+ { 0x1.662ef70ab154bp-585, 0x1.f027a5f3a7f56p-580 },
+ { 0x1.faa7ea0cc6ecbp-586, 0x1.5f0869476fb64p-580 },
+ { 0x1.664ae34801e0ep-586, 0x1.f0a7cf2ae7563p-581 },
+ { 0x1.faafc59456a8cp-587, 0x1.5f4d2082760f5p-581 },
+ { 0x1.663a133fef35p-587, 0x1.f0e9f85c03b41p-582 },
+ { 0x1.fa785ea194bf2p-588, 0x1.5f65f5b366281p-582 },
+ { 0x1.65fc8d3a43882p-588, 0x1.f0ee08ba43cd5p-583 },
+ { 0x1.fa01c9ede6a16p-589, 0x1.5f52df8b025d3p-583 },
+ { 0x1.6592683be2829p-589, 0x1.f0b3febf9cbcdp-584 },
+ { 0x1.f94c33d66f35bp-590, 0x1.5f13e53118eaap-584 },
+ { 0x1.64fbcbf86f1abp-590, 0x1.f03bf02da5a7ap-585 },
+ { 0x1.f857e040665ap-591, 0x1.5ea91e400b8afp-585 },
+ { 0x1.6438f0b98cabp-591, 0x1.ef860a0000a7ap-586 },
+ { 0x1.f7252a6ecb2bbp-592, 0x1.5e12b2b611c72p-586 },
+ { 0x1.634a1f3bd0d7ep-592, 0x1.ee92905044d53p-587 },
+ { 0x1.f5b484c995f72p-593, 0x1.5d50dadc42d9dp-587 },
+ { 0x1.622fb08184d56p-593, 0x1.ed61de2b81fc4p-588 },
+ { 0x1.f40678969b4f4p-594, 0x1.5c63df237cf4dp-588 },
+ { 0x1.60ea0d9b5d711p-594, 0x1.ebf4655983167p-589 },
+ { 0x1.f21ba5a45e2afp-595, 0x1.5b4c17f7488b1p-589 },
+ { 0x1.5f79af6759efdp-595, 0x1.ea4aae160108ap-590 },
+ { 0x1.eff4c1e71b057p-596, 0x1.5a09ed86def16p-590 },
+ { 0x1.5ddf1e460242cp-596, 0x1.e86556bc034fep-591 },
+ { 0x1.ed92990861c73p-597, 0x1.589dd784842fp-591 },
+ { 0x1.5c1af1c6454bep-597, 0x1.e6451363b8311p-592 },
+ { 0x1.eaf60be99fa59p-598, 0x1.57085cdb6c23ep-592 },
+ { 0x1.5a2dd0483fd76p-598, 0x1.e3eaad7319948p-593 },
+ { 0x1.e820101a05296p-599, 0x1.554a135c6b3d2p-593 },
+ { 0x1.58186e973c8cbp-599, 0x1.e1570321beee3p-594 },
+ { 0x1.e511af403f0e1p-600, 0x1.53639f61bab8bp-594 },
+ { 0x1.55db8f7b445c6p-600, 0x1.de8b06f0475d8p-595 },
+ { 0x1.e1cc067882b19p-601, 0x1.5155b36a1ff17p-595 },
+ { 0x1.537803429dd3dp-601, 0x1.db87bf13d1856p-596 },
+ { 0x1.de5045a77840fp-602, 0x1.4f210fabcd4fep-596 },
+ { 0x1.50eea743a03bp-602, 0x1.d84e44d6006fdp-597 },
+ { 0x1.da9faec295ac1p-603, 0x1.4cc6819f5a3a9p-597 },
+ { 0x1.4e406557456e3p-603, 0x1.d4dfc3ea1615fp-598 },
+ { 0x1.d6bb950e85a76p-604, 0x1.4a46e38335bf7p-598 },
+ { 0x1.4b6e334ceafc3p-604, 0x1.d13d79b7b4d75p-599 },
+ { 0x1.d2a55c543d97bp-605, 0x1.47a31bd7fd98ap-599 },
+ { 0x1.48791257b832ep-605, 0x1.cd68b49be13bdp-600 },
+ { 0x1.ce5e780d6c294p-606, 0x1.44dc1cd628aecp-600 },
+ { 0x1.45620e7623619p-606, 0x1.c962d320e4c77p-601 },
+ { 0x1.c9e86a88f07ffp-607, 0x1.41f2e3dd79383p-601 },
+ { 0x1.422a3dd414b5ep-607, 0x1.c52d432db963cp-602 },
+ { 0x1.c544c4080f626p-608, 0x1.3ee878deaf1c1p-602 },
+ { 0x1.3ed2c02828af5p-608, 0x1.c0c9812daaed1p-603 },
+ { 0x1.c07521d52071ep-609, 0x1.3bbdedbff743p-603 },
+ { 0x1.3b5cbe0c97302p-609, 0x1.bc391730e1bf4p-604 },
+ { 0x1.bb7b2d547171ap-610, 0x1.38745dbc97fd1p-604 },
+ { 0x1.37c9685446b6bp-610, 0x1.b77d9c068db21p-605 },
+ { 0x1.b6589b1020c3ep-611, 0x1.350cecc05d9cfp-605 },
+ { 0x1.3419f75c953bcp-611, 0x1.b298b2516cc35p-606 },
+ { 0x1.b10f29bfb2a68p-612, 0x1.3188c6bf4cd49p-606 },
+ { 0x1.304faa5c619afp-612, 0x1.ad8c07976bbcp-607 },
+ { 0x1.aba0a14c264ccp-613, 0x1.2de91f0a22435p-607 },
+ { 0x1.2c6bc6b0e1424p-613, 0x1.a859534d21642p-608 },
+ { 0x1.a60ed1d150c44p-614, 0x1.2a2f2fa027fc3p-608 },
+ { 0x1.286f9728ce321p-614, 0x1.a30255dde65bep-609 },
+ { 0x1.a05b929d439abp-615, 0x1.265c387eea954p-609 },
+ { 0x1.245c6b4e79163p-615, 0x1.9d88d7b14c6d3p-610 },
+ { 0x1.9a88c12e847c2p-616, 0x1.22717ef05792fp-610 },
+ { 0x1.203396b14a77p-616, 0x1.97eea82eb8229p-611 },
+ { 0x1.94984031d9858p-617, 0x1.1e704cd7ceb7cp-611 },
+ { 0x1.1bf6702f3caf4p-617, 0x1.92359cbfdea74p-612 },
+ { 0x1.8e8bf6806bcabp-618, 0x1.1a59effeaeef1p-612 },
+ { 0x1.17a6513ed67fap-618, 0x1.8c5f8fd2e86f6p-613 },
+ { 0x1.8865ce1efe9b6p-619, 0x1.162fb960e6361p-613 },
+ { 0x1.1344953a2bc16p-619, 0x1.866e5fdcf6e5cp-614 },
+ { 0x1.8227b33ef66f4p-620, 0x1.11f2fc7a0a0a9p-614 },
+ { 0x1.0ed298ab66e97p-620, 0x1.8063ee5dc8676p-615 },
+ { 0x1.7bd39341e60d2p-621, 0x1.0da50e937b941p-615 },
+ { 0x1.0a51b89b5ac38p-621, 0x1.7a421ee53231bp-616 },
+ { 0x1.756b5bc0538cfp-622, 0x1.0947461417eb2p-616 },
+ { 0x1.05c351e298147p-622, 0x1.740ad61b23997p-617 },
+ { 0x1.6ef0f9946142ep-623, 0x1.04daf9d1f19dp-617 },
+ { 0x1.0128c07d7eac9p-623, 0x1.6dbff8cae0f32p-618 },
+ { 0x1.686657e900799p-624, 0x1.006180668cd93p-618 },
+ { 0x1.f906bdc779cfcp-625, 0x1.67636af21f0cbp-619 },
+ { 0x1.61cd5f4e4d33cp-625, 0x1.f7b85f0c272bbp-620 },
+ { 0x1.efa90ac757637p-626, 0x1.60f70ed4a200ep-620 },
+ { 0x1.5b27f4d3aafafp-626, 0x1.ee98b6b3e4f34p-621 },
+ { 0x1.e63b1303dfbfbp-627, 0x1.5a7cc414fb8aap-621 },
+ { 0x1.5477f92833195p-627, 0x1.e566abbe94f87p-622 },
+ { 0x1.dcbf7abb88524p-628, 0x1.53f666d2fde17p-622 },
+ { 0x1.4dbf47c1fc8ap-628, 0x1.dc24dc933bf6dp-623 },
+ { 0x1.d338de3492428p-629, 0x1.4d65ced070949p-623 },
+ { 0x1.46ffb60cbd76p-629, 0x1.d2d5e0d43505p-624 },
+ { 0x1.c9a9d09a6515fp-630, 0x1.46ccce9c8cdf5p-624 },
+ { 0x1.403b12a03d499p-630, 0x1.c97c4837b573ep-625 },
+ { 0x1.c014dae645fc3p-631, 0x1.402d32c6be96dp-625 },
+ { 0x1.3973247f05596p-631, 0x1.c01a996aebdb3p-626 },
+ { 0x1.b67c7ad400b86p-632, 0x1.3988c1191e211p-626 },
+ { 0x1.32a9aa5db4bb3p-632, 0x1.b6b3510058b7ap-627 },
+ { 0x1.ace321e309c7bp-633, 0x1.32e137db0ef23p-627 },
+ { 0x1.2be059f3526f7p-633, 0x1.ad48e069f2207p-628 },
+ { 0x1.a34b346493cc3p-634, 0x1.2c384d1c64d5bp-628 },
+ { 0x1.2518df52ef492p-634, 0x1.a3ddacff96f65p-629 },
+ { 0x1.99b70897047dcp-635, 0x1.258fae0968e74p-629 },
+ { 0x1.1e54dc4edf3a3p-635, 0x1.9a740f1248851p-630 },
+ { 0x1.9028e5cf277c7p-636, 0x1.1ee8fe480d92cp-630 },
+ { 0x1.1795e7e5c7ccap-636, 0x1.910e510c93fe1p-631 },
+ { 0x1.86a303af6f699p-637, 0x1.1845d75e974c6p-631 },
+ { 0x1.10dd8db9b7b2p-637, 0x1.87aeaea087811p-632 },
+ { 0x1.7d27896d87b8ep-638, 0x1.11a7c823f5ff5p-632 },
+ { 0x1.0a2d4d917179ap-638, 0x1.7e57540380a9p-633 },
+ { 0x1.73b88d266bc5ap-639, 0x1.0b10543a01766p-633 },
+ { 0x1.03869ae409b27p-639, 0x1.750a5d3814d59p-634 },
+ { 0x1.6a58134129f18p-640, 0x1.0480f391c14fcp-634 },
+ { 0x1.f9d5b8ddde221p-641, 0x1.6bc9d56645be6p-635 },
+ { 0x1.61080de06bfbp-641, 0x1.fbf623f3bedbap-636 },
+ { 0x1.ecb6d7acd34f7p-642, 0x1.6297b642274f2p-636 },
+ { 0x1.57ca5c62d05ddp-642, 0x1.ef001d6eb49dfp-637 },
+ { 0x1.dfb32aa129cc6p-643, 0x1.5975e7810e7p-637 },
+ { 0x1.4ea0caf213789p-643, 0x1.e222785106b16p-638 },
+ { 0x1.d2cd2eb59de4cp-644, 0x1.50663e5d53392p-638 },
+ { 0x1.458d1220fa79dp-644, 0x1.d55fbee497ep-639 },
+ { 0x1.c60744f31e198p-645, 0x1.476a7d28a437bp-639 },
+ { 0x1.3c90d697e5b5dp-645, 0x1.c8ba606fb6833p-640 },
+ { 0x1.b963b20518321p-646, 0x1.3e8452ecdbe84p-640 },
+ { 0x1.33ada8cfe418fp-646, 0x1.bc34b0b8bbc6p-641 },
+ { 0x1.ace49de2283aep-647, 0x1.35b55b1b3d652p-641 },
+ { 0x1.2ae504dc15f24p-647, 0x1.afd0e79df00ebp-642 },
+ { 0x1.a08c1388db34fp-648, 0x1.2cff1d49f192cp-642 },
+ { 0x1.223852412258p-648, 0x1.a39120c175c51p-643 },
+ { 0x1.945c00d028182p-649, 0x1.24630cff92d39p-643 },
+ { 0x1.19a8e3da77fbep-649, 0x1.97775b48ec1aap-644 },
+ { 0x1.8856364b336c5p-650, 0x1.1be2898c8a8a4p-644 },
+ { 0x1.1137f7cd08642p-650, 0x1.8b8579b06ca2cp-645 },
+ { 0x1.7c7c673fe436ep-651, 0x1.137eddf1f97aep-645 },
+ { 0x1.08e6b787233bap-651, 0x1.7fbd41b078795p-646 },
+ { 0x1.70d029afc4472p-652, 0x1.0b3940d5da6fcp-646 },
+ { 0x1.00b637cd0ec0bp-652, 0x1.74205c365c73ep-647 },
+ { 0x1.6552f6729a259p-653, 0x1.0312d48405757p-647 },
+ { 0x1.f14ef1a3e4ac2p-654, 0x1.68b0556e87723p-648 },
+ { 0x1.5a06296220023p-654, 0x1.f6194df7630e5p-649 },
+ { 0x1.e176ccb941b53p-655, 0x1.5d6e9ce0425a7p-649 },
+ {
0x1.4eeb0196310cdp-655, 0x1.e64f64121563ep-650 }, + { 0x1.d1e5afef936dap-656, 0x1.525c859a2ea9ap-650 }, + { 0x1.4402a1b0bd9dfp-656, 0x1.d6c9b6d4d6fc5p-651 }, + { 0x1.c29d225a230e3p-657, 0x1.477b466ee6cc1p-651 }, + { 0x1.394e1038ce88ep-657, 0x1.c789ea0183d02p-652 }, + { 0x1.b39e83951bdaap-658, 0x1.3ccbfa4112a58p-652 }, + { 0x1.2ece3803d8d68p-658, 0x1.b8917a154498bp-653 }, + { 0x1.a4eb0c6436cf4p-659, 0x1.324fa05e3adc4p-653 }, + { 0x1.2483e8ac9d061p-659, 0x1.a9e1bcd30af1fp-654 }, + { 0x1.9683cf6400112p-660, 0x1.28071ce79e917p-654 }, + { 0x1.1a6fd716c7c18p-660, 0x1.9b7be1e1550cbp-655 }, + { 0x1.8869b9cc95345p-661, 0x1.1df33948493fap-655 }, + { 0x1.10929dfe85b79p-661, 0x1.8d60f37a227b9p-656 }, + { 0x1.7a9d9444b613ep-662, 0x1.1414a4b7a1729p-656 }, + { 0x1.06ecbe9338febp-662, 0x1.7f91d72bfd333p-657 }, + { 0x1.6d2003c3fdf54p-663, 0x1.0a6bf4c7a4f95p-657 }, + { 0x1.fafd4238f8063p-664, 0x1.720f4eaaf4bbbp-658 }, + { 0x1.5ff18a8317f0ap-664, 0x1.00f9a5fe04069p-658 }, + { 0x1.e8912b5139031p-665, 0x1.64d9f8b065b73p-659 }, + { 0x1.531288f8c01c7p-665, 0x1.ef7c38ee94e41p-660 }, + { 0x1.d695a98770e4bp-666, 0x1.57f251e86550ep-660 }, + { 0x1.46833ee262b1p-666, 0x1.dd73492689d2p-661 }, + { 0x1.c50b006d4e015p-667, 0x1.4b58b5eba6cc7p-661 }, + { 0x1.3a43cc572b3d3p-667, 0x1.cbd8e7539eac7p-662 }, + { 0x1.b3f14799b1616p-668, 0x1.3f0d6044b145dp-662 }, + { 0x1.2e5432e458097p-668, 0x1.baad518e7426ep-663 }, + { 0x1.a3486c40b74f1p-669, 0x1.33106d7f3cac9p-663 }, + { 0x1.22b456b1a8db7p-669, 0x1.a9f09adee91e3p-664 }, + { 0x1.931032d667261p-670, 0x1.2761dc408f1efp-664 }, + { 0x1.1763ffacc46acp-670, 0x1.99a2acce5bd7fp-665 }, + { 0x1.834838ba6fe3dp-671, 0x1.1c018e67b6eaep-665 }, + { 0x1.0c62daba74e7cp-671, 0x1.89c349043d67ep-666 }, + { 0x1.73eff5eb5eca5p-672, 0x1.10ef4a3481a29p-666 }, + { 0x1.01b07aeca1f42p-672, 0x1.7a520aeb63faep-667 }, + { 0x1.6506bebfc67bdp-673, 0x1.062abb7415c63p-667 }, + { 0x1.ee98b577ea7cap-674, 0x1.6b4e695e9099fp-668 }, + { 0x1.568bc5a3d72eep-674, 0x1.f766e96435041p-669 }, + { 0x1.da6bba883d22ap-675, 0x1.5cb7b85aa6067p-669 }, + { 0x1.487e1cd9f3e43p-675, 0x1.e311e0dabf963p-670 }, + { 0x1.c6d89f0368fc1p-676, 0x1.4e8d2ab5187d6p-670 }, + { 0x1.3adcb83cdccc3p-676, 0x1.cf55249e0172ap-671 }, + { 0x1.b3ddd3216f86ep-677, 0x1.40cdd3d52967cp-671 }, + { 0x1.2da66f0214306p-677, 0x1.bc2f50c60488ep-672 }, + { 0x1.a1799fd5925f4p-678, 0x1.3378a96e8e29ap-672 }, + { 0x1.20d9fd7b31257p-678, 0x1.a99ed8a2f2e6bp-673 }, + { 0x1.8faa294857a39p-679, 0x1.268c853c2e48dp-673 }, + { 0x1.147606d4e1ee3p-679, 0x1.97a2092e9b19dp-674 }, + { 0x1.7e6d714d6fce7p-680, 0x1.1a0826b9b2f1ep-674 }, + { 0x1.087916d26f37cp-680, 0x1.86370b7b69b46p-675 }, + { 0x1.6dc159d3dbce3p-681, 0x1.0dea34dab05c3p-675 }, + { 0x1.f9c3470942341p-682, 0x1.755be71f29feap-676 }, + { 0x1.5da3a74ec8bc7p-682, 0x1.02313fbe40a01p-676 }, + { 0x1.e35c1df5edf07p-683, 0x1.650e8497f58cdp-677 }, + { 0x1.4e120315adc06p-683, 0x1.edb784bbee452p-678 }, + { 0x1.cdb951dc67cbfp-684, 0x1.554cafa9d0c34p-678 }, + { 0x1.3f09fdba5037ep-684, 0x1.d7d0486e476ccp-679 }, + { 0x1.b8d760c6a3faap-685, 0x1.461419b3892c2p-679 }, + { 0x1.308911536a23dp-685, 0x1.c2a975dad9bep-680 }, + { 0x1.a4b2aa8c000cap-686, 0x1.37625bf981bdbp-680 }, + { 0x1.228ca3bac6e07p-686, 0x1.ae3f97cbb25cep-681 }, + { 0x1.914773f3bbbacp-687, 0x1.2934f9e530badp-681 }, + { 0x1.151208bdc254ep-687, 0x1.9a8f1bb2e0d78p-682 }, + { 0x1.7e91e9c37a26bp-688, 0x1.1b8963382a86p-682 }, + { 0x1.0816843f2edd8p-688, 0x1.879454bd5bf1ap-683 }, + { 0x1.6c8e23b87885fp-689, 0x1.0e5cf631ac83bp-683 }, + { 0x1.f72e98937c4f8p-690, 0x1.754b7ed21d736p-684 
}, + { 0x1.5b38276a48eap-690, 0x1.01ad01a5b2ddp-684 }, + { 0x1.df23162441e8bp-691, 0x1.63b0c17c2afp-685 }, + { 0x1.4a8beb16012edp-691, 0x1.eaed8e09770edp-686 }, + { 0x1.c804c1d0522ebp-692, 0x1.52c032be62aabp-686 }, + { 0x1.3a855850eeeeap-692, 0x1.d36ef8a6e08fap-687 }, + { 0x1.b1cdcc2ca0214p-693, 0x1.4275d9d00481dp-687 }, + { 0x1.2b204ea20186ep-693, 0x1.bcd89c2310d59p-688 }, + { 0x1.9c78595e362cep-694, 0x1.32cdb1c10f0eep-688 }, + { 0x1.1c58a6013aaeep-694, 0x1.a724c21e93002p-689 }, + { 0x1.87fe848fd6bffp-695, 0x1.23c3ac05a8c19p-689 }, + { 0x1.0e2a313c94bb5p-695, 0x1.924da8624908p-690 }, + { 0x1.745a6341bd9d3p-696, 0x1.1553b2e7eba16p-690 }, + { 0x1.0090c041eb55fp-696, 0x1.7e4d844204d5fp-691 }, + { 0x1.61860872f36c7p-697, 0x1.0779abdf88654p-691 }, + { 0x1.e710449b20327p-698, 0x1.6b1e85d9cfdc3p-692 }, + { 0x1.4f7b87a3ccd22p-698, 0x1.f462f39da55f5p-693 }, + { 0x1.ce184ffaa0275p-699, 0x1.58badb2559681p-693 }, + { 0x1.3e34f7b15484dp-699, 0x1.daedfe49c8a9fp-694 }, + { 0x1.b6314a8f93441p-700, 0x1.471cb2f12adecp-694 }, + { 0x1.2dac75898461p-700, 0x1.c28c3fc94131bp-695 }, + { 0x1.9f52e6b0168fbp-701, 0x1.363e3fa56683p-695 }, + { 0x1.1ddc26b854422p-701, 0x1.ab358720f461fp-696 }, + { 0x1.8974e49b18481p-702, 0x1.2619b9e9f9276p-696 }, + { 0x1.0ebe3bcdc6652p-702, 0x1.94e1adf5ef17ap-697 }, + { 0x1.748f15c14a99p-703, 0x1.16a96324493c1p-697 }, + { 0x1.004cf29d383afp-703, 0x1.7f889bf8109c7p-698 }, + { 0x1.60995fd7916b4p-704, 0x1.07e787ce8decbp-698 }, + { 0x1.e50530acb7a2bp-705, 0x1.6b224a16aa4ep-699 }, + { 0x1.4d8bbfb38c98p-705, 0x1.f39d03522ee6ep-700 }, + { 0x1.cab316f0b29dep-706, 0x1.57a6c57f8fed2p-700 }, + { 0x1.3b5e4bf3051bbp-706, 0x1.d8b1738bdcb74p-701 }, + { 0x1.b1987b3f62cd2p-707, 0x1.450e32693ba8dp-701 }, + { 0x1.2a09376f26716p-707, 0x1.bf0154de94403p-702 }, + { 0x1.99aa6a5f22416p-708, 0x1.3350cea8cd61ap-702 }, + { 0x1.1984d37c8d151p-708, 0x1.a681c1d2f0b94p-703 }, + { 0x1.82de1daeb9c47p-709, 0x1.2266f414ce57bp-703 }, + { 0x1.09c991f950457p-709, 0x1.8f27fe21c9591p-704 }, + { 0x1.6d28fdea9871ap-710, 0x1.12491ab5c17d9p-704 }, + { 0x1.f5a00e548f085p-711, 0x1.78e979aa0c9bep-705 }, + { 0x1.5880a5ae03598p-711, 0x1.02efdac5a4ff4p-705 }, + { 0x1.d921d6d1c821bp-712, 0x1.63bbd32217718p-706 }, + { 0x1.44dae3b23367bp-712, 0x1.e8a7dcff4677cp-707 }, + { 0x1.be0a394617721p-713, 0x1.4f94da865b2a3p-707 }, + { 0x1.322dbccd73cabp-713, 0x1.ccdc67829105bp-708 }, + { 0x1.a44b3f5ce9c8bp-714, 0x1.3c6a934743c05p-708 }, + { 0x1.206f6db46b93p-714, 0x1.b26f5afd4ebc9p-709 }, + { 0x1.8bd742e227a38p-715, 0x1.2a3336386b4d7p-709 }, + { 0x1.0f966c7fd2396p-715, 0x1.99530a15ce61ap-710 }, + { 0x1.74a0efc06d36ep-716, 0x1.18e533433f227p-710 }, + { 0x1.ff32d3f1c0a49p-717, 0x1.817a166d90dbdp-711 }, + { 0x1.5e9b45aff1bep-717, 0x1.087732df4f3abp-711 }, + { 0x1.e0dea55db81c4p-718, 0x1.6ad7728d6db01p-712 }, + { 0x1.49b9999981d6cp-718, 0x1.f1c02ea5235f3p-713 }, + { 0x1.c41e9fb058b1ep-719, 0x1.555e63841a093p-713 }, + { 0x1.35ef96b0fe655p-719, 0x1.d42dfb77e321ep-714 }, + { 0x1.a8e19002cb47fp-720, 0x1.4102823a6a0a2p-714 }, + { 0x1.23313f4adb099p-720, 0x1.b8267dd51660dp-715 }, + { 0x1.8f16bf19917acp-721, 0x1.2db7bc80b123ep-715 }, + { 0x1.1172ed701cd4p-721, 0x1.9d98e007ff597p-716 }, + { 0x1.76adf2095d808p-722, 0x1.1b7255d8af1cep-716 }, + { 0x1.00a953345bce4p-722, 0x1.8474c5f89cf1fp-717 }, + { 0x1.5f976a86ba7a3p-723, 0x1.0a26e7ff7c8ap-717 }, + { 0x1.e192f5a290a0dp-724, 0x1.6caa4dc34bcc6p-718 }, + { 0x1.49c3e6e576cf8p-724, 0x1.f394c675d5da1p-719 }, + { 0x1.c3918d16606afp-725, 0x1.562a0ffd36fefp-719 }, + { 0x1.3524a1ccb90cep-725, 0x1.d4a41cdb95576p-720 }, + 
{ 0x1.a739e0c3f00b3p-726, 0x1.40e51faa74ee4p-720 }, + { 0x1.21ab51a49a64p-726, 0x1.b7670ded07be7p-721 }, + { 0x1.8c781323e2b8bp-727, 0x1.2ccd09eaa341p-721 }, + { 0x1.0f4a27c210b83p-727, 0x1.9bc980b6cd88bp-722 }, + { 0x1.7338f3cfd4b18p-728, 0x1.19d3d560c7458p-722 }, + { 0x1.fbe79eabbab8bp-729, 0x1.81b807901b2ddp-723 }, + { 0x1.5b69fdd784131p-729, 0x1.07ec015b26bbfp-723 }, + { 0x1.db36d8463b3e1p-730, 0x1.691fdebe382bep-724 }, + { 0x1.44f955c9776f6p-730, 0x1.ee11097f70374p-725 }, + { 0x1.bc693203fe92cp-731, 0x1.51eeeac7320bep-725 }, + { 0x1.2fd5c7756dd24p-731, 0x1.ce39998362bf9p-726 }, + { 0x1.9f66cc65fb2cbp-732, 0x1.3c13b67a17ff2p-726 }, + { 0x1.1beec36eb8502p-732, 0x1.b03976c943068p-727 }, + { 0x1.8418af0dd65edp-733, 0x1.277d70b2ebc6fp-727 }, + { 0x1.09345c546e7cdp-733, 0x1.93f94ba2c6b6ap-728 }, + { 0x1.6a68c4bfd764bp-734, 0x1.141be9e049453p-728 }, + { 0x1.ef2e87ca7b717p-735, 0x1.7962a50231832p-729 }, + { 0x1.5241d71eb6e19p-735, 0x1.01df915097b64p-729 }, + { 0x1.ce118fc8beeeap-736, 0x1.605fee84767fp-730 }, + { 0x1.3b8f8a28fd848p-736, 0x1.e172e498cd2fcp-731 }, + { 0x1.aef59daa19c93p-737, 0x1.48dc6e3757e71p-731 }, + { 0x1.263e577f574dp-737, 0x1.c1366206ca036p-732 }, + { 0x1.91bfa9231de5cp-738, 0x1.32c440230ef3ap-732 }, + { 0x1.123b897af1af4p-738, 0x1.a2ee0ea25a216p-733 }, + { 0x1.7655cd85a2773p-739, 0x1.1e04519eb8f87p-733 }, + { 0x1.feea6c3554149p-740, 0x1.867f82bdccb8fp-734 }, + { 0x1.5c9f427a491a4p-740, 0x1.0a8a5c7678dffp-734 }, + { 0x1.dbb4739afff2ep-741, 0x1.6bd1744d1513ep-735 }, + { 0x1.4484548d479a3p-741, 0x1.f089c3d3d8b6fp-736 }, + { 0x1.bab46440d8e4bp-742, 0x1.52cbafb8bc99fp-736 }, + { 0x1.2dee5d96e696ep-742, 0x1.ce464b1286c0dp-737 }, + { 0x1.9bcaf0aad775cp-743, 0x1.3b571085ef9dbp-737 }, + { 0x1.18c7bd07b007fp-743, 0x1.ae2a4fedee59cp-738 }, + { 0x1.7eda37d26ae66p-744, 0x1.255d79dbe3905p-738 }, + { 0x1.04fbd01fd3b9ap-744, 0x1.9017432798e26p-739 }, + { 0x1.63c5ba199716fp-745, 0x1.10c9ceee61d28p-739 }, + { 0x1.e4edd431a7a4p-746, 0x1.73effa34f57abp-740 }, + { 0x1.4a724e2f6eadep-746, 0x1.fb0fd6a99ec28p-741 }, + { 0x1.c24c9890314cdp-747, 0x1.5998a4600495bp-741 }, + { 0x1.32c615eef6a3dp-747, 0x1.d70936a92f04ap-742 }, + { 0x1.a1f03c81340fdp-748, 0x1.40f6bfdad1f14p-742 }, + { 0x1.1ca87340e1c39p-748, 0x1.b55b284add8c1p-743 }, + { 0x1.83b6cbf2ba29fp-749, 0x1.29f10ece9036ep-743 }, + { 0x1.0801fd07f7284p-749, 0x1.95e2d86ae92c8p-744 }, + { 0x1.677ffffc31b92p-750, 0x1.146f8c6e8dc57p-744 }, + { 0x1.e978e83ebd95dp-751, 0x1.787f26e598ebbp-745 }, + { 0x1.4d2d2f5dd4096p-751, 0x1.005b6216a17eap-745 }, + { 0x1.c58570e2f641dp-752, 0x1.5d10973fbab06p-746 }, + { 0x1.34a13f272cdfap-752, 0x1.db3db8f832a58p-747 }, + { 0x1.a4017c5ace0dep-753, 0x1.4379416dfac63p-747 }, + { 0x1.1dc0938cfb932p-753, 0x1.b84ac1ef46255p-748 }, + { 0x1.84c7064147f81p-754, 0x1.2b9cc2c3d6738p-748 }, + { 0x1.087100f5e6429p-754, 0x1.97b6c5dc3637ap-749 }, + { 0x1.67b20873fc995p-755, 0x1.15602f1227af8p-749 }, + { 0x1.e9337a8979dap-756, 0x1.795cb2bb480b6p-750 }, + { 0x1.4ca0667456eb8p-756, 0x1.00aa01fc8a73ep-750 }, + { 0x1.c446a2ccade1cp-757, 0x1.5d196927cdaccp-751 }, + { 0x1.3371d92c55c69p-757, 0x1.dac421184af19p-752 }, + { 0x1.a1ef1650d3562p-758, 0x1.42cba823b93cbp-752 }, + { 0x1.1c07db1df4cf6p-758, 0x1.b6e2f60b615c1p-753 }, + { 0x1.8202debc2593cp-759, 0x1.2a53f94211ba9p-753 }, + { 0x1.064595037ce7bp-759, 0x1.95853e0fd75adp-754 }, + { 0x1.645a58ac6913cp-760, 0x1.13949d3b2fbd2p-754 }, + { 0x1.e41f95cc492cep-761, 0x1.768213ee2ba9cp-755 }, + { 0x1.48d0194e5b153p-761, 0x1.fce2f1e195a7ap-756 }, + { 0x1.be99935f38c42p-762, 0x1.59b2d772c1b04p-756 
}, + { 0x1.2f40d4a5d287p-762, 0x1.d5a005ce1b15dp-757 }, + { 0x1.9bc8aa74c3805p-763, 0x1.3ef3138f8ae58p-757 }, + { 0x1.178b448b82b16p-763, 0x1.b12e626e3c8a1p-758 }, + { 0x1.7b7f2dc7fa066p-764, 0x1.2620652c3102cp-758 }, + { 0x1.0190106456396p-764, 0x1.8f5ecffd9c995p-759 }, + { 0x1.5d92194746ef2p-765, 0x1.0f1a62a97a48ep-759 }, + { 0x1.da636b2add63ap-766, 0x1.7004d0a0dd3fcp-760 }, + { 0x1.41d8f14e2d235p-766, 0x1.f38508375a815p-761 }, + { 0x1.b4a8e16df3a2ep-767, 0x1.52f67f4a45dbdp-761 }, + { 0x1.282da2ee06e9fp-767, 0x1.cbf8187da97p-762 }, + { 0x1.91bc4f0e82a1p-768, 0x1.380c6fa6ddd1bp-762 }, + { 0x1.106c65473611bp-768, 0x1.a757e44dde4fbp-763 }, + { 0x1.716ca73d3a1dcp-769, 0x1.1f218f165083cp-763 }, + { 0x1.f4e737e667fe6p-770, 0x1.8571975a9ba0cp-764 }, + { 0x1.538bdbc88035p-770, 0x1.081306aee058bp-764 }, + { 0x1.cc4774fe05a13p-771, 0x1.661571375ee31p-765 }, + { 0x1.37eeb586702afp-771, 0x1.e5803c9b677cp-766 }, + { 0x1.a6be51e94d2c3p-772, 0x1.49169d29f057fp-766 }, + { 0x1.1e6cae3cc5ce4p-772, 0x1.be144165bfdadp-767 }, + { 0x1.841452e30c6ecp-773, 0x1.2e4b0b7596d86p-767 }, + { 0x1.06dfcc0330324p-773, 0x1.99a8814f82396p-768 }, + { 0x1.64157d8dbcaa1p-774, 0x1.158b4c1d7aa61p-768 }, + { 0x1.e248fc3725278p-775, 0x1.7806fe5adc0dep-769 }, + { 0x1.4691284199248p-775, 0x1.fd64d63539ac4p-770 }, + { 0x1.ba32f675bcca1p-776, 0x1.58fd2560c98e3p-770 }, + { 0x1.2b59cb5fcd07p-776, 0x1.d33b9c01b8858p-771 }, + { 0x1.953f4278d9771p-777, 0x1.3c5b9e7be019ep-771 }, + { 0x1.1244d4a198783p-777, 0x1.ac5a261b57bd2p-772 }, + { 0x1.7333ac721d353p-778, 0x1.21f61f6e6a3a5p-772 }, + { 0x1.f654f8b2c9938p-779, 0x1.8883e334bf813p-773 }, + { 0x1.53d9d5f4e3889p-779, 0x1.09a33ffab8174p-773 }, + { 0x1.cbcb3935e8707p-780, 0x1.678037d69a88ap-774 }, + { 0x1.36fefd85e37f7p-780, 0x1.e678a0474dd4dp-775 }, + { 0x1.a4a7147e53789p-781, 0x1.491a44a8cc267p-775 }, + { 0x1.1c73c8c2f3143p-781, 0x1.bd3a60953bab8p-776 }, + { 0x1.80a7df6e9e4abp-782, 0x1.2d20af56e98e4p-776 }, + { 0x1.040c111171b21p-782, 0x1.9748563f2a02cp-777 }, + { 0x1.5f9153468350dp-783, 0x1.13656dff66048p-777 }, + { 0x1.db3d65827b6f1p-784, 0x1.7463a2ae57157p-778 }, + { 0x1.412b4a3b0b6bbp-784, 0x1.f77b2a384d071p-779 }, + { 0x1.b20abd232bd72p-785, 0x1.5451ae34b02aep-779 }, + { 0x1.25417f5fe18aap-785, 0x1.cc024fa52d21ep-780 }, + { 0x1.8c38db09c3d68p-786, 0x1.36dbe645ba702p-780 }, + { 0x1.0ba351c6b2c44p-786, 0x1.a415d531b6e85p-781 }, + { 0x1.69856de02317p-787, 0x1.1bcf7eeeba2f5p-781 }, + { 0x1.e847157246bfcp-788, 0x1.7f70703ac5558p-782 }, + { 0x1.49b2d16422141p-788, 0x1.02fd377359b1p-782 }, + { 0x1.bd304de355d85p-789, 0x1.5dd1b0bb84b26p-783 }, + { 0x1.2c87c2ff697dcp-789, 0x1.d87243e77ecadp-784 }, + { 0x1.95b4456f24a66p-790, 0x1.3efdb3b369292p-784 }, + { 0x1.11cf1a60f1d84p-790, 0x1.aeb4dc01a4631p-785 }, + { 0x1.718a9184a8678p-791, 0x1.22bcd99dbdb06p-785 }, + { 0x1.f2af0be1fde49p-792, 0x1.88766c06b0833p-786 }, + { 0x1.507007917e3d9p-792, 0x1.08db80d427d79p-786 }, + { 0x1.c5e695f15072bp-793, 0x1.65709eb54bf5ep-787 }, + { 0x1.32266540e08c2p-793, 0x1.e253876b38acep-788 }, + { 0x1.9cf012acb820bp-794, 0x1.45623a2f6a451p-788 }, + { 0x1.1673fda512b46p-794, 0x1.b6f674d703273p-789 }, + { 0x1.777d05328bd26p-795, 0x1.280eca736b4b1p-789 }, + { 0x1.fa46d62b8e57dp-796, 0x1.8f4d804e3ad6fp-790 }, + { 0x1.5544c8bc23e1cp-796, 0x1.0d3e50a2eecdcp-790 }, + { 0x1.cc068b1dc8ab2p-797, 0x1.6b0c7763ce52bp-791 }, + { 0x1.36042b906571p-797, 0x1.e979edc5b3767p-792 }, + { 0x1.a1cbbab815b4cp-798, 0x1.49ecd657d5dd6p-792 }, + { 0x1.197d0fe71564cp-798, 0x1.bcb59141dc715p-793 }, + { 0x1.7b41f3bcb1869p-799, 0x1.2bad65a82bb23p-793 
}, + { 0x1.feec24eca8006p-800, 0x1.93d6de18ac6bfp-794 }, + { 0x1.581b387627669p-800, 0x1.1011dd6dfecf6p-794 }, + { 0x1.cf746ccaba032p-801, 0x1.6e8be31f2fe24p-795 }, + { 0x1.380f8b864e1acp-801, 0x1.edc51c8649aaap-796 }, + { 0x1.a4312cc2f816ap-802, 0x1.4c88f43732a1p-796 }, + { 0x1.1adc83c96accfp-802, 0x1.bfd81ed74f1cdp-797 }, + { 0x1.7cc835281bbf3p-803, 0x1.2d883a292df3bp-797 }, + { 0x1.0044e6f2b903fp-803, 0x1.95fde403b5724p-798 }, + { 0x1.58e66674c0f82p-804, 0x1.11494966870b7p-798 }, + { 0x1.d0209514d613dp-805, 0x1.6fdef1ca550b3p-799 }, + { 0x1.383f2f4495aedp-805, 0x1.ef217eb67d36dp-800 }, + { 0x1.a41575f0363d6p-806, 0x1.4d2aaa5b8e28ap-800 }, + { 0x1.1a8c12a0cae91p-806, 0x1.c04fcbf1fddd8p-801 }, + { 0x1.7c08d08f2ccbbp-807, 0x1.2d96cdd2a30b8p-801 }, + { 0x1.ff186c5b90604p-808, 0x1.95b8ba50a2687p-802 }, + { 0x1.57a2b0b1c4c86p-808, 0x1.10df03cd711e3p-802 }, + { 0x1.ce07ef98af2aep-809, 0x1.6eff939f51c8fp-803 }, + { 0x1.36923c5eb270bp-809, 0x1.ed88d96607fb4p-804 }, + { 0x1.a1791489717bfp-810, 0x1.4bcf1445c1d61p-804 }, + { 0x1.188d2c2d680a3p-810, 0x1.be1a747b458c8p-805 }, + { 0x1.7907312c7e255p-811, 0x1.2bd8dde16ba8ap-805 }, + { 0x1.fa9e995f4c414p-812, 0x1.93089dc23e417p-806 }, + { 0x1.5455df149c7b5p-812, 0x1.0ed4f34d6e965p-806 }, + { 0x1.c93410e8142f8p-813, 0x1.6bf1c754a3325p-807 }, + { 0x1.33105a5b594f7p-813, 0x1.e9027b1c5a4abp-808 }, + { 0x1.9c67f441e11b3p-814, 0x1.487c687197597p-808 }, + { 0x1.14e8ebae7496ep-814, 0x1.b942323a72767p-809 }, + { 0x1.73d10c597b774p-815, 0x1.285660efb3e9ap-809 }, + { 0x1.f330b99c7f9e7p-816, 0x1.8df9d62fb9c5ep-810 }, + { 0x1.4f0ef77c81a6fp-816, 0x1.0b34677fe9486p-810 }, + { 0x1.c1baedb5f2e65p-817, 0x1.66c37bb05de1ep-811 }, + { 0x1.2dc9788ad9864p-817, 0x1.e1a30436bcde5p-812 }, + { 0x1.94f913add4907p-818, 0x1.4341c90c553e7p-812 }, + { 0x1.0fafd2c40ba27p-818, 0x1.b1dd0ffc5d04bp-813 }, + { 0x1.6c7df995241d1p-819, 0x1.231f4a6757469p-813 }, + { 0x1.e8f062cc963cep-820, 0x1.86a35930ed5e1p-814 }, + { 0x1.47e5cbff0d92ep-820, 0x1.060dd236f49a3p-814 }, + { 0x1.b7be34be4e18dp-821, 0x1.5f8c25cd122d7p-815 }, + { 0x1.26d5559b935e7p-821, 0x1.d78bca82e9f37p-816 }, + { 0x1.8b4dd6af9c05dp-822, 0x1.3c36d15093021p-816 }, + { 0x1.08f94cfc79158p-822, 0x1.a80c62c44a65bp-817 }, + { 0x1.632ec0e0d009cp-823, 0x1.1c4b11ed6627ap-817 }, + { 0x1.dc0b5f2e40ea4p-824, 0x1.7d261cc2edf72p-818 }, + { 0x1.3efa480ea698bp-824, 0x1.fef096f5252fp-819 }, + { 0x1.ab6a5245de9e5p-825, 0x1.566c107178d1fp-819 }, + { 0x1.1e52cde409267p-825, 0x1.cae9de8f00c0bp-820 }, + { 0x1.7f910d0084829p-826, 0x1.337ae444bd293p-820 }, + { 0x1.00e3012bd4171p-826, 0x1.9bfbcfe9dc1e8p-821 }, + { 0x1.580c66bfc7cf5p-827, 0x1.13f803c0631d9p-821 }, + { 0x1.ccba595fe34b5p-828, 0x1.71ac2109d33c9p-822 }, + { 0x1.347383dcf4a9bp-828, 0x1.ef21caa7d80c3p-823 }, + { 0x1.9cf52785fcd1fp-829, 0x1.4b8b6bbdb7a4fp-823 }, + { 0x1.1466f7a4ba4b3p-829, 0x1.bbf4bcf8ca0c3p-824 }, + { 0x1.71f5b701cb667p-830, 0x1.2934441fdae8bp-824 }, + { 0x1.ef1fef5338f87p-831, 0x1.8de00a5d4cff3p-825 }, + { 0x1.4b46ffc2e70ccp-831, 0x1.0a4a61359d63ap-825 }, + { 0x1.bb3f3e667d5e5p-832, 0x1.64673b39bdd54p-826 }, + { 0x1.287ea78b8278fp-832, 0x1.dcf3acd0cc1f4p-827 }, + { 0x1.8c9c8347a2863p-833, 0x1.3f1926f0c2aa4p-827 }, + { 0x1.093c166d47d9p-833, 0x1.aaecb94ca24e1p-828 }, + { 0x1.62b5957e6b822p-834, 0x1.1d8efbbc88d6cp-828 }, + { 0x1.da4f3c5b8c56fp-835, 0x1.7df554174928cp-829 }, + { 0x1.3d1457a1afdaep-835, 0x1.fed6b4a9440a8p-830 }, + { 0x1.a7e3665ffae25p-836, 0x1.558fae0fed7aap-830 }, + { 0x1.1b4da97b89113p-836, 0x1.c8b307e047613p-831 }, + { 0x1.7aa46b2ec675cp-837, 
0x1.3149a005e5984p-831 }, + { 0x1.fa00e080e536p-838, 0x1.9819329634547p-832 }, + { 0x1.520f92dcad4a2p-838, 0x1.10bba52994e8ep-832 }, + { 0x1.c3a9666328faap-839, 0x1.6c7dd2d93c0f9p-833 }, + { 0x1.2dae795ce73b6p-839, 0x1.e70fd5d6d806dp-834 }, + { 0x1.92f5963d343cfp-840, 0x1.45629dffe1fa7p-834 }, + { 0x1.0d15f439254bep-840, 0x1.b2b2e959996bp-835 }, + { 0x1.675546ac2c967p-841, 0x1.2255364dfcfd7p-835 }, + { 0x1.dfca1ff236f02p-842, 0x1.83c6a3841fccap-836 }, + { 0x1.4046155930cfbp-842, 0x1.02ee197efc99dp-836 }, + { 0x1.ab8846c89a496p-843, 0x1.59bfc8bdbfffep-837 }, + { 0x1.1d5226b496f7ep-843, 0x1.cd9f4c973304p-838 }, + { 0x1.7cc7edd2bedd1p-844, 0x1.3420703d360eap-838 }, + { 0x1.fc1e021531b11p-845, 0x1.9b4a6e4580455p-839 }, + { 0x1.52f9fd29afa7bp-845, 0x1.1276cde31355ep-839 }, + { 0x1.c439018f9e7bp-846, 0x1.6e44a0da72dedp-840 }, + { 0x1.2d9d4a3bfacfap-846, 0x1.e8b82d35e9882p-841 }, + { 0x1.9247c7d6b7109p-847, 0x1.4603c1a2de688p-841 }, + { 0x1.0c3d4d5746632p-847, 0x1.b2e6fa531d555p-842 }, + { 0x1.65add59367765p-848, 0x1.220b241172407p-842 }, + { 0x1.dce1e8301e6efp-849, 0x1.82d28ae825549p-843 }, + { 0x1.3dde18cb97a8dp-849, 0x1.01ea51e3f541cp-843 }, + { 0x1.a7b31ccb0b2f4p-850, 0x1.57e3d8e31e749p-844 }, + { 0x1.1a59798dd7aa2p-850, 0x1.ca77ce984ce61p-845 }, + { 0x1.7843a7981f8e3p-851, 0x1.3192c63185ef2p-845 }, + { 0x1.f55b0f3ffe463p-852, 0x1.974911a73b1a7p-846 }, + { 0x1.4df9fe655b0fbp-852, 0x1.0f64b579273f6p-846 }, + { 0x1.bce68ce6bcfedp-853, 0x1.69a3e1bad13dap-847 }, + { 0x1.284bfe1cdea24p-853, 0x1.e1d6859c11527p-848 }, + { 0x1.8a9c29acbf47dp-854, 0x1.40f425a16dca3p-848 }, + { 0x1.06bd70b72892bp-854, 0x1.ab8633790b1e2p-849 }, + { 0x1.5dd55c1a48477p-855, 0x1.1cb4a43b9229fp-849 }, + { 0x1.d1bd6b173b9f2p-856, 0x1.7b25cc6523c3bp-850 }, + { 0x1.35fc8451ff49ep-856, 0x1.f8db2dc70232bp-851 }, + { 0x1.9c9712232f548p-857, 0x1.5014bc06e7f91p-851 }, + { 0x1.128b47439dcd5p-857, 0x1.bf66ba3b9066cp-852 }, + { 0x1.6d53d2be0a0b6p-858, 0x1.29c2c1dc958dbp-852 }, + { 0x1.e6122171333dfp-859, 0x1.8c4a9d76af90fp-853 }, + { 0x1.435229d0cc681p-859, 0x1.07ae5a7347d0bp-853 }, + { 0x1.ae1371b74ea2dp-860, 0x1.5ed9539dfd0c9p-854 }, + { 0x1.1e01427183001p-860, 0x1.d2c69c7599edcp-855 }, + { 0x1.7c589442700ecp-861, 0x1.3677341a98a13p-855 }, + { 0x1.f9be9e1d7b4e4p-862, 0x1.9cf2c5625685ep-856 }, + { 0x1.5033c96eb757p-862, 0x1.1298aebe8af0fp-856 }, + { 0x1.bef014f36ffa9p-863, 0x1.6d2655c8560ebp-857 }, + { 0x1.290979be09b3bp-863, 0x1.e58166789d0bcp-858 }, + { 0x1.8ac6ba86dcc3cp-864, 0x1.42b9e90b536b6p-858 }, + { 0x1.064e638fb2517p-864, 0x1.acfe7e64002b1p-859 }, + { 0x1.5c884857d8adep-865, 0x1.1d179e12ade6ep-859 }, + { 0x1.cf0beaeb1b319p-866, 0x1.7ae01eb0f55cbp-860 }, + { 0x1.338e29511ffcdp-866, 0x1.f772a9e0423a1p-861 }, + { 0x1.9881a23b2ff9bp-867, 0x1.4e72e15f0f016p-861 }, + { 0x1.0f43798c4f845p-867, 0x1.bc4e2f5a8c9afp-862 }, + { 0x1.6836e63bd7d88p-868, 0x1.27165d875ec78p-862 }, + { 0x1.de466f9c32fdap-869, 0x1.87eb54ae1860dp-863 }, + { 0x1.3d79f883687bfp-869, 0x1.043b38d103ec9p-863 }, + { 0x1.a56d48500b8a3p-870, 0x1.598a7d65e3b67p-864 }, + { 0x1.17ac327f9b5e5p-870, 0x1.cac2d1ee89db1p-865 }, + { 0x1.73278f241bb95p-871, 0x1.308090afcd9f3p-865 }, + { 0x1.ec801820c3f3dp-872, 0x1.942d41e7bf2a3p-866 }, + { 0x1.46b841565ab3ep-872, 0x1.0c34dc595f4bfp-866 }, + { 0x1.b16ea850bfa34p-873, 0x1.63e9cb83e74b2p-867 }, + { 0x1.1f76e44abf0ecp-873, 0x1.d83e5a3ffd7adp-868 }, + { 0x1.7d432d7dd0ca1p-874, 0x1.39428e0fd00c5p-868 }, + { 0x1.f99abec00b682p-875, 0x1.9f8c2eadfb109p-869 }, + { 0x1.4f35579392d4bp-875, 0x1.13957092e7741p-869 }, + { 
0x1.bc6c19eee10e8p-876, 0x1.6d7ad6ac744f9p-870 }, + { 0x1.2692d6adc530fp-876, 0x1.e4a41e3c393c2p-871 }, + { 0x1.8673fad41c337p-877, 0x1.4149a31665d1ep-871 }, + { 0x1.02bd066e6e446p-877, 0x1.a9efbad7c9909p-872 }, + { 0x1.56dece3f159c3p-878, 0x1.1a4d14ca40e6p-872 }, + { 0x1.c64dabfd6babdp-879, 0x1.7628f37011dc7p-873 }, + { 0x1.2cf07ed3ac7cap-879, 0x1.efd93aae49244p-874 }, + { 0x1.8ea5cdb1b77f8p-880, 0x1.4884565714d83p-874 }, + { 0x1.0801f05da3babp-880, 0x1.b341347ab9d2ep-875 }, + { 0x1.5da3ba0723cbcp-881, 0x1.204d0f497ca7dp-875 }, + { 0x1.cefd7b19fc691p-882, 0x1.7de10a24a9be3p-876 }, + { 0x1.3281b7ca3d771p-882, 0x1.f9c4f419d97b9p-877 }, + { 0x1.95c663259c5d8p-883, 0x1.4ee2a6bb63f1dp-877 }, + { 0x1.0c90568fe453bp-883, 0x1.bb6bea4d790c6p-878 }, + { 0x1.6374ef6370a23p-884, 0x1.258802fee3a1bp-878 }, + { 0x1.d668024e6e773p-885, 0x1.8491dcb50d65p-879 }, + { 0x1.3739f6c74a992p-885, 0x1.012888bcf5e1bp-879 }, + { 0x1.9bc5a2748239p-886, 0x1.5456466d99824p-880 }, + { 0x1.105de86fb726ep-886, 0x1.c25d7813e5a28p-881 }, + { 0x1.68453b252f9afp-887, 0x1.29f220ff323bdp-881 }, + { 0x1.dc7c640bf856fp-888, 0x1.8a2c46b36447dp-882 }, + { 0x1.3b0e7a2d8004dp-888, 0x1.04b5178932d9ep-882 }, + { 0x1.a095d99893beap-889, 0x1.58d2d04dcdef9p-883 }, + { 0x1.1361f24d04a1ep-889, 0x1.c8060b8a624d8p-884 }, + { 0x1.6c0994513d45bp-890, 0x1.2d8154e3020f5p-884 }, + { 0x1.e12caa0268707p-891, 0x1.8ea37661d565fp-885 }, + { 0x1.3df6725a60cf5p-891, 0x1.078003d294269p-885 }, + { 0x1.a42bf15180a09p-892, 0x1.5c4df6da1a5fp-886 }, + { 0x1.15957e82800c6p-892, 0x1.cc58a0676d26ep-887 }, + { 0x1.6eb9463d29a0dp-893, 0x1.302d6b1661efp-887 }, + { 0x1.e46dfa81a2018p-894, 0x1.91ed1d851d1ddp-888 }, + { 0x1.3feb236502138p-894, 0x1.0982d94421652p-888 }, + { 0x1.a67f97b02e026p-895, 0x1.5ebfab91b4a2bp-889 }, + { 0x1.16f37032d6085p-895, 0x1.cf4b3235443f5p-890 }, + { 0x1.704e120e656fdp-896, 0x1.31f0304f01ddbp-890 }, + { 0x1.e638c247f445dp-897, 0x1.940198fd0e1c2p-891 }, + { 0x1.40e7ff18c854cp-897, 0x1.0ab8eaa8fae67p-891 }, + { 0x1.a78b6039c7039p-898, 0x1.60223e0067b2cp-892 }, + { 0x1.1778970df4481p-898, 0x1.d0d6e2f89dd66p-893 }, + { 0x1.70c446e7535ccp-899, 0x1.32c589802b4bap-893 }, + { 0x1.e688d1dc06742p-900, 0x1.94dc0e4e3bd62p-894 }, + { 0x1.40eab69ffb357p-900, 0x1.0b1f64079cf15p-894 }, + { 0x1.a74cd8f49285bp-901, 0x1.607271cb1c23p-895 }, + { 0x1.1723bbb37e71p-901, 0x1.d0f815d3e30e4p-896 }, + { 0x1.701ad03f5aba2p-902, 0x1.32ab83cb1b9aap-896 }, + { 0x1.e55d6dd34aeb5p-903, 0x1.947a7e7d08e62p-897 }, + { 0x1.3ff3437e5e592p-903, 0x1.0ab555a059592p-897 }, + { 0x1.a5c493ec4b75bp-904, 0x1.5faf8b45ee11cp-898 }, + { 0x1.15f5a46f2a8c5p-904, 0x1.cfae7d166a387p-899 }, + { 0x1.6e533a1804da5p-905, 0x1.31a25c153692fp-899 }, + { 0x1.e2b951ac76b4bp-906, 0x1.92ddcdd3a585ap-900 }, + { 0x1.3e03e7aaf4a23p-906, 0x1.097bb793410b5p-900 }, + { 0x1.a2f624fa2da41p-907, 0x1.5ddb524f58124p-901 }, + { 0x1.13f112353b2e2p-907, 0x1.ccfd1b6b2b0d1p-902 }, + { 0x1.6b71aaf8395acp-908, 0x1.2fac7e1ac1a55p-902 }, + { 0x1.dea2a52e6f8d6p-909, 0x1.9009c068a7447p-903 }, + { 0x1.3b2124c85eb7dp-909, 0x1.077566199da13p-903 }, + { 0x1.9ee813dcc82f4p-910, 0x1.5afa0b60e30adp-904 }, + { 0x1.111ab5ef7d9cep-910, 0x1.c8ea38207b48cp-905 }, + { 0x1.677cd3ce598a2p-911, 0x1.2cce7b0334e93p-905 }, + { 0x1.d922e485849dfp-912, 0x1.8c04eb792831bp-906 }, + { 0x1.3751aaab95803p-912, 0x1.04a716678c7d9p-906 }, + { 0x1.99a3c2eb312dfp-913, 0x1.571266fb205e7p-907 }, + { 0x1.0d791e54efc95p-913, 0x1.c37f46c8a36cep-908 }, + { 0x1.627dd610c1f2fp-914, 0x1.290ef7aa6784ep-908 }, + { 0x1.d246bba093dddp-915, 0x1.86d89be61c44fp-909 }, + 
{ 0x1.329e3d8fc35e5p-915, 0x1.011744722e8f8p-909 }, + { 0x1.93354aecb0f91p-916, 0x1.522d67c700dd9p-910 }, + { 0x1.09149eae599f4p-916, 0x1.bcc8c2b79e5e6p-911 }, + { 0x1.5c8020a89d6a7p-917, 0x1.247692feaf7c7p-911 }, + { 0x1.ca1dd59404578p-918, 0x1.8090b25f1fb1cp-912 }, + { 0x1.2d1194826d1d9p-918, 0x1.f99c33fa36826p-913 }, + { 0x1.8bab4cd7bc185p-919, 0x1.4c563ff8738edp-913 }, + { 0x1.03f72f0fa181cp-919, 0x1.b4d5ff233ee8bp-914 }, + { 0x1.559144638d7d2p-920, 0x1.1f0fc4fe41aefp-914 }, + { 0x1.c0baa10766979p-921, 0x1.793b75fbd2367p-915 }, + { 0x1.26b830bbc4f33p-921, 0x1.efaa9eeaa4992p-916 }, + { 0x1.8316ba6f8ef74p-922, 0x1.459a26ac43fcfp-916 }, + { 0x1.fc588d5eeb3p-923, 0x1.abb8ece685efep-917 }, + { 0x1.4dc0c0d42f863p-923, 0x1.18e6b704952c1p-917 }, + { 0x1.b6320aea7077ap-924, 0x1.70e95e366ca95p-918 }, + { 0x1.1fa02ebad6485p-924, 0x1.e4700e7fab75ep-919 }, + { 0x1.798a96e59845bp-925, 0x1.3e0826243926dp-919 }, + { 0x1.ef81624855ca5p-926, 0x1.a185d71d9ae78p-920 }, + { 0x1.451fcaaed5e7p-926, 0x1.1209163a43d8ap-920 }, + { 0x1.aa9b30dd7b333p-927, 0x1.67acd56555624p-921 }, + { 0x1.17d9121b4ff43p-927, 0x1.d805487b20ec2p-922 }, + { 0x1.6f1bb0c9eff18p-928, 0x1.35b0e3e76f72ap-922 }, + { 0x1.e184bec96bcc5p-929, 0x1.965317fc3f8ebp-923 }, + { 0x1.3bc10ccdff1d7p-929, 0x1.0a85e11600392p-923 }, + { 0x1.9e0f0cdf83a76p-930, 0x1.5d99f4f4fa7a2p-924 }, + { 0x1.0f738d3253e75p-930, 0x1.ca8538b911cc2p-925 }, + { 0x1.63e056b37b486p-931, 0x1.2ca663e8f6c6ep-925 }, + { 0x1.d2806afda0512p-932, 0x1.8a38c763ae5p-926 }, + { 0x1.31b865207923bp-932, 0x1.026d30f31261ep-926 }, + { 0x1.90a81bef15367p-933, 0x1.52c63cbe5201dp-927 }, + { 0x1.068145905baddp-933, 0x1.bc0c903e2dd51p-928 }, + { 0x1.57f0081c7461bp-934, 0x1.22fbc7eb40c8ep-928 }, + { 0x1.c293abfeb81c1p-935, 0x1.7d5064d5d2e6ap-929 }, + { 0x1.271a9ed146425p-935, 0x1.f3a001a1da12ap-930 }, + { 0x1.8282015bfd093p-936, 0x1.474846e880b8p-930 }, + { 0x1.fa292d1f4b615p-937, 0x1.acb96019278e3p-931 }, + { 0x1.4b6323fa7fafcp-937, 0x1.18c50c637e437p-931 }, + { 0x1.b1ded81f6cf48p-938, 0x1.6fb47e7243b1p-932 }, + { 0x1.1bfd2aff12d23p-938, 0x1.e17fe4af1cdcdp-933 }, + { 0x1.73b9288cf980bp-939, 0x1.3b3779cd081bcp-933 }, + { 0x1.e680a6315c8f9p-940, 0x1.9caab20737c4bp-934 }, + { 0x1.3e52969a46a03p-940, 0x1.0e16c42489121p-934 }, + { 0x1.a082ea93d471fp-941, 0x1.618056ad2fa0dp-935 }, + { 0x1.1075d9566cab2p-941, 0x1.ce9e247afa7efp-936 }, + { 0x1.646a66f6fb197p-942, 0x1.2eabb9557e4c3p-936 }, + { 0x1.d22f0f82317a8p-943, 0x1.8c0020c90fd02p-937 }, + { 0x1.30d7883df3e07p-943, 0x1.0305d4157bdecp-937 }, + { 0x1.8ea1187daf8b3p-944, 0x1.52cf8a69cbdeep-938 }, + { 0x1.049a91d747c02p-944, 0x1.bb1f3a4ce848cp-939 }, + { 0x1.54b29ff375e83p-945, 0x1.21bd19407d3a8p-939 }, + { 0x1.bd5a7cbaf896dp-946, 0x1.7ad97206eb3e9p-940 }, + { 0x1.230b0dec754dap-946, 0x1.ef4e6059f1fe4p-941 }, + { 0x1.7c5a693980a4p-947, 0x1.43bdb9112e65bp-941 }, + { 0x1.f10221f87a1cap-948, 0x1.a7278c0b2c815p-942 }, + { 0x1.44ae6c097e3b8p-948, 0x1.148391a9b5b7p-942 }, + { 0x1.a8288818abb4p-949, 0x1.69563388e87eep-943 }, }, - -/* Coefficients for each order 12 polynomial on each of the 20 intervals. 
*/ -.poly = { - {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1, - -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2, - 0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5, - -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9, - 0x1.c9bfafa73899cp-11}, - {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1, - -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3, - 0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6, - -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10, - 0x1.526a8a14e9bfcp-12}, - {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2, - -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4, - 0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7, - -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12, - 0x1.b451af7dd52fep-14}, - {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2, - -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5, - 0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9, - -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13, - 0x1.e654e67532b44p-16}, - {0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3, - -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6, - 0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10, - -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15, - 0x1.d213a128a75c9p-18}, - {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4, - -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8, - 0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12, - -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17, - 0x1.7f90154bde15dp-20}, - {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5, - -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9, - 0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14, - -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20, - 0x1.1020f4741f79ep-22}, - {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6, - -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11, - 0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16, - -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22, - 0x1.501716d098f14p-25}, - {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6, - -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12, - 0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18, - -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25, - 0x1.6eb74e2e99662p-28}, - {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7, - -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14, - 0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20, - -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28, - 0x1.68510d1c32842p-31}, - {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8, - -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, -0x1.313af0bb12375p-15, - 0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23, - -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30, - 0x1.45aabbe505f6ap-34}, - {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9, - -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, 
-0x1.91a8207766e1ep-17, - 0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25, - -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33, - 0x1.14989aac741c2p-37}, - {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10, - -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18, - 0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27, - -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, -0x1.ee307d46e2866p-37, - 0x1.c21ba1b404f5ap-41}, - {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11, - -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20, - 0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30, - -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40, - 0x1.6487c50052867p-44}, - {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11, - -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22, - 0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32, - -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43, - 0x1.165732f1ae138p-47}, - {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12, - -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23, - 0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34, - -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46, - 0x1.b0241c6d5b761p-51}, - {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13, - -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25, - 0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37, - -0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49, - 0x1.4f8abb4398a0dp-54}, - {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14, - -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26, - 0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39, - -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52, - 0x1.058cd4ea9bf04p-57}, - {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15, - -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28, - 0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41, - -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55, - 0x1.9a2af47d77e44p-61}, - {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 0x1.0eec0b6357148p-15, - -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30, - 0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44, - -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58, - 0x1.43d3358c64dafp-64} -} }; diff --git a/contrib/arm-optimized-routines/pl/math/erfcf.h b/contrib/arm-optimized-routines/pl/math/erfcf.h deleted file mode 100644 index 8f1e5f4226e3..000000000000 --- a/contrib/arm-optimized-routines/pl/math/erfcf.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Shared functions for scalar and vector single-precision erfc(x) functions. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_ERFCF_H -#define PL_MATH_ERFCF_H - -#include "math_config.h" - -#define FMA fma -#include "estrin_wrap.h" - -/* Accurate exponential from optimized-routines. 
*/
-double
-__exp_dd (double x, double xtail);
-
-static inline double
-eval_poly (double z, const double *coeff)
-{
-  double z2 = z * z;
-  double z4 = z2 * z2;
-  double z8 = z4 * z4;
-#define C(i) coeff[i]
-  return ESTRIN_15 (z, z2, z4, z8, C);
-#undef C
-}
-
-static inline double
-eval_exp_mx2 (double x)
-{
-  return __exp_dd (-(x * x), 0.0);
-}
-
-#undef FMA
-#endif // PL_MATH_ERFCF_H
diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_1u7.c b/contrib/arm-optimized-routines/pl/math/erfcf_1u7.c
new file mode 100644
index 000000000000..c8ce95cca058
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/erfcf_1u7.c
@@ -0,0 +1,103 @@
+/*
+ * Single-precision erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Shift 0x1p17f
+#define OneThird 0x1.555556p-2f
+#define TwoThird 0x1.555556p-1f
+
+#define TwoOverFifteen 0x1.111112p-3f
+#define TwoOverFive 0x1.99999ap-2f
+#define Tenth 0x1.99999ap-4f
+
+#define SignMask 0x7fffffff
+
+/* Fast erfcf approximation based on series expansion near x rounded to
+   nearest multiple of 1/64.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+   erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+   poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+                + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+   (d * poly(r, d) is the Taylor expansion of (erfc(r) - erfc(x)) / scale
+   around d = 0.)
+
+   Values of erfc(r) and scale are read from lookup tables. Stored values
+   are scaled to avoid hitting the subnormal range.
+
+   Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+   Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+   erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+                       want 0x1.f51216p-120. */
+float
+erfcf (float x)
+{
+  /* Get top words and sign. */
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & SignMask;
+  uint32_t sign = ix & ~SignMask;
+
+  /* |x| < 0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */
+  if (unlikely (ia < 0x32800000))
+    return 1.0f - x; /* Small case. */
+
+  /* For |x| < 10.0625, the following approximation holds. */
+  if (likely (ia < 0x41210000))
+    {
+      /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
+         to 2/sqrt(pi), when x reduced to r = 0. */
+      float a = asfloat (ia);
+      /* Adding Shift = 0x1p17f rounds a to the nearest 1/64, since
+         ulp (Shift) = 2^-6 in single precision; the difference of the bit
+         patterns is then the table index i = 64 * r. */
+      float z = a + Shift;
+      uint32_t i = asuint (z) - asuint (Shift);
+      float r = z - Shift;
+
+      /* These values are stored scaled by 2^47 (see erfcf_data.c). */
+      float erfcr = __erfcf_data.tab[i].erfc;
+      float scale = __erfcf_data.tab[i].scale;
+
+      /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */
+      float d = a - r;
+      float d2 = d * d;
+      float r2 = r * r;
+      float p1 = -r;
+      float p2 = fmaf (TwoThird, r2, -OneThird);
+      float p3 = -r * fmaf (OneThird, r2, -0.5f);
+      float p4 = fmaf (fmaf (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
+      float y = fmaf (p4, d, p3);
+      y = fmaf (y, d, p2);
+      y = fmaf (y, d, p1);
+      y = fmaf (-fmaf (y, d2, d), scale, erfcr);
+      /* Handle sign and scale back in a single fma: off is 0.0 for
+         positive x and 2.0 for negative x, fac is +/-0x1p-47f. */
+      float off = asfloat (sign >> 1);
+      float fac = asfloat (asuint (0x1p-47f) | sign);
+      y = fmaf (y, fac, off);
+      /* The underflow exception needs to be signaled explicitly when
+         result gets into subnormal range. */
+      if (x >= 0x1.2639cp+3f)
+        force_eval_float (opt_barrier_float (0x1p-123f) * 0x1p-123f);
+      return y;
+    }
+
+  /* erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2. */
+  if (unlikely (ia >= 0x7f800000))
+    return asfloat (sign >> 1) + 1.0f / x; /* Special cases. */
+
+  /* Above this threshold erfcf is constant and needs to raise underflow
+     exception for positive x.
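+     (At the cut-off, erfc(10.0625) is already below 2^-150, half of the
+     smallest positive subnormal, so the correctly rounded result for
+     positive x is +0; __math_uflowf (0) returns that zero and raises the
+     underflow exception, setting errno to ERANGE when errno reporting is
+     enabled.)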
*/ + return sign ? 2.0f : __math_uflowf (0); +} + +PL_SIG (S, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (erfcf, 1.14) +PL_TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000) +PL_TEST_INTERVAL (erfcf, 10.0625, inf, 40000) +PL_TEST_INTERVAL (erfcf, -4.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_2u.c b/contrib/arm-optimized-routines/pl/math/erfcf_2u.c deleted file mode 100644 index 5a3f9b00aa5c..000000000000 --- a/contrib/arm-optimized-routines/pl/math/erfcf_2u.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Single-precision erfc(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "erfcf.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define P(i) __erfcf_poly_data.poly[i] - -/* Approximation of erfcf for |x| > 4.0. */ -static inline float -approx_erfcf_hi (float x, uint32_t sign, const double *coeff) -{ - if (sign) - { - return 2.0f; - } - - /* Polynomial contribution. */ - double z = (double) fabs (x); - float p = (float) eval_poly (z, coeff); - /* Gaussian contribution. */ - float e_mx2 = (float) eval_exp_mx2 (z); - - return p * e_mx2; -} - -/* Approximation of erfcf for |x| < 4.0. */ -static inline float -approx_erfcf_lo (float x, uint32_t sign, const double *coeff) -{ - /* Polynomial contribution. */ - double z = (double) fabs (x); - float p = (float) eval_poly (z, coeff); - /* Gaussian contribution. */ - float e_mx2 = (float) eval_exp_mx2 (z); - - if (sign) - return fmaf (-p, e_mx2, 2.0f); - else - return p * e_mx2; -} - -/* Top 12 bits of a float (sign and exponent bits). */ -static inline uint32_t -abstop12 (float x) -{ - return (asuint (x) >> 20) & 0x7ff; -} - -/* Top 12 bits of a float. */ -static inline uint32_t -top12 (float x) -{ - return asuint (x) >> 20; -} - -/* Fast erfcf approximation using polynomial approximation - multiplied by gaussian. - Most of the computation is carried out in double precision, - and is very sensitive to accuracy of polynomial and exp - evaluation. - Worst-case error is 1.968ulps, obtained for x = 2.0412941. - erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp - err 1.46788. */ -float -erfcf (float x) -{ - /* Get top words and sign. */ - uint32_t ix = asuint (x); /* We need to compare at most 32 bits. */ - uint32_t sign = ix >> 31; - uint32_t ia12 = top12 (x) & 0x7ff; - - /* Handle special cases and small values with a single comparison: - abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) - - Special cases - erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2 - - Errno - EDOM does not have to be set in case of erfcf(nan). - Only ERANGE may be set in case of underflow. - - Small values (|x| accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */ - if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328)) - { - if (abstop12 (x) >= 0x7f8) - return (float) (sign << 1) + 1.0f / x; /* Special cases. */ - else - return 1.0f - x; /* Small case. */ - } - - /* Normalized numbers divided in 4 intervals - with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for - the interesting region as it is the smallest value, representable as a - 12-bit integer, for which returning 0 gives <1.5 ULP. 
*/ - if (ia12 < 0x400) - { - return approx_erfcf_lo (x, sign, P (0)); - } - if (ia12 < 0x408) - { - return approx_erfcf_lo (x, sign, P (1)); - } - if (ia12 < 0x410) - { - return approx_erfcf_hi (x, sign, P (2)); - } - if (ia12 < 0x412) - { - return approx_erfcf_hi (x, sign, P (3)); - } - if (sign) - { - return 2.0f; - } - return __math_uflowf (0); -} - -PL_SIG (S, F, 1, erfc, -4.0, 10.0) -PL_TEST_ULP (erfcf, 1.5) -PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (erfcf, 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (erfcf, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_data.c b/contrib/arm-optimized-routines/pl/math/erfcf_data.c index 2e018c8c6710..a54e11973819 100644 --- a/contrib/arm-optimized-routines/pl/math/erfcf_data.c +++ b/contrib/arm-optimized-routines/pl/math/erfcf_data.c @@ -1,57 +1,664 @@ /* * Data used in single-precision erfc(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double - precision. Generated using the Remez algorithm on each interval separately - (see erfcf.sollya for more detail). */ -const struct erfcf_poly_data __erfcf_poly_data - = {.poly - = {{ -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1, - -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2, - 0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5, - -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9, - 0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17, - -0x1.32712a6275c4dp-21 -#endif +/* Lookup table used in erfcf. + For each possible rounded input r (multiples of 1/64), between + r = 0.0 and r = 10.0625 (645 values): + - the first entry __erfcf_data.tab.erfc contains the values of erfc(r), + - the second entry __erfcf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore + they are scaled by a large enough value 2^47 (fits in 8 bits). 
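+   For example, entry i corresponds to r = i/64: entry 0 stores
+   erfc(0) * 2^47 = 0x1p47 and (2/sqrt(pi)) * 2^47 ~ 0x1.20dd76p47.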
*/ +const struct erfcf_data __erfcf_data = { + .tab = { { 0x1p47, 0x1.20dd76p47 }, + { 0x1.f6f944p46, 0x1.20cb68p47 }, + { 0x1.edf3aap46, 0x1.209546p47 }, + { 0x1.e4f05p46, 0x1.203b26p47 }, + { 0x1.dbf056p46, 0x1.1fbd28p47 }, + { 0x1.d2f4dcp46, 0x1.1f1b7ap47 }, + { 0x1.c9fefep46, 0x1.1e565cp47 }, + { 0x1.c10fd4p46, 0x1.1d6e14p47 }, + { 0x1.b8287ap46, 0x1.1c62fap47 }, + { 0x1.af4ap46, 0x1.1b3572p47 }, + { 0x1.a6757ep46, 0x1.19e5eap47 }, + { 0x1.9dabfcp46, 0x1.1874dep47 }, + { 0x1.94ee88p46, 0x1.16e2d8p47 }, + { 0x1.8c3e24p46, 0x1.153068p47 }, + { 0x1.839bd6p46, 0x1.135e3p47 }, + { 0x1.7b0894p46, 0x1.116cd8p47 }, + { 0x1.728558p46, 0x1.0f5d16p47 }, + { 0x1.6a1312p46, 0x1.0d2fa6p47 }, + { 0x1.61b2acp46, 0x1.0ae55p47 }, + { 0x1.596508p46, 0x1.087ee4p47 }, + { 0x1.512b06p46, 0x1.05fd3ep47 }, + { 0x1.49057ap46, 0x1.03614p47 }, + { 0x1.40f536p46, 0x1.00abdp47 }, + { 0x1.38fbp46, 0x1.fbbbbep46 }, + { 0x1.311796p46, 0x1.f5f0cep46 }, + { 0x1.294bb4p46, 0x1.eff8c4p46 }, + { 0x1.21980ap46, 0x1.e9d5a8p46 }, + { 0x1.19fd3ep46, 0x1.e38988p46 }, + { 0x1.127bf2p46, 0x1.dd167cp46 }, + { 0x1.0b14bcp46, 0x1.d67ea2p46 }, + { 0x1.03c82ap46, 0x1.cfc41ep46 }, + { 0x1.f92d8cp45, 0x1.c8e91cp46 }, + { 0x1.eb0214p45, 0x1.c1efcap46 }, + { 0x1.dd0edap45, 0x1.bada5ap46 }, + { 0x1.cf54b4p45, 0x1.b3aafcp46 }, + { 0x1.c1d46ap45, 0x1.ac63e8p46 }, + { 0x1.b48eaep45, 0x1.a5074ep46 }, + { 0x1.a78428p45, 0x1.9d9762p46 }, + { 0x1.9ab566p45, 0x1.96165p46 }, + { 0x1.8e22eap45, 0x1.8e8646p46 }, + { 0x1.81cd24p45, 0x1.86e96ap46 }, + { 0x1.75b47p45, 0x1.7f41dcp46 }, + { 0x1.69d91ep45, 0x1.7791b8p46 }, + { 0x1.5e3b66p45, 0x1.6fdb12p46 }, + { 0x1.52db78p45, 0x1.681ff2p46 }, + { 0x1.47b96ep45, 0x1.60625cp46 }, + { 0x1.3cd554p45, 0x1.58a446p46 }, + { 0x1.322f26p45, 0x1.50e79ep46 }, + { 0x1.27c6d2p45, 0x1.492e42p46 }, + { 0x1.1d9c34p45, 0x1.417a0cp46 }, + { 0x1.13af1ep45, 0x1.39ccc2p46 }, + { 0x1.09ff5p45, 0x1.32281ep46 }, + { 0x1.008c8p45, 0x1.2a8dcep46 }, + { 0x1.eeaca8p44, 0x1.22ff72p46 }, + { 0x1.dcb8cap44, 0x1.1b7e98p46 }, + { 0x1.cb3c86p44, 0x1.140cc4p46 }, + { 0x1.ba36dap44, 0x1.0cab62p46 }, + { 0x1.a9a6bap44, 0x1.055bd6p46 }, + { 0x1.998afap44, 0x1.fc3ee6p45 }, + { 0x1.89e25ep44, 0x1.edeeeep45 }, + { 0x1.7aab98p44, 0x1.dfca26p45 }, + { 0x1.6be542p44, 0x1.d1d2dp45 }, + { 0x1.5d8decp44, 0x1.c40b08p45 }, + { 0x1.4fa40ep44, 0x1.b674c8p45 }, + { 0x1.422616p44, 0x1.a911fp45 }, + { 0x1.351262p44, 0x1.9be438p45 }, + { 0x1.28674p44, 0x1.8eed36p45 }, + { 0x1.1c22f8p44, 0x1.822e66p45 }, + { 0x1.1043c2p44, 0x1.75a91ap45 }, + { 0x1.04c7cap44, 0x1.695e8cp45 }, + { 0x1.f35a72p43, 0x1.5d4fd4p45 }, + { 0x1.dde456p43, 0x1.517de6p45 }, + { 0x1.c9296cp43, 0x1.45e99cp45 }, + { 0x1.b525d6p43, 0x1.3a93b2p45 }, + { 0x1.a1d5a6p43, 0x1.2f7cc4p45 }, + { 0x1.8f34eap43, 0x1.24a554p45 }, + { 0x1.7d3fa6p43, 0x1.1a0dc6p45 }, + { 0x1.6bf1dcp43, 0x1.0fb662p45 }, + { 0x1.5b4784p43, 0x1.059f5ap45 }, + { 0x1.4b3c98p43, 0x1.f79184p44 }, + { 0x1.3bcd14p43, 0x1.e4653p44 }, + { 0x1.2cf4eep43, 0x1.d1b982p44 }, + { 0x1.1eb024p43, 0x1.bf8e1cp44 }, + { 0x1.10fab8p43, 0x1.ade26cp44 }, + { 0x1.03d0acp43, 0x1.9cb5bep44 }, + { 0x1.ee5c18p42, 0x1.8c0732p44 }, + { 0x1.d61dd6p42, 0x1.7bd5c8p44 }, + { 0x1.bedec8p42, 0x1.6c2056p44 }, + { 0x1.a8973cp42, 0x1.5ce596p44 }, + { 0x1.933f9p42, 0x1.4e241ep44 }, + { 0x1.7ed03ap42, 0x1.3fda6cp44 }, + { 0x1.6b41ccp42, 0x1.3206dcp44 }, + { 0x1.588cf2p42, 0x1.24a7b8p44 }, + { 0x1.46aa72p42, 0x1.17bb2cp44 }, + { 0x1.359332p42, 0x1.0b3f52p44 }, + { 0x1.254038p42, 0x1.fe646p43 }, + { 0x1.15aaa8p42, 0x1.e72372p43 }, + { 0x1.06cbcap42, 0x1.d0b7ap43 }, 
+ { 0x1.f13a04p41, 0x1.bb1c98p43 }, + { 0x1.d62fbep41, 0x1.a64de6p43 }, + { 0x1.bc6c1ep41, 0x1.92470ap43 }, + { 0x1.a3e2ccp41, 0x1.7f036cp43 }, + { 0x1.8c87b8p41, 0x1.6c7e64p43 }, + { 0x1.764f2p41, 0x1.5ab342p43 }, + { 0x1.612d8ap41, 0x1.499d48p43 }, + { 0x1.4d17cap41, 0x1.3937b2p43 }, + { 0x1.3a03p41, 0x1.297dbap43 }, + { 0x1.27e498p41, 0x1.1a6a96p43 }, + { 0x1.16b24cp41, 0x1.0bf97ep43 }, + { 0x1.066222p41, 0x1.fc4b5ep42 }, + { 0x1.edd4d2p40, 0x1.e1d4dp42 }, + { 0x1.d08382p40, 0x1.c885ep42 }, + { 0x1.b4be2p40, 0x1.b0553p42 }, + { 0x1.9a7316p40, 0x1.99397ap42 }, + { 0x1.81915cp40, 0x1.83298ep42 }, + { 0x1.6a088p40, 0x1.6e1c58p42 }, + { 0x1.53c89ep40, 0x1.5a08e8p42 }, + { 0x1.3ec25ep40, 0x1.46e66cp42 }, + { 0x1.2ae6fap40, 0x1.34ac36p42 }, + { 0x1.18282ep40, 0x1.2351c2p42 }, + { 0x1.067844p40, 0x1.12ceb4p42 }, + { 0x1.eb940ep39, 0x1.031ad6p42 }, + { 0x1.cc2186p39, 0x1.e85c44p41 }, + { 0x1.ae808cp39, 0x1.cc018p41 }, + { 0x1.9299bp39, 0x1.b1160ap41 }, + { 0x1.785674p39, 0x1.978ae8p41 }, + { 0x1.5fa14ap39, 0x1.7f5188p41 }, + { 0x1.486586p39, 0x1.685bb6p41 }, + { 0x1.328f5ep39, 0x1.529b9ep41 }, + { 0x1.1e0be6p39, 0x1.3e03d8p41 }, + { 0x1.0ac8fcp39, 0x1.2a875cp41 }, + { 0x1.f16aaep38, 0x1.181984p41 }, + { 0x1.cf80d4p38, 0x1.06ae14p41 }, + { 0x1.afb4e2p38, 0x1.ec7262p40 }, + { 0x1.91e8bep38, 0x1.cd5ecap40 }, + { 0x1.75ffb4p38, 0x1.b00b38p40 }, + { 0x1.5bde72p38, 0x1.94624ep40 }, + { 0x1.436af4p38, 0x1.7a4f6ap40 }, + { 0x1.2c8c7ap38, 0x1.61beaep40 }, + { 0x1.172b7ap38, 0x1.4a9cf6p40 }, + { 0x1.033198p38, 0x1.34d7dcp40 }, + { 0x1.e11332p37, 0x1.205dacp40 }, + { 0x1.be3ebp37, 0x1.0d1d6ap40 }, + { 0x1.9dbf72p37, 0x1.f60d8ap39 }, + { 0x1.7f714p37, 0x1.d4143ap39 }, + { 0x1.6331cap37, 0x1.b430ecp39 }, + { 0x1.48e09cp37, 0x1.9646f4p39 }, + { 0x1.305ef8p37, 0x1.7a3adep39 }, + { 0x1.198fd6p37, 0x1.5ff276p39 }, + { 0x1.0457c6p37, 0x1.4754acp39 }, + { 0x1.e139bcp36, 0x1.30499cp39 }, + { 0x1.bc8d52p36, 0x1.1aba78p39 }, + { 0x1.9a7c3p36, 0x1.06918cp39 }, + { 0x1.7adadep36, 0x1.e77448p38 }, + { 0x1.5d806ap36, 0x1.c4412cp38 }, + { 0x1.424642p36, 0x1.a36454p38 }, + { 0x1.290826p36, 0x1.84ba3p38 }, + { 0x1.11a3f8p36, 0x1.6821p38 }, + { 0x1.f7f358p35, 0x1.4d78bcp38 }, + { 0x1.cfd652p35, 0x1.34a306p38 }, + { 0x1.aab85ap35, 0x1.1d8318p38 }, + { 0x1.88647p35, 0x1.07fdb4p38 }, + { 0x1.68a8e4p35, 0x1.e7f232p37 }, + { 0x1.4b5726p35, 0x1.c2b9dp37 }, + { 0x1.30439cp35, 0x1.a02436p37 }, + { 0x1.174578p35, 0x1.8005fp37 }, + { 0x1.003692p35, 0x1.6235fcp37 }, + { 0x1.d5e678p34, 0x1.468daep37 }, + { 0x1.aeb442p34, 0x1.2ce898p37 }, + { 0x1.8a9848p34, 0x1.15246ep37 }, + { 0x1.695876p34, 0x1.fe41cep36 }, + { 0x1.4abea2p34, 0x1.d57f52p36 }, + { 0x1.2e984ep34, 0x1.afc85ep36 }, + { 0x1.14b676p34, 0x1.8ce75ep36 }, + { 0x1.f9daap33, 0x1.6caa0ep36 }, + { 0x1.ce283ap33, 0x1.4ee142p36 }, + { 0x1.a609f8p33, 0x1.3360ccp36 }, + { 0x1.81396ap33, 0x1.19ff46p36 }, + { 0x1.5f7524p33, 0x1.0295fp36 }, + { 0x1.40806ep33, 0x1.da011p35 }, + { 0x1.2422eep33, 0x1.b23a5ap35 }, + { 0x1.0a286p33, 0x1.8d986ap35 }, + { 0x1.e4c0bp32, 0x1.6be022p35 }, + { 0x1.b93bf4p32, 0x1.4cda54p35 }, + { 0x1.916f7cp32, 0x1.30539p35 }, + { 0x1.6d0e7p32, 0x1.161be4p35 }, + { 0x1.4bd1cp32, 0x1.fc0d56p34 }, + { 0x1.2d77bep32, 0x1.cfd4a6p34 }, + { 0x1.11c3bep32, 0x1.a74068p34 }, + { 0x1.f0fb86p31, 0x1.8208bcp34 }, + { 0x1.c2e43ep31, 0x1.5feadap34 }, + { 0x1.98e254p31, 0x1.40a8c2p34 }, + { 0x1.729df6p31, 0x1.2408eap34 }, + { 0x1.4fc63cp31, 0x1.09d5f8p34 }, + { 0x1.3010aap31, 0x1.e3bcf4p33 }, + { 0x1.1338b8p31, 0x1.b7e946p33 }, + { 0x1.f1fecp30, 0x1.8fdc1cp33 }, + { 0x1.c2556ap30, 
0x1.6b4702p33 }, + { 0x1.970b06p30, 0x1.49e178p33 }, + { 0x1.6fbddep30, 0x1.2b6876p33 }, + { 0x1.4c144ep30, 0x1.0f9e1cp33 }, + { 0x1.2bbc1ep30, 0x1.ec929ap32 }, + { 0x1.0e69f2p30, 0x1.be6abcp32 }, + { 0x1.e7b188p29, 0x1.94637ep32 }, + { 0x1.b792bcp29, 0x1.6e2368p32 }, + { 0x1.8c03d2p29, 0x1.4b581cp32 }, + { 0x1.649b02p29, 0x1.2bb5ccp32 }, + { 0x1.40f794p29, 0x1.0ef6c4p32 }, + { 0x1.20c13p29, 0x1.e9b5e8p31 }, + { 0x1.03a72ap29, 0x1.ba4f04p31 }, + { 0x1.d2bfc6p28, 0x1.8f4cccp31 }, + { 0x1.a35068p28, 0x1.684c22p31 }, + { 0x1.7885cep28, 0x1.44f21ep31 }, + { 0x1.51f06ap28, 0x1.24eb72p31 }, + { 0x1.2f2aaap28, 0x1.07ebd2p31 }, + { 0x1.0fd816p28, 0x1.db5adp30 }, + { 0x1.e7493p27, 0x1.abe09ep30 }, + { 0x1.b48774p27, 0x1.80f43ap30 }, + { 0x1.86e006p27, 0x1.5a2aep30 }, + { 0x1.5dd4bp27, 0x1.37231p30 }, + { 0x1.38f2e8p27, 0x1.1783cep30 }, + { 0x1.17d2c6p27, 0x1.f5f7d8p29 }, + { 0x1.f42c18p26, 0x1.c282cep29 }, + { 0x1.beceb2p26, 0x1.94219cp29 }, + { 0x1.8ef2aap26, 0x1.6a5972p29 }, + { 0x1.640bf6p26, 0x1.44ba86p29 }, + { 0x1.3d9be6p26, 0x1.22df2ap29 }, + { 0x1.1b2fe4p26, 0x1.046aeap29 }, + { 0x1.f8c0c2p25, 0x1.d21398p28 }, + { 0x1.c19fa8p25, 0x1.a0df1p28 }, + { 0x1.90538cp25, 0x1.74adc8p28 }, + { 0x1.6443fep25, 0x1.4d0232p28 }, + { 0x1.3ce784p25, 0x1.296a7p28 }, + { 0x1.19c232p25, 0x1.097f62p28 }, + { 0x1.f4c8c4p24, 0x1.d9c736p27 }, + { 0x1.bcd30ep24, 0x1.a6852cp27 }, + { 0x1.8aee4cp24, 0x1.789fb8p27 }, + { 0x1.5e77b6p24, 0x1.4f8c96p27 }, + { 0x1.36dcf2p24, 0x1.2acee2p27 }, + { 0x1.139a7cp24, 0x1.09f5dp27 }, + { 0x1.e8747p23, 0x1.d9371ep26 }, + { 0x1.b0a44ap23, 0x1.a4c89ep26 }, + { 0x1.7f064ap23, 0x1.75fa8ep26 }, + { 0x1.52efep23, 0x1.4c37cp26 }, + { 0x1.2bc82ap23, 0x1.26f9ep26 }, + { 0x1.09064p23, 0x1.05c804p26 }, + { 0x1.d45f16p22, 0x1.d06ad6p25 }, + { 0x1.9dacb2p22, 0x1.9bc0ap25 }, + { 0x1.6d3126p22, 0x1.6ce1aap25 }, + { 0x1.423d14p22, 0x1.43302cp25 }, + { 0x1.1c33cep22, 0x1.1e1e86p25 }, + { 0x1.f512dep21, 0x1.fa5b5p24 }, + { 0x1.b9823cp21, 0x1.bfd756p24 }, + { 0x1.84d6fep21, 0x1.8be4f8p24 }, + { 0x1.564a92p21, 0x1.5dcd66p24 }, + { 0x1.2d2c0ap21, 0x1.34ecf8p24 }, + { 0x1.08ddd2p21, 0x1.10b148p24 }, + { 0x1.d1a75p20, 0x1.e12eep23 }, + { 0x1.99218cp20, 0x1.a854eap23 }, + { 0x1.674c6ap20, 0x1.7603bap23 }, + { 0x1.3b62b6p20, 0x1.4980ccp23 }, + { 0x1.14b54p20, 0x1.2225b2p23 }, + { 0x1.e55102p19, 0x1.febc1p22 }, + { 0x1.a964eep19, 0x1.c14b22p22 }, + { 0x1.74b17ap19, 0x1.8b0cfcp22 }, + { 0x1.465daap19, 0x1.5b2fe6p22 }, + { 0x1.1da944p19, 0x1.30f93cp22 }, + { 0x1.f3d41p18, 0x1.0bc30cp22 }, + { 0x1.b512a2p18, 0x1.d5f3a8p21 }, + { 0x1.7e03b2p18, 0x1.9c3518p21 }, + { 0x1.4dbb98p18, 0x1.6961b8p21 }, + { 0x1.236a1ap18, 0x1.3cab14p21 }, + { 0x1.fcae94p17, 0x1.155a0ap21 }, + { 0x1.bbc1ap17, 0x1.e5989p20 }, + { 0x1.82eedcp17, 0x1.a8e406p20 }, + { 0x1.5139a6p17, 0x1.7397c6p20 }, + { 0x1.25c354p17, 0x1.44d26ep20 }, + { 0x1.ff8f84p16, 0x1.1bcca4p20 }, + { 0x1.bd3474p16, 0x1.efac52p19 }, + { 0x1.834586p16, 0x1.b0a68ap19 }, + { 0x1.50b75cp16, 0x1.7974e8p19 }, + { 0x1.249ef2p16, 0x1.4924a8p19 }, + { 0x1.fc5b88p15, 0x1.1edfa4p19 }, + { 0x1.b95ceep15, 0x1.f3d218p18 }, + { 0x1.7f03bap15, 0x1.b334fap18 }, + { 0x1.4c389cp15, 0x1.7ac2d8p18 }, + { 0x1.2006aep15, 0x1.4979acp18 }, + { 0x1.f32eap14, 0x1.1e767cp18 }, + { 0x1.b05cfep14, 0x1.f1e352p17 }, + { 0x1.764f46p14, 0x1.b0778cp17 }, + { 0x1.43e56cp14, 0x1.77756ep17 }, + { 0x1.18238p14, 0x1.45ce66p17 }, + { 0x1.e45a98p13, 0x1.1a95p17 }, + { 0x1.a284ccp13, 0x1.e9f2p16 }, + { 0x1.697596p13, 0x1.a887bep16 }, + { 0x1.3807acp13, 0x1.6fab64p16 }, + { 0x1.0d3b36p13, 0x1.3e44e4p16 }, + 
{ 0x1.d0624p12, 0x1.135f28p16 }, + { 0x1.904e0cp12, 0x1.dc479ep15 }, + { 0x1.58e72ap12, 0x1.9baed4p15 }, + { 0x1.2906ccp12, 0x1.63ac6cp15 }, + { 0x1.ff58dap11, 0x1.33225ap15 }, + { 0x1.b7f1f4p11, 0x1.0916fp15 }, + { 0x1.7a551p11, 0x1.c960cp14 }, + { 0x1.453142p11, 0x1.8a6174p14 }, + { 0x1.1761f8p11, 0x1.53e4f8p14 }, + { 0x1.dfd296p10, 0x1.24caf2p14 }, + { 0x1.9bd5fp10, 0x1.f830cp13 }, + { 0x1.61501p10, 0x1.b1e5acp13 }, + { 0x1.2ef6p10, 0x1.7538c6p13 }, + { 0x1.03a918p10, 0x1.40dfd8p13 }, + { 0x1.bce26ap9, 0x1.13bc08p13 }, + { 0x1.7cef42p9, 0x1.d9a88p12 }, + { 0x1.46056p9, 0x1.96a0b4p12 }, + { 0x1.16e3cap9, 0x1.5ce9acp12 }, + { 0x1.dcea68p8, 0x1.2b3e54p12 }, + { 0x1.97945ap8, 0x1.0085p12 }, + { 0x1.5c2828p8, 0x1.b7937ep11 }, + { 0x1.29415p8, 0x1.7872dap11 }, + { 0x1.fb58fap7, 0x1.423acp11 }, + { 0x1.b0c1a8p7, 0x1.13af5p11 }, + { 0x1.70f474p7, 0x1.d77f0cp10 }, + { 0x1.3a68a8p7, 0x1.92ff34p10 }, + { 0x1.0bcc6p7, 0x1.5847eep10 }, + { 0x1.c7fa0cp6, 0x1.25f9eep10 }, + { 0x1.8401b6p6, 0x1.f5cc78p9 }, + { 0x1.4a029ap6, 0x1.ac0f6p9 }, + { 0x1.188c46p6, 0x1.6cfa9cp9 }, + { 0x1.dcc4fap5, 0x1.370ab8p9 }, + { 0x1.94ec06p5, 0x1.08f24p9 }, + { 0x1.57bc96p5, 0x1.c324c2p8 }, + { 0x1.23a81ap5, 0x1.7fe904p8 }, + { 0x1.eeb278p4, 0x1.46897ep8 }, + { 0x1.a35794p4, 0x1.159a38p8 }, + { 0x1.634b8p4, 0x1.d7c594p7 }, + { 0x1.2ce2a4p4, 0x1.90ae4ep7 }, + { 0x1.fd5f08p3, 0x1.5422fp7 }, + { 0x1.aef3cep3, 0x1.20998p7 }, + { 0x1.6c6e62p3, 0x1.e98102p6 }, + { 0x1.3407b6p3, 0x1.9eee06p6 }, + { 0x1.043bap3, 0x1.5f8b88p6 }, + { 0x1.b77e5cp2, 0x1.29b294p6 }, + { 0x1.72f0c4p2, 0x1.f7f338p5 }, + { 0x1.38ee18p2, 0x1.aa5772p5 }, + { 0x1.07dd68p2, 0x1.68823ep5 }, + { 0x1.bcc58ep1, 0x1.30b14ep5 }, + { 0x1.76aca4p1, 0x1.01647cp5 }, + { 0x1.3b7912p1, 0x1.b2a87ep4 }, + { 0x1.097f82p1, 0x1.6ed2f2p4 }, + { 0x1.beaa3ep0, 0x1.356cd6p4 }, + { 0x1.778be2p0, 0x1.04e15ep4 }, + { 0x1.3b9984p0, 0x1.b7b04p3 }, + { 0x1.09182cp0, 0x1.725862p3 }, + { 0x1.bd20fcp-1, 0x1.37c92cp3 }, + { 0x1.75892p-1, 0x1.065b96p3 }, + { 0x1.394e7ap-1, 0x1.b950d4p2 }, + { 0x1.06a996p-1, 0x1.72fd94p2 }, + { 0x1.b8328ep-2, 0x1.37b83cp2 }, + { 0x1.70aff4p-2, 0x1.05ca5p2 }, + { 0x1.34a53cp-2, 0x1.b7807ep1 }, + { 0x1.0241dep-2, 0x1.70bebp1 }, + { 0x1.affb9p-3, 0x1.353a6cp1 }, + { 0x1.691c7cp-3, 0x1.0330fp1 }, + { 0x1.2db8cap-3, 0x1.b24a16p0 }, + { 0x1.f7f4f8p-4, 0x1.6ba91ap0 }, + { 0x1.a4ab64p-4, 0x1.305e98p0 }, + { 0x1.5efa4ep-4, 0x1.fd3de2p-1 }, + { 0x1.24b0d8p-4, 0x1.a9cc94p-1 }, + { 0x1.e7eeap-5, 0x1.63daf8p-1 }, + { 0x1.96826ep-5, 0x1.294176p-1 }, + { 0x1.5282d2p-5, 0x1.f05e82p-2 }, + { 0x1.19c05p-5, 0x1.9e39dcp-2 }, + { 0x1.d4ca9cp-6, 0x1.5982p-2 }, + { 0x1.85cfacp-6, 0x1.200c8ap-2 }, + { 0x1.43fb32p-6, 0x1.e00e92p-3 }, + { 0x1.0d2382p-6, 0x1.8fd4ep-3 }, + { 0x1.bef1b2p-7, 0x1.4cd9cp-3 }, + { 0x1.72ede4p-7, 0x1.14f48ap-3 }, + { 0x1.33b1cap-7, 0x1.ccaaeap-4 }, + { 0x1.fe3bdp-8, 0x1.7eef14p-4 }, + { 0x1.a6d7d2p-8, 0x1.3e2964p-4 }, + { 0x1.5e4062p-8, 0x1.083768p-4 }, + { 0x1.21fb7ap-8, 0x1.b69f1p-5 }, + { 0x1.dfefbep-9, 0x1.6be574p-5 }, + { 0x1.8cf816p-9, 0x1.2dc11ap-5 }, + { 0x1.482fa8p-9, 0x1.f4343cp-6 }, + { 0x1.0f30c4p-9, 0x1.9e614ep-6 }, + { 0x1.bff86ep-10, 0x1.571d34p-6 }, + { 0x1.71d0b6p-10, 0x1.1bf742p-6 }, + { 0x1.3125f6p-10, 0x1.d5cc6cp-7 }, + { 0x1.f755eap-11, 0x1.846e9ep-7 }, + { 0x1.9eebaap-11, 0x1.410048p-7 }, + { 0x1.55df18p-11, 0x1.09258p-7 }, + { 0x1.198c18p-11, 0x1.b5ceb6p-8 }, + { 0x1.cf82ep-12, 0x1.69468p-8 }, + { 0x1.7d5af6p-12, 0x1.29f9e8p-8 }, + { 0x1.399c28p-12, 0x1.eb4b9ep-9 }, + { 0x1.01c65ap-12, 0x1.94d1dep-9 }, + { 0x1.a78e82p-13, 
0x1.4d6706p-9 }, + { 0x1.5bcf92p-13, 0x1.127346p-9 }, + { 0x1.1d791cp-13, 0x1.c39fap-10 }, + { 0x1.d463dcp-14, 0x1.73679cp-10 }, + { 0x1.8011fcp-14, 0x1.314916p-10 }, + { 0x1.3ac71cp-14, 0x1.f5a11ap-11 }, + { 0x1.01dcc2p-14, 0x1.9beca8p-11 }, + { 0x1.a6459cp-15, 0x1.52189ap-11 }, + { 0x1.59962ap-15, 0x1.155d48p-11 }, + { 0x1.1ab0e4p-15, 0x1.c6dc8ap-12 }, + { 0x1.ce42dep-16, 0x1.74ca88p-12 }, + { 0x1.79c43p-16, 0x1.31612ap-12 }, + { 0x1.349128p-16, 0x1.f4125ap-13 }, + { 0x1.f7d80ep-17, 0x1.993e82p-13 }, + { 0x1.9b270cp-17, 0x1.4ec006p-13 }, + { 0x1.4f59fap-17, 0x1.11aebp-13 }, + { 0x1.1164acp-17, 0x1.bf4ab2p-14 }, + { 0x1.bd8c96p-18, 0x1.6d561ep-14 }, + { 0x1.6ae172p-18, 0x1.2a406ep-14 }, + { 0x1.276874p-18, 0x1.e6bba6p-15 }, + { 0x1.e0bad2p-19, 0x1.8cf814p-15 }, + { 0x1.86f788p-19, 0x1.4399f8p-15 }, + { 0x1.3dcfaep-19, 0x1.07aa3p-15 }, + { 0x1.023828p-19, 0x1.ad7302p-16 }, + { 0x1.a3666ep-20, 0x1.5d90f4p-16 }, + { 0x1.546e38p-20, 0x1.1c674ep-16 }, + { 0x1.143264p-20, 0x1.ce8ccp-17 }, + { 0x1.bff316p-21, 0x1.77f562p-17 }, + { 0x1.6b13ecp-21, 0x1.316da8p-17 }, + { 0x1.2624f4p-21, 0x1.f0046p-18 }, + { 0x1.dc5de4p-22, 0x1.92920ap-18 }, + { 0x1.818d3ap-22, 0x1.4691b2p-18 }, + { 0x1.37e62p-22, 0x1.08c96ap-18 }, + { 0x1.f8637ep-23, 0x1.ad2d0ap-19 }, + { 0x1.97a3dcp-23, 0x1.5ba462p-19 }, + { 0x1.494a4p-23, 0x1.1975ep-19 }, + { 0x1.09dee4p-23, 0x1.c78892p-20 }, + { 0x1.ad1fap-24, 0x1.7073c4p-20 }, + { 0x1.5a245ep-24, 0x1.29df48p-20 }, + { 0x1.171278p-24, 0x1.e163bep-21 }, + { 0x1.c1c74cp-25, 0x1.84cbbp-21 }, + { 0x1.6a46f4p-25, 0x1.39dbcep-21 }, + { 0x1.23a858p-25, 0x1.fa7b92p-22 }, + { 0x1.d56196p-26, 0x1.9876ap-22 }, + { 0x1.7984b6p-26, 0x1.4940bcp-22 }, + { 0x1.2f7cc4p-26, 0x1.094608p-22 }, + { 0x1.e7b62cp-27, 0x1.ab3e8cp-23 }, + { 0x1.87b15ep-27, 0x1.57e33ep-23 }, + { 0x1.3a6dp-27, 0x1.14a8b6p-23 }, + { 0x1.f88ebap-28, 0x1.bcede6p-24 }, + { 0x1.94a282p-28, 0x1.659918p-24 }, + { 0x1.44580ap-28, 0x1.1f4498p-24 }, + { 0x1.03dbf8p-28, 0x1.cd5086p-25 }, + { 0x1.a03066p-29, 0x1.723974p-25 }, + { 0x1.4d1f2ep-29, 0x1.28f9cap-25 }, + { 0x1.0a814ap-29, 0x1.dc34b6p-26 }, + { 0x1.aa36cap-30, 0x1.7d9dbp-26 }, + { 0x1.54a6b6p-30, 0x1.31aa56p-26 }, + { 0x1.102232p-30, 0x1.e96c26p-27 }, + { 0x1.b2959ep-31, 0x1.87a218p-27 }, + { 0x1.5ad66cp-31, 0x1.393ad2p-27 }, + { 0x1.14ac7ep-31, 0x1.f4ccdap-28 }, + { 0x1.b931b8p-32, 0x1.9026a8p-28 }, + { 0x1.5f9a24p-32, 0x1.3f92eap-28 }, + { 0x1.181154p-32, 0x1.fe3208p-29 }, + { 0x1.bdf55ep-33, 0x1.970fbp-29 }, + { 0x1.62e226p-33, 0x1.449de6p-29 }, + { 0x1.1a4576p-33, 0x1.02be7p-29 }, + { 0x1.c0d0bep-34, 0x1.9c4672p-30 }, + { 0x1.64a386p-34, 0x1.484b1ep-30 }, + { 0x1.1b418cp-34, 0x1.054a9ap-30 }, + { 0x1.c1ba4ap-35, 0x1.9fb994p-31 }, + { 0x1.64d86p-35, 0x1.4a8e4ep-31 }, + { 0x1.1b0242p-35, 0x1.06b4fep-31 }, + { 0x1.c0aee6p-36, 0x1.a15d86p-32 }, + { 0x1.637ffap-36, 0x1.4b5fdep-32 }, + { 0x1.198862p-36, 0x1.06f8dap-32 }, + { 0x1.bdb204p-37, 0x1.a12cc8p-33 }, + { 0x1.609ec2p-37, 0x1.4abd0ap-33 }, + { 0x1.16d8d2p-37, 0x1.06154ap-33 }, + { 0x1.b8cd88p-38, 0x1.9f27fap-34 }, + { 0x1.5c3e42p-38, 0x1.48a7fcp-34 }, + { 0x1.12fc6cp-38, 0x1.040d4ap-34 }, + { 0x1.b2119p-39, 0x1.9b55e8p-35 }, + { 0x1.566cep-39, 0x1.4527acp-35 }, + { 0x1.0dffep-39, 0x1.00e7acp-35 }, + { 0x1.a99426p-40, 0x1.95c358p-36 }, + { 0x1.4f3d92p-40, 0x1.4047cep-36 }, + { 0x1.07f35ep-40, 0x1.f95dcep-37 }, + { 0x1.9f70cp-41, 0x1.8e82cep-37 }, + { 0x1.46c77ap-41, 0x1.3a1882p-37 }, + { 0x1.00ea48p-41, 0x1.eee1d4p-38 }, + { 0x1.93c7acp-42, 0x1.85ac18p-38 }, + { 0x1.3d256ap-42, 0x1.32ae04p-38 }, + { 0x1.f1f59p-43, 
0x1.e27d88p-39 }, + { 0x1.86bd6ap-43, 0x1.7b5bdap-39 }, + { 0x1.327554p-43, 0x1.2a2036p-39 }, + { 0x1.e07ab4p-44, 0x1.d458ap-40 }, + { 0x1.7879ecp-44, 0x1.6fb2eap-40 }, + { 0x1.26d7bp-44, 0x1.208a2cp-40 }, + { 0x1.cd98a2p-45, 0x1.c49f8ap-41 }, + { 0x1.6927c2p-45, 0x1.62d5aap-41 }, + { 0x1.1a6ed6p-45, 0x1.16098ep-41 }, + { 0x1.b986acp-46, 0x1.b3828ep-42 }, + { 0x1.58f35ap-46, 0x1.54eb3ep-42 }, + { 0x1.0d5e6p-46, 0x1.0abe0ep-42 }, + { 0x1.a47db6p-47, 0x1.a134d4p-43 }, + { 0x1.480a18p-47, 0x1.461cdap-43 }, + { 0x1.ff94e4p-48, 0x1.fd9182p-44 }, + { 0x1.8eb738p-48, 0x1.8deb62p-44 }, + { 0x1.369994p-48, 0x1.3694e8p-44 }, + { 0x1.e3ae4ap-49, 0x1.e49706p-45 }, + { 0x1.786c3ep-49, 0x1.79dc28p-45 }, + { 0x1.24cec8p-49, 0x1.267e46p-45 }, + { 0x1.c74fc4p-50, 0x1.cad0bp-46 }, + { 0x1.61d46cp-50, 0x1.653d08p-46 }, + { 0x1.12d55cp-50, 0x1.16038cp-46 }, + { 0x1.aabdacp-51, 0x1.b081aap-47 }, + { 0x1.4b252ep-51, 0x1.5042e2p-47 }, + { 0x1.00d6f8p-51, 0x1.054e44p-47 }, + { 0x1.8e38ep-52, 0x1.95eb2cp-48 }, + { 0x1.3490e8p-52, 0x1.3b20c6p-48 }, + { 0x1.ddf56ap-53, 0x1.e90cb6p-49 }, + { 0x1.71fdep-53, 0x1.7b4b76p-49 }, + { 0x1.1e465ap-53, 0x1.26072ap-49 }, + { 0x1.bac92ep-54, 0x1.c7a2ecp-50 }, + { 0x1.56441cp-54, 0x1.60dcfp-50 }, + { 0x1.08700cp-54, 0x1.112346p-50 }, + { 0x1.986a66p-55, 0x1.a6a50ap-51 }, + { 0x1.3b3d56p-55, 0x1.46d572p-51 }, + { 0x1.e667dap-56, 0x1.f93d0ep-52 }, + { 0x1.7712b8p-56, 0x1.86529ep-52 }, + { 0x1.211544p-56, 0x1.2d65aep-52 }, + { 0x1.bd660ap-57, 0x1.d13c32p-53 }, + { 0x1.56f3eep-57, 0x1.66e45ap-53 }, + { 0x1.07f14ap-57, 0x1.14b8b6p-53 }, + { 0x1.96129cp-58, 0x1.aa854cp-54 }, + { 0x1.3837cp-58, 0x1.488b94p-54 }, + { 0x1.dfe0c2p-59, 0x1.f9e772p-55 }, + { 0x1.709b5ap-59, 0x1.85503p-55 }, + { 0x1.1affd2p-59, 0x1.2b7218p-55 }, + { 0x1.b2564p-60, 0x1.cc6bb6p-56 }, + { 0x1.4d23fap-60, 0x1.61cb1ap-56 }, + { 0x1.fecbdp-61, 0x1.0fba0ep-56 }, + { 0x1.8767d8p-61, 0x1.a13072p-57 }, + { 0x1.2bc67ep-61, 0x1.401abcp-57 }, + { 0x1.caf846p-62, 0x1.eafc2cp-58 }, + { 0x1.5f2e7ap-62, 0x1.785cp-58 }, + { 0x1.0c93acp-62, 0x1.205a7ep-58 }, + { 0x1.9a9b06p-63, 0x1.b9a31ap-59 }, + { 0x1.39b7fcp-63, 0x1.520968p-59 }, + { 0x1.df277ap-64, 0x1.029ce6p-59 }, + { 0x1.6dbcdp-64, 0x1.8b81d6p-60 }, + { 0x1.17080ap-64, 0x1.2e48f2p-60 }, + { 0x1.a98e26p-65, 0x1.cdd86cp-61 }, + { 0x1.445a6ap-65, 0x1.60a47ap-61 }, + { 0x1.ee324ep-66, 0x1.0d210cp-61 }, + { 0x1.784e3p-66, 0x1.9a961ep-62 }, + { 0x1.1e65fep-66, 0x1.390b74p-62 }, + { 0x1.b3bb86p-67, 0x1.dd1e52p-63 }, + { 0x1.4b4e36p-67, 0x1.6b6a7ap-63 }, + { 0x1.f790f6p-68, 0x1.14acc2p-63 }, + { 0x1.7e82cep-68, 0x1.a511aap-64 }, + { 0x1.226a7ap-68, 0x1.404114p-64 }, + { 0x1.b8c634p-69, 0x1.e6ea96p-65 }, + { 0x1.4e53acp-69, 0x1.71f97ap-65 }, + { 0x1.faed5cp-70, 0x1.18fb2ep-65 }, + { 0x1.80217ep-70, 0x1.aa947ep-66 }, + { 0x1.22f066p-70, 0x1.43a796p-66 }, + { 0x1.b87f86p-71, 0x1.eae2fp-67 }, + { 0x1.4d4ec8p-71, 0x1.7414e6p-67 }, + { 0x1.f8283ep-72, 0x1.19e474p-67 }, + { 0x1.7d1b22p-72, 0x1.aaeb7ep-68 }, + { 0x1.1ff2dp-72, 0x1.431f66p-68 }, + { 0x1.b2e9e8p-73, 0x1.e8e272p-69 }, + { 0x1.4848dep-73, 0x1.71a91ep-69 }, + { 0x1.ef5b16p-74, 0x1.176014p-69 }, + { 0x1.758b92p-74, 0x1.a6137cp-70 }, + { 0x1.198d42p-74, 0x1.3ead74p-70 }, + { 0x1.a838bp-75, 0x1.e0fbc2p-71 }, + { 0x1.3f700cp-75, 0x1.6accaep-71 }, + { 0x1.e0d68ep-76, 0x1.118578p-71 }, + { 0x1.69b7f4p-76, 0x1.9c3974p-72 }, + { 0x1.0ffa12p-76, 0x1.367afap-72 }, + { 0x1.98cd1cp-77, 0x1.d377fap-73 }, + { 0x1.33148p-77, 0x1.5fbee6p-73 }, + { 0x1.cd1dbap-78, 0x1.088a8p-73 }, + { 0x1.5a0a9cp-78, 0x1.8db7ccp-74 }, + { 0x1.038ef4p-78, 
0x1.2ad2ecp-74 }, + { 0x1.85308ap-79, 0x1.c0d23ep-75 }, + { 0x1.23a3cp-79, 0x1.50e41ap-75 }, + { 0x1.b4de68p-80, 0x1.f980a8p-76 }, + { 0x1.470ce4p-80, 0x1.7b10fep-76 }, + { 0x1.e9700cp-81, 0x1.1c1d98p-76 }, + { 0x1.6e0c9p-81, 0x1.a9b08p-77 }, + { 0x1.11a25ap-81, 0x1.3ebfb4p-77 }, + { 0x1.98e73ap-82, 0x1.dd1d36p-78 }, + { 0x1.315f58p-82, 0x1.64e7fp-78 }, + { 0x1.c7e35cp-83, 0x1.0ada94p-78 }, + { 0x1.542176p-83, 0x1.8ed9e8p-79 }, + { 0x1.fb491ep-84, 0x1.29ecb2p-79 }, + { 0x1.7a1c34p-84, 0x1.bcdb34p-80 }, + { 0x1.19b0f2p-84, 0x1.4bf6cap-80 }, + { 0x1.a383cap-85, 0x1.ef3318p-81 }, + { 0x1.383bf2p-85, 0x1.712bc2p-81 }, + { 0x1.d08cdap-86, 0x1.13151p-81 }, + { 0x1.596adp-86, 0x1.99bf36p-82 }, + { 0x1.00b602p-86, 0x1.3104d6p-82 }, + { 0x1.7d62a2p-87, 0x1.c5e534p-83 }, + { 0x1.1b2abcp-87, 0x1.518db2p-83 }, + { 0x1.a4480ep-88, 0x1.f5d1c6p-84 }, + { 0x1.37be42p-88, 0x1.74d45ap-84 }, + { 0x1.ce3ee4p-89, 0x1.14dc4ap-84 }, + { 0x1.568986p-89, 0x1.9afd0ep-85 }, + { 0x1.fb69c6p-90, 0x1.30e632p-85 }, + { 0x1.77a47ep-90, 0x1.c42b48p-86 }, + { 0x1.15f4ep-90, 0x1.4f1f52p-86 }, + { 0x1.9b25dcp-91, 0x1.f08156p-87 }, + { 0x1.2feeeep-91, 0x1.6f9f62p-87 }, + { 0x1.c122bcp-92, 0x1.100ffap-87 }, + { 0x1.4bb154p-92, 0x1.927ce6p-88 }, + { 0x1.e9ae56p-93, 0x1.2992f4p-88 }, + { 0x1.6948e8p-93, 0x1.b7cccap-89 }, + { 0x1.0a6cd2p-93, 0x1.44d7c4p-89 }, + { 0x1.88c0cap-94, 0x1.dfa22p-90 }, + { 0x1.215988p-94, 0x1.61eb26p-90 }, + { 0x1.aa222ap-95, 0x1.0506e2p-90 }, + { 0x1.39a30ep-95, 0x1.80d828p-91 }, + { 0x1.cd740ep-96, 0x1.1b8f04p-91 }, + { 0x1.534d82p-96, 0x1.a1a7ecp-92 }, + { 0x1.f2bb06p-97, 0x1.336f3p-92 }, + { 0x1.6e5b34p-97, 0x1.c46172p-93 }, + { 0x1.0cfc82p-97, 0x1.4cab82p-93 }, + { 0x1.8acc82p-98, 0x1.e9094cp-94 }, + { 0x1.219686p-98, 0x1.67465p-94 }, + { 0x1.a89fa6p-99, 0x1.07d0b8p-94 }, + { 0x1.372982p-99, 0x1.833ffap-95 }, + { 0x1.c7d094p-100, 0x1.1c147ap-95 }, + { 0x1.4db1c8p-100, 0x1.a096ccp-96 }, + { 0x1.e858d8p-101, 0x1.314decp-96 }, + { 0x1.6529ep-101, 0x1.bf46cep-97 }, + { 0x1.0517bap-101, 0x1.47796ap-97 }, + { 0x1.7d8a8p-102, 0x1.df49a2p-98 }, + { 0x1.16a46p-102, 0x1.5e9198p-98 }, + { 0x1.96ca76p-103, 0x1.004b34p-98 }, + { 0x1.28cb2cp-103, 0x1.768f3ep-99 }, + { 0x1.b0de98p-104, 0x1.1190d2p-99 }, }, - - { -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1, - -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3, - 0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8, - -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15, - 0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27, - -0x1.8ec1581647f9fp-33 -#endif - }, - - { -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1, - -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6, - 0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13, - -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23, - 0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38, - -0x1.027034672f11cp-44 -#endif - }, - - { -#if ERFCF_POLY_NCOEFFS == 16 - 0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2, - -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8, - 0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17, - -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30, - 0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46, - -0x1.45abac612344bp-53 -#endif - }}}; + }; diff --git 
a/contrib/arm-optimized-routines/pl/math/erff_1u5.c b/contrib/arm-optimized-routines/pl/math/erff_1u5.c deleted file mode 100644 index 1a69872c43e5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/erff_1u5.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Single-precision erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "estrinf.h" -#include "hornerf.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f -#define A __erff_data.erff_poly_A -#define B __erff_data.erff_poly_B - -/* Top 12 bits of a float. */ -static inline uint32_t -top12 (float x) -{ - return asuint (x) >> 20; -} - -/* Efficient implementation of erff using either a pure polynomial approximation - or the exponential of a polynomial. Worst-case error is 1.09ulps at - 0x1.c111acp-1. */ -float -erff (float x) -{ - float r, x2; - - /* Get top word. */ - uint32_t ix = asuint (x); - uint32_t sign = ix >> 31; - uint32_t ia12 = top12 (x) & 0x7ff; - - /* Limit of both intervals is 0.875 for performance reasons but coefficients - computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy - from 0.94 to 1.1ulps. */ - if (ia12 < 0x3f6) - { /* a = |x| < 0.875. */ - - /* Tiny and subnormal cases. */ - if (unlikely (ia12 < 0x318)) - { /* |x| < 2^(-28). */ - if (unlikely (ia12 < 0x040)) - { /* |x| < 2^(-119). */ - float y = fmaf (TwoOverSqrtPiMinusOne, x, x); - return check_uflowf (y); - } - return x + TwoOverSqrtPiMinusOne * x; - } - - x2 = x * x; - - /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). - */ -#define C(i) A[i] - r = fmaf (HORNER_5 (x2, C), x, x); -#undef C - } - else if (ia12 < 0x408) - { /* |x| < 4.0 - Use a custom Estrin scheme. */ - - float a = fabsf (x); - /* Use Estrin scheme on high order (small magnitude) coefficients. */ -#define C(i) B[i] - r = ESTRIN_3_ (a, x * x, C, 3); -#undef C - /* Then switch to pure Horner scheme. */ - r = fmaf (r, a, B[2]); - r = fmaf (r, a, B[1]); - r = fmaf (r, a, B[0]); - r = fmaf (r, a, a); - /* Single precision exponential with ~0.5ulps ensures erff has maximum - relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on - [0.875, 4.0]. */ - r = expf (-r); - /* Explicit copysign (calling copysignf increases latency). */ - if (sign) - r = -1.0f + r; - else - r = 1.0f - r; - } - else - { /* |x| >= 4.0. */ - - /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ - if (unlikely (ia12 >= 0x7f8)) - return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; - - /* Explicit copysign (calling copysignf increases latency). */ - if (sign) - r = -1.0f; - else - r = 1.0f; - } - return r; -} - -PL_SIG (S, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (erff, 0.6) -PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) -PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (erff, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erff_2u.c b/contrib/arm-optimized-routines/pl/math/erff_2u.c new file mode 100644 index 000000000000..f43e647072f8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erff_2u.c @@ -0,0 +1,82 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define Shift 0x1p16f +#define OneThird 0x1.555556p-2f + +/* Fast erff approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3) ) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + ] + + This single precision implementation uses only the following terms: + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error: 1.93 ULP + erff(0x1.c373e6p-9) got 0x1.fd686cp-9 + want 0x1.fd6868p-9. */ +float +erff (float x) +{ + /* Get absolute value and sign. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix & ~0x7fffffff; + + /* |x| < 0x1p-62. Triggers exceptions. */ + if (unlikely (ia < 0x20800000)) + return fmaf (TwoOverSqrtPiMinusOne, x, x); + + if (ia < 0x407b8000) /* |x| < 4 - 8 / 128 = 3.9375. */ + { + /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale + to 2/sqrt(pi), when x reduced to r = 0. */ + float a = asfloat (ia); + float z = a + Shift; + uint32_t i = asuint (z) - asuint (Shift); + float r = z - Shift; + float erfr = __erff_data.tab[i].erf; + float scale = __erff_data.tab[i].scale; + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + float d = a - r; + float d2 = d * d; + float y = -fmaf (OneThird, d, r); + y = fmaf (fmaf (y, d2, d), scale, erfr); + return asfloat (asuint (y) | sign); + } + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ + if (unlikely (ia >= 0x7f800000)) + return (1.0f - (float) (sign >> 30)) + 1.0f / x; + + /* Boring domain (|x| >= 4.0). */ + return asfloat (sign | asuint (1.0f)); +} + +PL_SIG (S, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (erff, 1.43) +PL_TEST_SYM_INTERVAL (erff, 0, 3.9375, 40000) +PL_TEST_SYM_INTERVAL (erff, 3.9375, inf, 40000) +PL_TEST_SYM_INTERVAL (erff, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erff_data.c b/contrib/arm-optimized-routines/pl/math/erff_data.c index 2352baefd35f..84c0d2e95463 100644 --- a/contrib/arm-optimized-routines/pl/math/erff_data.c +++ b/contrib/arm-optimized-routines/pl/math/erff_data.c @@ -1,16 +1,532 @@ /* * Data for approximation of erff. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Minimax approximation of erff. */ -const struct erff_data __erff_data - = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, - -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f}, - .erff_poly_B - = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f, - 0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}}; +/* Lookup table used in erff. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 4.0 (513 values): + - the first entry __erff_data.tab.erf contains the values of erf(r), + - the second entry __erff_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. 
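As a worked sketch of the indexing used by erff_2u.c (illustrative only, not part of the patched sources): Shift = 0x1p16f has a ULP of 2^-7 = 1/128, so adding it rounds |x| to the nearest multiple of 1/128, and the low mantissa bits of the sum give the table index directly.

    float a = 0.3f;                               /* |x|, assumed below 3.9375 */
    float z = a + 0x1p16f;                        /* rounds a to nearest 1/128 */
    uint32_t i = asuint (z) - asuint (0x1p16f);   /* i = round (128 * a) = 38 */
    float r = z - 0x1p16f;                        /* r = 38.0f / 128 = 0.296875f */
    /* __erff_data.tab[38] then holds erf(r) and 2/sqrt(pi) * exp(-r*r). */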
*/ +const struct erff_data __erff_data = { + .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, + { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, + { 0x1.20d770p-6, 0x1.20cb68p+0 }, + { 0x1.b137e0p-6, 0x1.20b4d8p+0 }, + { 0x1.20c564p-5, 0x1.209546p+0 }, + { 0x1.68e5d4p-5, 0x1.206cb4p+0 }, + { 0x1.b0fafep-5, 0x1.203b26p+0 }, + { 0x1.f902a8p-5, 0x1.2000a0p+0 }, + { 0x1.207d48p-4, 0x1.1fbd28p+0 }, + { 0x1.44703ep-4, 0x1.1f70c4p+0 }, + { 0x1.68591ap-4, 0x1.1f1b7ap+0 }, + { 0x1.8c36bep-4, 0x1.1ebd56p+0 }, + { 0x1.b00812p-4, 0x1.1e565cp+0 }, + { 0x1.d3cbf8p-4, 0x1.1de698p+0 }, + { 0x1.f7815ap-4, 0x1.1d6e14p+0 }, + { 0x1.0d9390p-3, 0x1.1cecdcp+0 }, + { 0x1.1f5e1ap-3, 0x1.1c62fap+0 }, + { 0x1.311fc2p-3, 0x1.1bd07cp+0 }, + { 0x1.42d7fcp-3, 0x1.1b3572p+0 }, + { 0x1.548642p-3, 0x1.1a91e6p+0 }, + { 0x1.662a0cp-3, 0x1.19e5eap+0 }, + { 0x1.77c2d2p-3, 0x1.19318cp+0 }, + { 0x1.895010p-3, 0x1.1874dep+0 }, + { 0x1.9ad142p-3, 0x1.17aff0p+0 }, + { 0x1.ac45e4p-3, 0x1.16e2d8p+0 }, + { 0x1.bdad72p-3, 0x1.160da4p+0 }, + { 0x1.cf076ep-3, 0x1.153068p+0 }, + { 0x1.e05354p-3, 0x1.144b3cp+0 }, + { 0x1.f190aap-3, 0x1.135e30p+0 }, + { 0x1.015f78p-2, 0x1.12695ep+0 }, + { 0x1.09eed6p-2, 0x1.116cd8p+0 }, + { 0x1.127632p-2, 0x1.1068bap+0 }, + { 0x1.1af54ep-2, 0x1.0f5d16p+0 }, + { 0x1.236bf0p-2, 0x1.0e4a08p+0 }, + { 0x1.2bd9dcp-2, 0x1.0d2fa6p+0 }, + { 0x1.343ed6p-2, 0x1.0c0e0ap+0 }, + { 0x1.3c9aa8p-2, 0x1.0ae550p+0 }, + { 0x1.44ed18p-2, 0x1.09b590p+0 }, + { 0x1.4d35f0p-2, 0x1.087ee4p+0 }, + { 0x1.5574f4p-2, 0x1.07416cp+0 }, + { 0x1.5da9f4p-2, 0x1.05fd3ep+0 }, + { 0x1.65d4b8p-2, 0x1.04b27cp+0 }, + { 0x1.6df50ap-2, 0x1.036140p+0 }, + { 0x1.760abap-2, 0x1.0209a6p+0 }, + { 0x1.7e1594p-2, 0x1.00abd0p+0 }, + { 0x1.861566p-2, 0x1.fe8fb0p-1 }, + { 0x1.8e0a02p-2, 0x1.fbbbbep-1 }, + { 0x1.95f336p-2, 0x1.f8dc0ap-1 }, + { 0x1.9dd0d2p-2, 0x1.f5f0cep-1 }, + { 0x1.a5a2acp-2, 0x1.f2fa4cp-1 }, + { 0x1.ad6896p-2, 0x1.eff8c4p-1 }, + { 0x1.b52264p-2, 0x1.ecec78p-1 }, + { 0x1.bccfecp-2, 0x1.e9d5a8p-1 }, + { 0x1.c47104p-2, 0x1.e6b498p-1 }, + { 0x1.cc0584p-2, 0x1.e38988p-1 }, + { 0x1.d38d44p-2, 0x1.e054bep-1 }, + { 0x1.db081cp-2, 0x1.dd167cp-1 }, + { 0x1.e275eap-2, 0x1.d9cf06p-1 }, + { 0x1.e9d68ap-2, 0x1.d67ea2p-1 }, + { 0x1.f129d4p-2, 0x1.d32592p-1 }, + { 0x1.f86faap-2, 0x1.cfc41ep-1 }, + { 0x1.ffa7eap-2, 0x1.cc5a8ap-1 }, + { 0x1.03693ap-1, 0x1.c8e91cp-1 }, + { 0x1.06f794p-1, 0x1.c5701ap-1 }, + { 0x1.0a7ef6p-1, 0x1.c1efcap-1 }, + { 0x1.0dff50p-1, 0x1.be6872p-1 }, + { 0x1.117894p-1, 0x1.bada5ap-1 }, + { 0x1.14eab4p-1, 0x1.b745c6p-1 }, + { 0x1.1855a6p-1, 0x1.b3aafcp-1 }, + { 0x1.1bb95cp-1, 0x1.b00a46p-1 }, + { 0x1.1f15ccp-1, 0x1.ac63e8p-1 }, + { 0x1.226ae8p-1, 0x1.a8b828p-1 }, + { 0x1.25b8a8p-1, 0x1.a5074ep-1 }, + { 0x1.28ff02p-1, 0x1.a1519ep-1 }, + { 0x1.2c3decp-1, 0x1.9d9762p-1 }, + { 0x1.2f755cp-1, 0x1.99d8dap-1 }, + { 0x1.32a54cp-1, 0x1.961650p-1 }, + { 0x1.35cdb4p-1, 0x1.925008p-1 }, + { 0x1.38ee8ap-1, 0x1.8e8646p-1 }, + { 0x1.3c07cap-1, 0x1.8ab950p-1 }, + { 0x1.3f196ep-1, 0x1.86e96ap-1 }, + { 0x1.42236ep-1, 0x1.8316d6p-1 }, + { 0x1.4525c8p-1, 0x1.7f41dcp-1 }, + { 0x1.482074p-1, 0x1.7b6abcp-1 }, + { 0x1.4b1372p-1, 0x1.7791b8p-1 }, + { 0x1.4dfebap-1, 0x1.73b714p-1 }, + { 0x1.50e24cp-1, 0x1.6fdb12p-1 }, + { 0x1.53be26p-1, 0x1.6bfdf0p-1 }, + { 0x1.569244p-1, 0x1.681ff2p-1 }, + { 0x1.595ea6p-1, 0x1.644156p-1 }, + { 0x1.5c2348p-1, 0x1.60625cp-1 }, + { 0x1.5ee02ep-1, 0x1.5c8342p-1 }, + { 0x1.619556p-1, 0x1.58a446p-1 }, + { 0x1.6442c0p-1, 0x1.54c5a6p-1 }, + { 0x1.66e86ep-1, 0x1.50e79ep-1 }, + { 0x1.69865ep-1, 0x1.4d0a68p-1 }, + { 0x1.6c1c98p-1, 0x1.492e42p-1 }, + { 
0x1.6eab18p-1, 0x1.455366p-1 }, + { 0x1.7131e6p-1, 0x1.417a0cp-1 }, + { 0x1.73b102p-1, 0x1.3da26ep-1 }, + { 0x1.762870p-1, 0x1.39ccc2p-1 }, + { 0x1.789836p-1, 0x1.35f940p-1 }, + { 0x1.7b0058p-1, 0x1.32281ep-1 }, + { 0x1.7d60d8p-1, 0x1.2e5992p-1 }, + { 0x1.7fb9c0p-1, 0x1.2a8dcep-1 }, + { 0x1.820b12p-1, 0x1.26c508p-1 }, + { 0x1.8454d6p-1, 0x1.22ff72p-1 }, + { 0x1.869712p-1, 0x1.1f3d3cp-1 }, + { 0x1.88d1cep-1, 0x1.1b7e98p-1 }, + { 0x1.8b050ep-1, 0x1.17c3b6p-1 }, + { 0x1.8d30dep-1, 0x1.140cc4p-1 }, + { 0x1.8f5544p-1, 0x1.1059eep-1 }, + { 0x1.91724ap-1, 0x1.0cab62p-1 }, + { 0x1.9387f6p-1, 0x1.09014cp-1 }, + { 0x1.959652p-1, 0x1.055bd6p-1 }, + { 0x1.979d68p-1, 0x1.01bb2cp-1 }, + { 0x1.999d42p-1, 0x1.fc3ee6p-2 }, + { 0x1.9b95e8p-1, 0x1.f511aap-2 }, + { 0x1.9d8768p-1, 0x1.edeeeep-2 }, + { 0x1.9f71cap-1, 0x1.e6d700p-2 }, + { 0x1.a1551ap-1, 0x1.dfca26p-2 }, + { 0x1.a33162p-1, 0x1.d8c8aap-2 }, + { 0x1.a506b0p-1, 0x1.d1d2d0p-2 }, + { 0x1.a6d50cp-1, 0x1.cae8dap-2 }, + { 0x1.a89c86p-1, 0x1.c40b08p-2 }, + { 0x1.aa5d26p-1, 0x1.bd3998p-2 }, + { 0x1.ac16fcp-1, 0x1.b674c8p-2 }, + { 0x1.adca14p-1, 0x1.afbcd4p-2 }, + { 0x1.af767ap-1, 0x1.a911f0p-2 }, + { 0x1.b11c3cp-1, 0x1.a27456p-2 }, + { 0x1.b2bb68p-1, 0x1.9be438p-2 }, + { 0x1.b4540ap-1, 0x1.9561c8p-2 }, + { 0x1.b5e630p-1, 0x1.8eed36p-2 }, + { 0x1.b771e8p-1, 0x1.8886b2p-2 }, + { 0x1.b8f742p-1, 0x1.822e66p-2 }, + { 0x1.ba764ap-1, 0x1.7be47ap-2 }, + { 0x1.bbef10p-1, 0x1.75a91ap-2 }, + { 0x1.bd61a2p-1, 0x1.6f7c6ap-2 }, + { 0x1.bece0ep-1, 0x1.695e8cp-2 }, + { 0x1.c03464p-1, 0x1.634fa6p-2 }, + { 0x1.c194b2p-1, 0x1.5d4fd4p-2 }, + { 0x1.c2ef08p-1, 0x1.575f34p-2 }, + { 0x1.c44376p-1, 0x1.517de6p-2 }, + { 0x1.c5920ap-1, 0x1.4bac00p-2 }, + { 0x1.c6dad2p-1, 0x1.45e99cp-2 }, + { 0x1.c81de2p-1, 0x1.4036d0p-2 }, + { 0x1.c95b46p-1, 0x1.3a93b2p-2 }, + { 0x1.ca930ep-1, 0x1.350052p-2 }, + { 0x1.cbc54cp-1, 0x1.2f7cc4p-2 }, + { 0x1.ccf20cp-1, 0x1.2a0916p-2 }, + { 0x1.ce1962p-1, 0x1.24a554p-2 }, + { 0x1.cf3b5cp-1, 0x1.1f518ap-2 }, + { 0x1.d0580cp-1, 0x1.1a0dc6p-2 }, + { 0x1.d16f7ep-1, 0x1.14da0ap-2 }, + { 0x1.d281c4p-1, 0x1.0fb662p-2 }, + { 0x1.d38ef0p-1, 0x1.0aa2d0p-2 }, + { 0x1.d49710p-1, 0x1.059f5ap-2 }, + { 0x1.d59a34p-1, 0x1.00ac00p-2 }, + { 0x1.d6986cp-1, 0x1.f79184p-3 }, + { 0x1.d791cap-1, 0x1.edeb40p-3 }, + { 0x1.d8865ep-1, 0x1.e46530p-3 }, + { 0x1.d97636p-1, 0x1.daff4ap-3 }, + { 0x1.da6162p-1, 0x1.d1b982p-3 }, + { 0x1.db47f4p-1, 0x1.c893cep-3 }, + { 0x1.dc29fcp-1, 0x1.bf8e1cp-3 }, + { 0x1.dd0788p-1, 0x1.b6a856p-3 }, + { 0x1.dde0aap-1, 0x1.ade26cp-3 }, + { 0x1.deb570p-1, 0x1.a53c42p-3 }, + { 0x1.df85eap-1, 0x1.9cb5bep-3 }, + { 0x1.e0522ap-1, 0x1.944ec2p-3 }, + { 0x1.e11a3ep-1, 0x1.8c0732p-3 }, + { 0x1.e1de36p-1, 0x1.83deeap-3 }, + { 0x1.e29e22p-1, 0x1.7bd5c8p-3 }, + { 0x1.e35a12p-1, 0x1.73eba4p-3 }, + { 0x1.e41214p-1, 0x1.6c2056p-3 }, + { 0x1.e4c638p-1, 0x1.6473b6p-3 }, + { 0x1.e5768cp-1, 0x1.5ce596p-3 }, + { 0x1.e62322p-1, 0x1.5575c8p-3 }, + { 0x1.e6cc08p-1, 0x1.4e241ep-3 }, + { 0x1.e7714ap-1, 0x1.46f066p-3 }, + { 0x1.e812fcp-1, 0x1.3fda6cp-3 }, + { 0x1.e8b12ap-1, 0x1.38e1fap-3 }, + { 0x1.e94be4p-1, 0x1.3206dcp-3 }, + { 0x1.e9e336p-1, 0x1.2b48dap-3 }, + { 0x1.ea7730p-1, 0x1.24a7b8p-3 }, + { 0x1.eb07e2p-1, 0x1.1e233ep-3 }, + { 0x1.eb9558p-1, 0x1.17bb2cp-3 }, + { 0x1.ec1fa2p-1, 0x1.116f48p-3 }, + { 0x1.eca6ccp-1, 0x1.0b3f52p-3 }, + { 0x1.ed2ae6p-1, 0x1.052b0cp-3 }, + { 0x1.edabfcp-1, 0x1.fe6460p-4 }, + { 0x1.ee2a1ep-1, 0x1.f2a902p-4 }, + { 0x1.eea556p-1, 0x1.e72372p-4 }, + { 0x1.ef1db4p-1, 0x1.dbd32ap-4 }, + { 0x1.ef9344p-1, 0x1.d0b7a0p-4 }, + { 0x1.f00614p-1, 
0x1.c5d04ap-4 }, + { 0x1.f07630p-1, 0x1.bb1c98p-4 }, + { 0x1.f0e3a6p-1, 0x1.b09bfcp-4 }, + { 0x1.f14e82p-1, 0x1.a64de6p-4 }, + { 0x1.f1b6d0p-1, 0x1.9c31c6p-4 }, + { 0x1.f21ca0p-1, 0x1.92470ap-4 }, + { 0x1.f27ff8p-1, 0x1.888d1ep-4 }, + { 0x1.f2e0eap-1, 0x1.7f036cp-4 }, + { 0x1.f33f7ep-1, 0x1.75a960p-4 }, + { 0x1.f39bc2p-1, 0x1.6c7e64p-4 }, + { 0x1.f3f5c2p-1, 0x1.6381e2p-4 }, + { 0x1.f44d88p-1, 0x1.5ab342p-4 }, + { 0x1.f4a31ep-1, 0x1.5211ecp-4 }, + { 0x1.f4f694p-1, 0x1.499d48p-4 }, + { 0x1.f547f2p-1, 0x1.4154bcp-4 }, + { 0x1.f59742p-1, 0x1.3937b2p-4 }, + { 0x1.f5e490p-1, 0x1.31458ep-4 }, + { 0x1.f62fe8p-1, 0x1.297dbap-4 }, + { 0x1.f67952p-1, 0x1.21df9ap-4 }, + { 0x1.f6c0dcp-1, 0x1.1a6a96p-4 }, + { 0x1.f7068cp-1, 0x1.131e14p-4 }, + { 0x1.f74a6ep-1, 0x1.0bf97ep-4 }, + { 0x1.f78c8cp-1, 0x1.04fc3ap-4 }, + { 0x1.f7cceep-1, 0x1.fc4b5ep-5 }, + { 0x1.f80ba2p-1, 0x1.eeea8cp-5 }, + { 0x1.f848acp-1, 0x1.e1d4d0p-5 }, + { 0x1.f8841ap-1, 0x1.d508fap-5 }, + { 0x1.f8bdf2p-1, 0x1.c885e0p-5 }, + { 0x1.f8f63ep-1, 0x1.bc4a54p-5 }, + { 0x1.f92d08p-1, 0x1.b05530p-5 }, + { 0x1.f96256p-1, 0x1.a4a54ap-5 }, + { 0x1.f99634p-1, 0x1.99397ap-5 }, + { 0x1.f9c8a8p-1, 0x1.8e109cp-5 }, + { 0x1.f9f9bap-1, 0x1.83298ep-5 }, + { 0x1.fa2974p-1, 0x1.78832cp-5 }, + { 0x1.fa57dep-1, 0x1.6e1c58p-5 }, + { 0x1.fa84fep-1, 0x1.63f3f6p-5 }, + { 0x1.fab0dep-1, 0x1.5a08e8p-5 }, + { 0x1.fadb84p-1, 0x1.505a18p-5 }, + { 0x1.fb04f6p-1, 0x1.46e66cp-5 }, + { 0x1.fb2d40p-1, 0x1.3dacd2p-5 }, + { 0x1.fb5464p-1, 0x1.34ac36p-5 }, + { 0x1.fb7a6cp-1, 0x1.2be38cp-5 }, + { 0x1.fb9f60p-1, 0x1.2351c2p-5 }, + { 0x1.fbc344p-1, 0x1.1af5d2p-5 }, + { 0x1.fbe61ep-1, 0x1.12ceb4p-5 }, + { 0x1.fc07fap-1, 0x1.0adb60p-5 }, + { 0x1.fc28d8p-1, 0x1.031ad6p-5 }, + { 0x1.fc48c2p-1, 0x1.f7182ap-6 }, + { 0x1.fc67bcp-1, 0x1.e85c44p-6 }, + { 0x1.fc85d0p-1, 0x1.da0006p-6 }, + { 0x1.fca2fep-1, 0x1.cc0180p-6 }, + { 0x1.fcbf52p-1, 0x1.be5ecep-6 }, + { 0x1.fcdaccp-1, 0x1.b1160ap-6 }, + { 0x1.fcf576p-1, 0x1.a4255ap-6 }, + { 0x1.fd0f54p-1, 0x1.978ae8p-6 }, + { 0x1.fd286ap-1, 0x1.8b44e6p-6 }, + { 0x1.fd40bep-1, 0x1.7f5188p-6 }, + { 0x1.fd5856p-1, 0x1.73af0cp-6 }, + { 0x1.fd6f34p-1, 0x1.685bb6p-6 }, + { 0x1.fd8562p-1, 0x1.5d55ccp-6 }, + { 0x1.fd9ae2p-1, 0x1.529b9ep-6 }, + { 0x1.fdafb8p-1, 0x1.482b84p-6 }, + { 0x1.fdc3e8p-1, 0x1.3e03d8p-6 }, + { 0x1.fdd77ap-1, 0x1.3422fep-6 }, + { 0x1.fdea6ep-1, 0x1.2a875cp-6 }, + { 0x1.fdfcccp-1, 0x1.212f62p-6 }, + { 0x1.fe0e96p-1, 0x1.181984p-6 }, + { 0x1.fe1fd0p-1, 0x1.0f443ep-6 }, + { 0x1.fe3080p-1, 0x1.06ae14p-6 }, + { 0x1.fe40a6p-1, 0x1.fcab14p-7 }, + { 0x1.fe504cp-1, 0x1.ec7262p-7 }, + { 0x1.fe5f70p-1, 0x1.dcaf36p-7 }, + { 0x1.fe6e18p-1, 0x1.cd5ecap-7 }, + { 0x1.fe7c46p-1, 0x1.be7e5ap-7 }, + { 0x1.fe8a00p-1, 0x1.b00b38p-7 }, + { 0x1.fe9748p-1, 0x1.a202bep-7 }, + { 0x1.fea422p-1, 0x1.94624ep-7 }, + { 0x1.feb090p-1, 0x1.87275ep-7 }, + { 0x1.febc96p-1, 0x1.7a4f6ap-7 }, + { 0x1.fec836p-1, 0x1.6dd7fep-7 }, + { 0x1.fed374p-1, 0x1.61beaep-7 }, + { 0x1.fede52p-1, 0x1.56011cp-7 }, + { 0x1.fee8d4p-1, 0x1.4a9cf6p-7 }, + { 0x1.fef2fep-1, 0x1.3f8ff6p-7 }, + { 0x1.fefccep-1, 0x1.34d7dcp-7 }, + { 0x1.ff064cp-1, 0x1.2a727ap-7 }, + { 0x1.ff0f76p-1, 0x1.205dacp-7 }, + { 0x1.ff1852p-1, 0x1.169756p-7 }, + { 0x1.ff20e0p-1, 0x1.0d1d6ap-7 }, + { 0x1.ff2924p-1, 0x1.03ede2p-7 }, + { 0x1.ff3120p-1, 0x1.f60d8ap-8 }, + { 0x1.ff38d6p-1, 0x1.e4cc4ap-8 }, + { 0x1.ff4048p-1, 0x1.d4143ap-8 }, + { 0x1.ff4778p-1, 0x1.c3e1a6p-8 }, + { 0x1.ff4e68p-1, 0x1.b430ecp-8 }, + { 0x1.ff551ap-1, 0x1.a4fe84p-8 }, + { 0x1.ff5b90p-1, 0x1.9646f4p-8 }, + { 0x1.ff61ccp-1, 0x1.8806d8p-8 }, + { 
0x1.ff67d0p-1, 0x1.7a3adep-8 }, + { 0x1.ff6d9ep-1, 0x1.6cdfccp-8 }, + { 0x1.ff7338p-1, 0x1.5ff276p-8 }, + { 0x1.ff789ep-1, 0x1.536fc2p-8 }, + { 0x1.ff7dd4p-1, 0x1.4754acp-8 }, + { 0x1.ff82dap-1, 0x1.3b9e40p-8 }, + { 0x1.ff87b2p-1, 0x1.30499cp-8 }, + { 0x1.ff8c5cp-1, 0x1.2553eep-8 }, + { 0x1.ff90dcp-1, 0x1.1aba78p-8 }, + { 0x1.ff9532p-1, 0x1.107a8cp-8 }, + { 0x1.ff9960p-1, 0x1.06918cp-8 }, + { 0x1.ff9d68p-1, 0x1.f9f9d0p-9 }, + { 0x1.ffa14ap-1, 0x1.e77448p-9 }, + { 0x1.ffa506p-1, 0x1.d58da6p-9 }, + { 0x1.ffa8a0p-1, 0x1.c4412cp-9 }, + { 0x1.ffac18p-1, 0x1.b38a3ap-9 }, + { 0x1.ffaf6ep-1, 0x1.a36454p-9 }, + { 0x1.ffb2a6p-1, 0x1.93cb12p-9 }, + { 0x1.ffb5bep-1, 0x1.84ba30p-9 }, + { 0x1.ffb8b8p-1, 0x1.762d84p-9 }, + { 0x1.ffbb98p-1, 0x1.682100p-9 }, + { 0x1.ffbe5ap-1, 0x1.5a90b0p-9 }, + { 0x1.ffc102p-1, 0x1.4d78bcp-9 }, + { 0x1.ffc390p-1, 0x1.40d564p-9 }, + { 0x1.ffc606p-1, 0x1.34a306p-9 }, + { 0x1.ffc862p-1, 0x1.28de12p-9 }, + { 0x1.ffcaa8p-1, 0x1.1d8318p-9 }, + { 0x1.ffccd8p-1, 0x1.128ebap-9 }, + { 0x1.ffcef4p-1, 0x1.07fdb4p-9 }, + { 0x1.ffd0fap-1, 0x1.fb99b8p-10 }, + { 0x1.ffd2eap-1, 0x1.e7f232p-10 }, + { 0x1.ffd4cap-1, 0x1.d4fed8p-10 }, + { 0x1.ffd696p-1, 0x1.c2b9d0p-10 }, + { 0x1.ffd84ep-1, 0x1.b11d70p-10 }, + { 0x1.ffd9f8p-1, 0x1.a02436p-10 }, + { 0x1.ffdb90p-1, 0x1.8fc8c8p-10 }, + { 0x1.ffdd18p-1, 0x1.8005f0p-10 }, + { 0x1.ffde90p-1, 0x1.70d6a4p-10 }, + { 0x1.ffdffap-1, 0x1.6235fcp-10 }, + { 0x1.ffe154p-1, 0x1.541f34p-10 }, + { 0x1.ffe2a2p-1, 0x1.468daep-10 }, + { 0x1.ffe3e2p-1, 0x1.397ceep-10 }, + { 0x1.ffe514p-1, 0x1.2ce898p-10 }, + { 0x1.ffe63cp-1, 0x1.20cc76p-10 }, + { 0x1.ffe756p-1, 0x1.15246ep-10 }, + { 0x1.ffe866p-1, 0x1.09ec86p-10 }, + { 0x1.ffe96ap-1, 0x1.fe41cep-11 }, + { 0x1.ffea64p-1, 0x1.e97ba4p-11 }, + { 0x1.ffeb54p-1, 0x1.d57f52p-11 }, + { 0x1.ffec3ap-1, 0x1.c245d4p-11 }, + { 0x1.ffed16p-1, 0x1.afc85ep-11 }, + { 0x1.ffedeap-1, 0x1.9e0058p-11 }, + { 0x1.ffeeb4p-1, 0x1.8ce75ep-11 }, + { 0x1.ffef76p-1, 0x1.7c7744p-11 }, + { 0x1.fff032p-1, 0x1.6caa0ep-11 }, + { 0x1.fff0e4p-1, 0x1.5d79ecp-11 }, + { 0x1.fff18ep-1, 0x1.4ee142p-11 }, + { 0x1.fff232p-1, 0x1.40daa4p-11 }, + { 0x1.fff2d0p-1, 0x1.3360ccp-11 }, + { 0x1.fff366p-1, 0x1.266ea8p-11 }, + { 0x1.fff3f6p-1, 0x1.19ff46p-11 }, + { 0x1.fff480p-1, 0x1.0e0de8p-11 }, + { 0x1.fff504p-1, 0x1.0295f0p-11 }, + { 0x1.fff582p-1, 0x1.ef25d4p-12 }, + { 0x1.fff5fcp-1, 0x1.da0110p-12 }, + { 0x1.fff670p-1, 0x1.c5b542p-12 }, + { 0x1.fff6dep-1, 0x1.b23a5ap-12 }, + { 0x1.fff74ap-1, 0x1.9f8894p-12 }, + { 0x1.fff7aep-1, 0x1.8d986ap-12 }, + { 0x1.fff810p-1, 0x1.7c629ap-12 }, + { 0x1.fff86cp-1, 0x1.6be022p-12 }, + { 0x1.fff8c6p-1, 0x1.5c0a38p-12 }, + { 0x1.fff91cp-1, 0x1.4cda54p-12 }, + { 0x1.fff96cp-1, 0x1.3e4a24p-12 }, + { 0x1.fff9bap-1, 0x1.305390p-12 }, + { 0x1.fffa04p-1, 0x1.22f0b4p-12 }, + { 0x1.fffa4cp-1, 0x1.161be4p-12 }, + { 0x1.fffa90p-1, 0x1.09cfa4p-12 }, + { 0x1.fffad0p-1, 0x1.fc0d56p-13 }, + { 0x1.fffb0ep-1, 0x1.e577bcp-13 }, + { 0x1.fffb4ap-1, 0x1.cfd4a6p-13 }, + { 0x1.fffb82p-1, 0x1.bb1a96p-13 }, + { 0x1.fffbb8p-1, 0x1.a74068p-13 }, + { 0x1.fffbecp-1, 0x1.943d4ap-13 }, + { 0x1.fffc1ep-1, 0x1.8208bcp-13 }, + { 0x1.fffc4ep-1, 0x1.709a8ep-13 }, + { 0x1.fffc7ap-1, 0x1.5feadap-13 }, + { 0x1.fffca6p-1, 0x1.4ff208p-13 }, + { 0x1.fffccep-1, 0x1.40a8c2p-13 }, + { 0x1.fffcf6p-1, 0x1.3207fcp-13 }, + { 0x1.fffd1ap-1, 0x1.2408eap-13 }, + { 0x1.fffd3ep-1, 0x1.16a502p-13 }, + { 0x1.fffd60p-1, 0x1.09d5f8p-13 }, + { 0x1.fffd80p-1, 0x1.fb2b7ap-14 }, + { 0x1.fffda0p-1, 0x1.e3bcf4p-14 }, + { 0x1.fffdbep-1, 0x1.cd5528p-14 }, + { 0x1.fffddap-1, 0x1.b7e946p-14 
}, + { 0x1.fffdf4p-1, 0x1.a36eecp-14 }, + { 0x1.fffe0ep-1, 0x1.8fdc1cp-14 }, + { 0x1.fffe26p-1, 0x1.7d2738p-14 }, + { 0x1.fffe3ep-1, 0x1.6b4702p-14 }, + { 0x1.fffe54p-1, 0x1.5a329cp-14 }, + { 0x1.fffe68p-1, 0x1.49e178p-14 }, + { 0x1.fffe7ep-1, 0x1.3a4b60p-14 }, + { 0x1.fffe90p-1, 0x1.2b6876p-14 }, + { 0x1.fffea2p-1, 0x1.1d3120p-14 }, + { 0x1.fffeb4p-1, 0x1.0f9e1cp-14 }, + { 0x1.fffec4p-1, 0x1.02a868p-14 }, + { 0x1.fffed4p-1, 0x1.ec929ap-15 }, + { 0x1.fffee4p-1, 0x1.d4f4b4p-15 }, + { 0x1.fffef2p-1, 0x1.be6abcp-15 }, + { 0x1.ffff00p-1, 0x1.a8e8ccp-15 }, + { 0x1.ffff0cp-1, 0x1.94637ep-15 }, + { 0x1.ffff18p-1, 0x1.80cfdcp-15 }, + { 0x1.ffff24p-1, 0x1.6e2368p-15 }, + { 0x1.ffff30p-1, 0x1.5c540cp-15 }, + { 0x1.ffff3ap-1, 0x1.4b581cp-15 }, + { 0x1.ffff44p-1, 0x1.3b2652p-15 }, + { 0x1.ffff4ep-1, 0x1.2bb5ccp-15 }, + { 0x1.ffff56p-1, 0x1.1cfe02p-15 }, + { 0x1.ffff60p-1, 0x1.0ef6c4p-15 }, + { 0x1.ffff68p-1, 0x1.019842p-15 }, + { 0x1.ffff70p-1, 0x1.e9b5e8p-16 }, + { 0x1.ffff78p-1, 0x1.d16f58p-16 }, + { 0x1.ffff7ep-1, 0x1.ba4f04p-16 }, + { 0x1.ffff84p-1, 0x1.a447b8p-16 }, + { 0x1.ffff8cp-1, 0x1.8f4cccp-16 }, + { 0x1.ffff92p-1, 0x1.7b5224p-16 }, + { 0x1.ffff98p-1, 0x1.684c22p-16 }, + { 0x1.ffff9cp-1, 0x1.562facp-16 }, + { 0x1.ffffa2p-1, 0x1.44f21ep-16 }, + { 0x1.ffffa6p-1, 0x1.34894ap-16 }, + { 0x1.ffffacp-1, 0x1.24eb72p-16 }, + { 0x1.ffffb0p-1, 0x1.160f44p-16 }, + { 0x1.ffffb4p-1, 0x1.07ebd2p-16 }, + { 0x1.ffffb8p-1, 0x1.f4f12ep-17 }, + { 0x1.ffffbcp-1, 0x1.db5ad0p-17 }, + { 0x1.ffffc0p-1, 0x1.c304f0p-17 }, + { 0x1.ffffc4p-1, 0x1.abe09ep-17 }, + { 0x1.ffffc6p-1, 0x1.95df98p-17 }, + { 0x1.ffffcap-1, 0x1.80f43ap-17 }, + { 0x1.ffffccp-1, 0x1.6d1178p-17 }, + { 0x1.ffffd0p-1, 0x1.5a2ae0p-17 }, + { 0x1.ffffd2p-1, 0x1.483488p-17 }, + { 0x1.ffffd4p-1, 0x1.372310p-17 }, + { 0x1.ffffd6p-1, 0x1.26eb9ep-17 }, + { 0x1.ffffd8p-1, 0x1.1783cep-17 }, + { 0x1.ffffdcp-1, 0x1.08e1bap-17 }, + { 0x1.ffffdep-1, 0x1.f5f7d8p-18 }, + { 0x1.ffffdep-1, 0x1.db92b6p-18 }, + { 0x1.ffffe0p-1, 0x1.c282cep-18 }, + { 0x1.ffffe2p-1, 0x1.aab7acp-18 }, + { 0x1.ffffe4p-1, 0x1.94219cp-18 }, + { 0x1.ffffe6p-1, 0x1.7eb1a2p-18 }, + { 0x1.ffffe8p-1, 0x1.6a5972p-18 }, + { 0x1.ffffe8p-1, 0x1.570b6ap-18 }, + { 0x1.ffffeap-1, 0x1.44ba86p-18 }, + { 0x1.ffffeap-1, 0x1.335a62p-18 }, + { 0x1.ffffecp-1, 0x1.22df2ap-18 }, + { 0x1.ffffeep-1, 0x1.133d96p-18 }, + { 0x1.ffffeep-1, 0x1.046aeap-18 }, + { 0x1.fffff0p-1, 0x1.ecb9d0p-19 }, + { 0x1.fffff0p-1, 0x1.d21398p-19 }, + { 0x1.fffff2p-1, 0x1.b8d094p-19 }, + { 0x1.fffff2p-1, 0x1.a0df10p-19 }, + { 0x1.fffff2p-1, 0x1.8a2e26p-19 }, + { 0x1.fffff4p-1, 0x1.74adc8p-19 }, + { 0x1.fffff4p-1, 0x1.604ea8p-19 }, + { 0x1.fffff4p-1, 0x1.4d0232p-19 }, + { 0x1.fffff6p-1, 0x1.3aba86p-19 }, + { 0x1.fffff6p-1, 0x1.296a70p-19 }, + { 0x1.fffff6p-1, 0x1.190562p-19 }, + { 0x1.fffff8p-1, 0x1.097f62p-19 }, + { 0x1.fffff8p-1, 0x1.f59a20p-20 }, + { 0x1.fffff8p-1, 0x1.d9c736p-20 }, + { 0x1.fffff8p-1, 0x1.bf716cp-20 }, + { 0x1.fffffap-1, 0x1.a6852cp-20 }, + { 0x1.fffffap-1, 0x1.8eefd8p-20 }, + { 0x1.fffffap-1, 0x1.789fb8p-20 }, + { 0x1.fffffap-1, 0x1.6383f8p-20 }, + { 0x1.fffffap-1, 0x1.4f8c96p-20 }, + { 0x1.fffffap-1, 0x1.3caa62p-20 }, + { 0x1.fffffcp-1, 0x1.2acee2p-20 }, + { 0x1.fffffcp-1, 0x1.19ec60p-20 }, + { 0x1.fffffcp-1, 0x1.09f5d0p-20 }, + { 0x1.fffffcp-1, 0x1.f5bd96p-21 }, + { 0x1.fffffcp-1, 0x1.d9371ep-21 }, + { 0x1.fffffcp-1, 0x1.be41dep-21 }, + { 0x1.fffffcp-1, 0x1.a4c89ep-21 }, + { 0x1.fffffcp-1, 0x1.8cb738p-21 }, + { 0x1.fffffep-1, 0x1.75fa8ep-21 }, + { 0x1.fffffep-1, 0x1.608078p-21 }, + { 0x1.fffffep-1, 0x1.4c37c0p-21 }, 
+ { 0x1.fffffep-1, 0x1.39100ep-21 }, + { 0x1.fffffep-1, 0x1.26f9e0p-21 }, + { 0x1.fffffep-1, 0x1.15e682p-21 }, + { 0x1.fffffep-1, 0x1.05c804p-21 }, + { 0x1.fffffep-1, 0x1.ed2254p-22 }, + { 0x1.fffffep-1, 0x1.d06ad6p-22 }, + { 0x1.fffffep-1, 0x1.b551c8p-22 }, + { 0x1.fffffep-1, 0x1.9bc0a0p-22 }, + { 0x1.fffffep-1, 0x1.83a200p-22 }, + { 0x1.fffffep-1, 0x1.6ce1aap-22 }, + { 0x1.fffffep-1, 0x1.576c72p-22 }, + { 0x1.fffffep-1, 0x1.43302cp-22 }, + { 0x1.fffffep-1, 0x1.301ba2p-22 }, + { 0x1.fffffep-1, 0x1.1e1e86p-22 }, + { 0x1.fffffep-1, 0x1.0d2966p-22 }, + { 0x1.000000p+0, 0x1.fa5b50p-23 }, + { 0x1.000000p+0, 0x1.dc3ae4p-23 }, + { 0x1.000000p+0, 0x1.bfd756p-23 }, + { 0x1.000000p+0, 0x1.a517dap-23 }, + { 0x1.000000p+0, 0x1.8be4f8p-23 }, + { 0x1.000000p+0, 0x1.74287ep-23 }, + { 0x1.000000p+0, 0x1.5dcd66p-23 }, + { 0x1.000000p+0, 0x1.48bfd4p-23 }, + { 0x1.000000p+0, 0x1.34ecf8p-23 }, + { 0x1.000000p+0, 0x1.224310p-23 }, + { 0x1.000000p+0, 0x1.10b148p-23 }, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/erfinv_24u5.c b/contrib/arm-optimized-routines/pl/math/erfinv_24u5.c new file mode 100644 index 000000000000..20e1e361befc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfinv_24u5.c @@ -0,0 +1,81 @@ +/* + * Double-precision inverse error function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "pl_sig.h" +#define IGNORE_SCALAR_FENV +#include "pl_test.h" + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. */ + double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10]; +} data = { + .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7, + -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3, + 0x1.689181bbafd0cp-3 }, + .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7, + -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4, + 0x1p+0 }, + .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1, + 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4, + -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 }, + .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1, + 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4, + -0x1.4075c56404eecp+3, 0x1p+0 }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2, + 0x1p+0 } +}; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7 + Largest observed error is 24.46 ULP, in the extreme tail: + erfinv(0x1.fd9504351b757p-1) got 0x1.ff72c1092917p+0 + want 0x1.ff72c10929158p+0. 
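A minimal round-trip sanity check (a sketch only; it assumes a correctly rounded system erf, and the 1e-13 tolerance is deliberately loose next to the quoted worst-case ULP error):

    double x = 0.5;
    double y = erfinv (x);                 /* ~0.476936276204470 */
    assert (fabs (erf (y) - x) < 1e-13);   /* round trip through erf */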
*/ +double +erfinv (double x) +{ + double a = fabs (x); + + if (a <= 0.75) + { + /* Largest observed error in this region is 6.06 ULP: + erfinv(0x1.1884650fd2d41p-2) got 0x1.fb65998cbd3fep-3 + want 0x1.fb65998cbd404p-3. */ + double t = x * x - 0.5625; + return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17); + } + + if (a <= 0.9375) + { + /* Largest observed error in this region is 6.95 ULP: + erfinv(0x1.a8d65b94d8c6p-1) got 0x1.f08325591b54p-1 + want 0x1.f08325591b547p-1. */ + double t = x * x - 0.87890625; + return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37); + } + + double t = 1.0 / (sqrt (-log (1 - a))); + return horner_8_f64 (t, data.P_57) + / (copysign (t, x) * horner_9_f64 (t, data.Q_57)); +} + +PL_SIG (S, D, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (erfinv, 24.0) +PL_TEST_INTERVAL (erfinv, 0, 1, 40000) +PL_TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c b/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c new file mode 100644 index 000000000000..40736da08be8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c @@ -0,0 +1,74 @@ +/* + * Single-precision inverse error function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. */ + float P_10[3], Q_10[4], P_29[4], Q_29[4], P_50[6], Q_50[3]; +} data = { .P_10 = { -0x1.a31268p+3, 0x1.ac9048p+4, -0x1.293ff6p+3 }, + .Q_10 = { -0x1.8265eep+3, 0x1.ef5eaep+4, -0x1.12665p+4, 0x1p+0 }, + .P_29 + = { -0x1.fc0252p-4, 0x1.119d44p+0, -0x1.f59ee2p+0, 0x1.b13626p-2 }, + .Q_29 = { -0x1.69952p-4, 0x1.c7b7d2p-1, -0x1.167d7p+1, 0x1p+0 }, + .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1, + -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 }, + .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0, 0x1p+0 } }; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7 + Largest error is 4.71 ULP, in the tail region: + erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0 + want 0x1.b83274p+0. */ +float +erfinvf (float x) +{ + if (x == 1.0f) + return __math_oflowf (0); + if (x == -1.0f) + return __math_oflowf (1); + + float a = fabsf (x); + if (a > 1.0f) + return __math_invalidf (x); + + if (a <= 0.75f) + { + /* Greatest error in this region is 4.60 ULP: + erfinvf(0x1.0a98bap-5) got 0x1.d8a93ep-6 + want 0x1.d8a948p-6. */ + float t = x * x - 0.5625f; + return x * horner_2_f32 (t, data.P_10) / horner_3_f32 (t, data.Q_10); + } + if (a < 0.9375f) + { + /* Greatest error in this region is 3.79 ULP: + erfinvf(0x1.ac82d6p-1) got 0x1.f8fc54p-1 + want 0x1.f8fc5cp-1. */ + float t = x * x - 0.87890625f; + return x * horner_3_f32 (t, data.P_29) / horner_3_f32 (t, data.Q_29); + } + + /* Tail region, where error is greatest (and sensitive to sqrt and log1p + implementations. 
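The substitution is motivated by the asymptotic behaviour erfinv(x) ~ sqrt(-log(1 - x)) as x -> 1: with t = 1 / sqrt(-log1p(-a)), the tail a in [0.9375, 1) maps to a bounded interval in t on which a low-degree rational approximation is accurate, and copysignf (t, x) in the denominator restores the sign of the result.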
*/ + float t = 1.0 / sqrtf (-log1pf (-a)); + return horner_5_f32 (t, data.P_50) + / (copysignf (t, x) * horner_2_f32 (t, data.Q_50)); +} + +PL_SIG (S, F, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (erfinvf, 4.09) +PL_TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfinvl.c b/contrib/arm-optimized-routines/pl/math/erfinvl.c new file mode 100644 index 000000000000..ea4aadfccd00 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfinvl.c @@ -0,0 +1,114 @@ +/* + * Extended precision inverse error function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define _GNU_SOURCE +#include +#include +#include + +#include "math_config.h" +#include "poly_scalar_f64.h" + +#define SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p0l +#define HF_SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p-1l + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. */ + double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10]; +} data = { + .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7, + -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3, + 0x1.689181bbafd0cp-3 }, + .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7, + -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4, + 0x1p+0 }, + .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1, + 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4, + -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 }, + .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1, + 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4, + -0x1.4075c56404eecp+3, 0x1p+0 }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2, + 0x1p+0 } +}; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. */ +static inline double +__erfinv (double x) +{ + if (x == 1.0) + return __math_oflow (0); + if (x == -1.0) + return __math_oflow (1); + + double a = fabs (x); + if (a > 1) + return __math_invalid (x); + + if (a <= 0.75) + { + double t = x * x - 0.5625; + return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17); + } + + if (a <= 0.9375) + { + double t = x * x - 0.87890625; + return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37); + } + + double t = 1.0 / (sqrtl (-log1pl (-a))); + return horner_8_f64 (t, data.P_57) + / (copysign (t, x) * horner_9_f64 (t, data.Q_57)); +} + +/* Extended-precision variant, which uses the above (or asymptotic estimate) as + starting point for Newton refinement. This implementation is a port to C of + the version in the SpecialFunctions.jl Julia package, with relaxed stopping + criteria for the Newton refinement. 
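Since erf'(y) = (2 / sqrt(pi)) * exp(-y^2), the Newton step for f(y) = erf(y) - x is

    y <- y - (sqrt(pi) / 2) * (erf(y) - x) * exp(y^2),

which is exactly the dy computed in the loop below; HF_SQRT_PIl holds sqrt(pi)/2.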
*/ +long double +erfinvl (long double x) +{ + if (x == 0) + return 0; + + double yf = __erfinv (x); + long double y; + if (isfinite (yf)) + y = yf; + else + { + /* Double overflowed, use asymptotic estimate instead. */ + y = copysignl (sqrtl (-logl (1.0l - fabsl (x)) * SQRT_PIl), x); + if (!isfinite (y)) + return y; + } + + double eps = fabs (yf - nextafter (yf, 0)); + while (true) + { + long double dy = HF_SQRT_PIl * (erfl (y) - x) * exp (y * y); + y -= dy; + /* Stopping criterion is different to Julia implementation, but is enough + to ensure result is accurate when rounded to double-precision. */ + if (fabsl (dy) < eps) + break; + } + return y; +} diff --git a/contrib/arm-optimized-routines/pl/math/estrin.h b/contrib/arm-optimized-routines/pl/math/estrin.h deleted file mode 100644 index f967fb0475b0..000000000000 --- a/contrib/arm-optimized-routines/pl/math/estrin.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Helper macros for double-precision Estrin polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#if V_SUPPORTED -#define FMA v_fma_f64 -#else -#define FMA fma -#endif - -#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/estrin_wrap.h b/contrib/arm-optimized-routines/pl/math/estrin_wrap.h deleted file mode 100644 index 2ae07001f2cf..000000000000 --- a/contrib/arm-optimized-routines/pl/math/estrin_wrap.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Helper macros for double-precision Estrin polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -// clang-format off -#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i)) -#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i)) -#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i)) -#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) -#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) -#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) -#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) -#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) -#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, 
c, i)) - -#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0) -#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0) -#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0) -#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0) -#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0) -#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0) -#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0) -#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0) -#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0) -#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0) -#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0) -#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0) -#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0) -#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0) -#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0) -#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0) -#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0) -#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0) -#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0) -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/estrinf.h b/contrib/arm-optimized-routines/pl/math/estrinf.h deleted file mode 100644 index 175233c6c799..000000000000 --- a/contrib/arm-optimized-routines/pl/math/estrinf.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for single-precision Estrin polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f32 -#else -#define FMA fmaf -#endif - -#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/expf.c b/contrib/arm-optimized-routines/pl/math/expf.c index c325e45d5cc6..cd3cfa925c64 100644 --- a/contrib/arm-optimized-routines/pl/math/expf.c +++ b/contrib/arm-optimized-routines/pl/math/expf.c @@ -1,76 +1,76 @@ /* * Single-precision e^x function. * * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* EXPF_TABLE_BITS = 5 EXPF_POLY_ORDER = 3 ULP error: 0.502 (nearest rounding.) Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) Wrong count: 170635 (all nearest rounding wrong results with fma.) Non-nearest ULP error: 1 (rounded ULP error) */ #define N (1 << EXPF_TABLE_BITS) #define InvLn2N __expf_data.invln2_scaled #define T __expf_data.tab #define C __expf_data.poly_scaled static inline uint32_t top12 (float x) { return asuint (x) >> 20; } float optr_aor_exp_f32 (float x) { uint32_t abstop; uint64_t ki, t; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, xd, z, r, r2, y, s; xd = (double_t) x; abstop = top12 (x) & 0x7ff; if (unlikely (abstop >= top12 (88.0f))) { /* |x| >= 88 or x is nan. */ if (asuint (x) == asuint (-INFINITY)) return 0.0f; if (abstop >= top12 (INFINITY)) return x + x; if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ return __math_oflowf (0); if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ return __math_uflowf (0); } /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. 
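Here N = 1 << EXPF_TABLE_BITS = 32, so exp(x) = 2^(k/N) * 2^(r/N): the fractional part of the exponent comes from the table entry T[k % N], the integer part is folded into the exponent field by ki << (52 - EXPF_TABLE_BITS), and 2^(r/N) is approximated by the cubic polynomial in r below. For example, x = 1 gives z = 32/ln2 ~= 46.17, so k = 46, r ~= 0.17, and exp(1) = 2^(46/32) * 2^(0.17/32).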
*/ z = InvLn2N * xd; /* Round and convert z to int, the result is in [-150*N, 128*N] and ideally nearest int is used, otherwise the magnitude of r can be bigger which gives larger approximation error. */ - kd = roundtoint (z); - ki = converttoint (z); + kd = round (z); + ki = lround (z); r = z - kd; /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ t = T[ki % N]; t += ki << (52 - EXPF_TABLE_BITS); s = asdouble (t); z = C[0] * r + C[1]; r2 = r * r; y = C[2] * r + 1; y = z * r2 + y; y = y * s; return eval_as_float (y); } diff --git a/contrib/arm-optimized-routines/pl/math/expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c index a3faff70cb62..f7d431198614 100644 --- a/contrib/arm-optimized-routines/pl/math/expm1_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c @@ -1,86 +1,85 @@ /* * Double-precision e^x - 1 function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrin.h" +#include "poly_scalar_f64.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define InvLn2 0x1.71547652b82fep0 #define Ln2hi 0x1.62e42fefa39efp-1 #define Ln2lo 0x1.abc9e3b39803fp-56 #define Shift 0x1.8p52 -#define TinyBound \ - 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ -#define BigBound 0x1.63108c75a1937p+9 /* Above which expm1(x) overflows. */ -#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to -1. */ +/* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define TinyBound 0x3cc0000000000000 +/* Above which expm1(x) overflows. */ +#define BigBound 0x1.63108c75a1937p+9 +/* Below which expm1(x) rounds to -1. */ +#define NegBound -0x1.740bf7c0d927dp+9 #define AbsMask 0x7fffffffffffffff -#define C(i) __expm1_poly[i] - /* Approximation for exp(x) - 1 using polynomial on a reduced interval. The maximum observed error is 2.17 ULP: expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2 want 0x1.a9af566038788p-2. */ double expm1 (double x) { uint64_t ix = asuint64 (x); uint64_t ax = ix & AbsMask; /* Tiny, +Infinity. */ if (ax <= TinyBound || ix == 0x7ff0000000000000) return x; /* +/-NaN. */ if (ax > 0x7ff0000000000000) return __math_invalid (x); /* Result is too large to be represented as a double. */ if (x >= 0x1.63108c75a1937p+9) return __math_oflow (0); /* Result rounds to -1 in double precision. */ if (x <= NegBound) return -1; /* Reduce argument to smaller range: Let i = round(x / ln2) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ double j = fma (InvLn2, x, Shift) - Shift; int64_t i = j; double f = fma (j, -Ln2hi, x); f = fma (j, -Ln2lo, f); /* Approximate expm1(f) using polynomial. Taylor expansion for expm1(x) has the form: x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ double f2 = f * f; double f4 = f2 * f2; - double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f); /* Assemble the result, using a slight rearrangement to achieve acceptable accuracy. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^(i - 1). */ double t = ldexp (0.5, i); /* expm1(x) ~= 2 * (p * t + (t - 1/2)).
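A note on the reduction used by expm1 above: `fma (InvLn2, x, Shift) - Shift` rounds x/ln2 to the nearest integer without an explicit round call. Adding Shift = 0x1.8p52 pushes the value into a binade where one ULP is exactly 1, so the low bits are rounded away; subtracting Shift back leaves the rounded value as a double. A self-contained sketch, valid in round-to-nearest while |x/ln2| stays far below 2^51:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double InvLn2 = 0x1.71547652b82fep0;
  double Shift = 0x1.8p52;
  double x = 5.0;
  /* j ends up holding round(x/ln2), represented exactly.  */
  double j = fma (InvLn2, x, Shift) - Shift;
  printf ("j = %g, round (x/ln2) = %g\n", j, round (x * InvLn2));
  return 0;
}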
*/ return 2 * fma (p, t, t - 0.5); } PL_SIG (S, D, 1, expm1, -9.9, 9.9) PL_TEST_ULP (expm1, 1.68) -PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000) -PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000) +PL_TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000) PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c index 70b14e48519d..e12c9ba9a8a2 100644 --- a/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c +++ b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c @@ -1,80 +1,79 @@ /* * Single-precision e^x - 1 function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "hornerf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define Shift (0x1.8p23f) #define InvLn2 (0x1.715476p+0f) #define Ln2hi (0x1.62e4p-1f) #define Ln2lo (0x1.7f7d1cp-20f) #define AbsMask (0x7fffffff) #define InfLimit \ (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */ #define NegLimit \ (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to -1. */ -#define C(i) __expm1f_poly[i] - /* Approximation for exp(x) - 1 using polynomial on a reduced interval. The maximum error is 1.51 ULP: expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 want 0x1.e2fb94p-2. */ float expm1f (float x) { uint32_t ix = asuint (x); uint32_t ax = ix & AbsMask; /* Tiny: |x| < 0x1p-23. expm1(x) is closely approximated by x. Inf: x == +Inf => expm1(x) = x. */ if (ax <= 0x34000000 || (ix == 0x7f800000)) return x; /* +/-NaN. */ if (ax > 0x7f800000) return __math_invalidf (x); if (x >= InfLimit) return __math_oflowf (0); if (x <= NegLimit || ix == 0xff800000) return -1; /* Reduce argument to smaller range: Let i = round(x / ln2) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ float j = fmaf (InvLn2, x, Shift) - Shift; int32_t i = j; float f = fmaf (j, -Ln2hi, x); f = fmaf (j, -Ln2lo, f); /* Approximate expm1(f) using polynomial. Taylor expansion for expm1(x) has the form: x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float p = fmaf (f * f, HORNER_4 (f, C), f); + float p = fmaf (f * f, horner_4_f32 (f, __expm1f_poly), f); /* Assemble the result, using a slight rearrangement to achieve acceptable accuracy. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^(i - 1). */ float t = ldexpf (0.5f, i); /* expm1(x) ~= 2 * (p * t + (t - 1/2)).
*/ return 2 * fmaf (p, t, t - 0.5f); } PL_SIG (S, F, 1, expm1, -9.9, 9.9) PL_TEST_ULP (expm1f, 1.02) -PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000) PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) +PL_TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000) PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) +PL_TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/finite_pow.h b/contrib/arm-optimized-routines/pl/math/finite_pow.h new file mode 100644 index 000000000000..8944d4fae625 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/finite_pow.h @@ -0,0 +1,365 @@ +/* + * Double-precision x^y function. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Scalar version of pow used for fallbacks in vector implementations. */ + +/* Data is defined in v_pow_log_data.c. */ +#define N_LOG (1 << V_POW_LOG_TABLE_BITS) +#define Off 0x3fe6955500000000 +#define As __v_pow_log_data.poly + +/* Data is defined in v_pow_exp_data.c. */ +#define N_EXP (1 << V_POW_EXP_TABLE_BITS) +#define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +#define SmallExp 0x3c9 /* top12(0x1p-54). */ +#define BigExp 0x408 /* top12(512.0). */ +#define ThresExp 0x03f /* BigExp - SmallExp. */ +#define InvLn2N __v_pow_exp_data.n_over_ln2 +#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi +#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo +#define SBits __v_pow_exp_data.sbits +#define Cs __v_pow_exp_data.poly + +/* Constants associated with pow. */ +#define SmallPowX 0x001 /* top12(0x1p-126). */ +#define BigPowX 0x7ff /* top12(INFINITY). */ +#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */ +#define BigPowY 0x43e /* top12(0x1.749p62). */ +#define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline double +log_inline (uint64_t ix, double *tail) +{ + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64_t tmp = ix - Off; + int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1); + int k = (int64_t) tmp >> 52; /* arithmetic shift. */ + uint64_t iz = ix - (tmp & 0xfffULL << 52); + double z = asdouble (iz); + double kd = (double) k; + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + double invc = __v_pow_log_data.invc[i]; + double logc = __v_pow_log_data.logc[i]; + double logctail = __v_pow_log_data.logctail[i]; + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + double r = fma (z, invc, -1.0); + + /* k*Ln2 + log(c) + r. */ + double t1 = kd * __v_pow_log_data.ln2_hi + logc; + double t2 = t1 + r; + double lo1 = kd * __v_pow_log_data.ln2_lo + logctail; + double lo2 = t1 - t2 + r; + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double ar = As[0] * r; + double ar2 = r * ar; + double ar3 = r * ar2; + /* k*Ln2 + log(c) + r + A[0]*r*r. 
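The bit manipulation at the top of log_inline above may be easier to follow in isolation. Subtracting Off re-biases the exponent so that x = 2^k * z with z in [Off, 2*Off); k falls out of an arithmetic shift and the table index comes from the top fraction bits. A sketch under the same constants (the function name is illustrative; the table size matches V_POW_LOG_TABLE_BITS = 7 as defined later in math_config.h):

#include <stdint.h>
#include <string.h>

#define OFF 0x3fe6955500000000ULL
#define TABLE_BITS 7

/* Split x into 2^k * z with z in [OFF, 2*OFF) and a table index i.  */
static void
log_reduce (double x, double *z, int *k, int *i)
{
  uint64_t ix;
  memcpy (&ix, &x, 8);			/* asuint64 (x) */
  uint64_t tmp = ix - OFF;
  *i = (tmp >> (52 - TABLE_BITS)) & ((1 << TABLE_BITS) - 1);
  *k = (int64_t) tmp >> 52;		/* arithmetic shift */
  uint64_t iz = ix - (tmp & (0xfffULL << 52)); /* clear k from exponent */
  memcpy (z, &iz, 8);			/* asdouble (iz) */
}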
*/ + double hi = t2 + ar2; + double lo3 = fma (ar, r, -ar2); + double lo4 = t2 - hi + ar2; + /* p = log1p(r) - r - A[0]*r*r. */ + double p = (ar3 + * (As[1] + r * As[2] + + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6])))); + double lo = lo1 + lo2 + lo3 + lo4 + p; + double y = hi + lo; + *tail = hi - y + lo; + return y; +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +special_case (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + y = scale + scale * tmp; +#if WANT_SIMD_EXCEPT + if (fabs (y) < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double hi, lo, one = 1.0; + if (y < 0.0) + one = -1.0; + lo = scale - y + scale * tmp; + hi = one + y; + lo = one - hi + y + lo; + y = eval_as_double (hi + lo) - one; + /* Fix the sign of 0. */ + if (y == 0.0) + y = asdouble (sbits & 0x8000000000000000); + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } +#endif + y = 0x1p-1022 * y; + return check_uflow (eval_as_double (y)); +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline double +exp_inline (double x, double xtail, uint32_t sign_bias) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + if (abstop - SmallExp >= 0x80000000) + { + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return sign_bias ? -1.0 : 1.0; + } + if (abstop >= top12 (1024.0)) + { + /* Note: inf and nan are already handled. */ + /* Skip errno handling. */ +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (sign_bias) + : __math_oflow (sign_bias); +#else + double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY; + return sign_bias ? -res_uoflow : res_uoflow; +#endif + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. 
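The overflow branch of special_case above stages its scaling in two exact steps: 1009 is first subtracted from the exponent field so that sbits is again a representable double, then multiplied back in as 0x1p1009 so any overflow happens in one final, correctly-rounded operation. A reduced sketch of just that branch (illustrative name; the errno/check_oflow handling is omitted):

#include <stdint.h>
#include <string.h>

/* k > 0 path of special_case: scale's exponent may have overflowed by
   up to 460, so it cannot be reinterpreted as a double directly.  */
static double
pow_overflow_demo (uint64_t sbits, double tmp)
{
  sbits -= 1009ULL << 52;	/* now a representable exponent */
  double scale;
  memcpy (&scale, &sbits, 8);
  /* Exact power-of-two multiply; rounds to +/-Inf only at the end.  */
  return 0x1p1009 * (scale + scale * tmp);
}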
*/ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double (scale + scale * tmp); +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + A version of exp_inline that is not inlined and for which sign_bias is + equal to 0. */ +static double NOINLINE +exp_nosignbias (double x, double xtail) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + /* Avoid spurious underflow for tiny x. */ + if (abstop - SmallExp >= 0x80000000) + return 1.0; + /* Note: inf and nan are already handled. */ + if (abstop >= top12 (1024.0)) +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0); +#else + return asuint64 (x) >> 63 ? 0.0 : INFINITY; +#endif + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double (scale + scale * tmp); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +static double NOINLINE +__pl_finite_pow (double x, double y) +{ + uint32_t sign_bias = 0; + uint64_t ix, iy; + uint32_t topx, topy; + + ix = asuint64 (x); + iy = asuint64 (y); + topx = top12 (x); + topy = top12 (y); + if (unlikely (topx - SmallPowX >= ThresPowX + || (topy & 0x7ff) - SmallPowY >= ThresPowY)) + { + /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 + and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ + /* Special cases: (x < 0x1p-126 or inf or nan) or + (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? 
x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) + || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + { + x2 = -x2; + sign_bias = 1; + } +#if WANT_SIMD_EXCEPT + if (2 * ix == 0 && iy >> 63) + return __math_divzero (sign_bias); +#endif + /* Without the barrier some versions of clang hoist the 1/x2 and + thus division by zero exception can be signaled spuriously. */ + return iy >> 63 ? opt_barrier_double (1 / x2) : x2; + } + /* Here x and y are non-zero finite. */ + if (ix >> 63) + { + /* Finite x < 0. */ + int yint = checkint (iy); + if (yint == 0) +#if WANT_SIMD_EXCEPT + return __math_invalid (x); +#else + return __builtin_nan (""); +#endif + if (yint == 1) + sign_bias = SignBias; + ix &= 0x7fffffffffffffff; + topx &= 0x7ff; + } + if ((topy & 0x7ff) - SmallPowY >= ThresPowY) + { + /* Note: sign_bias == 0 here because y is not odd. */ + if (ix == asuint64 (1.0)) + return 1.0; + /* |y| < 2^-65, x^y ~= 1 + y*log(x). */ + if ((topy & 0x7ff) < SmallPowY) + return 1.0; +#if WANT_SIMD_EXCEPT + return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) + : __math_uflow (0); +#else + return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0; +#endif + } + if (topx == 0) + { + /* Normalize subnormal x so exponent becomes negative. */ + /* Without the barrier some versions of clang evaluate the mul + unconditionally causing spurious overflow exceptions. */ + ix = asuint64 (opt_barrier_double (x) * 0x1p52); + ix &= 0x7fffffffffffffff; + ix -= 52ULL << 52; + } + } + + double lo; + double hi = log_inline (ix, &lo); + double ehi = y * hi; + double elo = y * lo + fma (y, hi, -ehi); + return exp_inline (ehi, elo, sign_bias); +} diff --git a/contrib/arm-optimized-routines/pl/math/horner.h b/contrib/arm-optimized-routines/pl/math/horner.h deleted file mode 100644 index f92ab6752110..000000000000 --- a/contrib/arm-optimized-routines/pl/math/horner.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for single-precision Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f64 -#else -#define FMA fma -#endif - -#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/horner_wrap.h b/contrib/arm-optimized-routines/pl/math/horner_wrap.h deleted file mode 100644 index 6478968db913..000000000000 --- a/contrib/arm-optimized-routines/pl/math/horner_wrap.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Helper macros for Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited.
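The checkint helper shown earlier is the key to the sign logic for negative x just above: pow must know whether y is a non-integer (invalid), an odd integer (result negative) or an even integer (result positive), and it reads the answer straight from the bit pattern. An annotated mirror of the helper:

#include <stdint.h>
#include <string.h>

/* Classify y: 0 if not an integer, 1 if odd, 2 if even.  A double with
   unbiased exponent e has 52 - e fraction bits below the binary point.  */
static int
checkint_demo (double y)
{
  uint64_t iy;
  memcpy (&iy, &y, 8);
  int e = iy >> 52 & 0x7ff;
  if (e < 0x3ff)
    return 0;			/* 0 < |y| < 1: not an integer */
  if (e > 0x3ff + 52)
    return 2;			/* huge: all such values are even */
  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
    return 0;			/* fraction bits set below the point */
  if (iy & (1ULL << (0x3ff + 52 - e)))
    return 1;			/* lowest integer bit set: odd */
  return 2;
}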
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -// clang-format off -#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i)) -#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) -#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) -#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) -#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i)) -#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i)) -#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i)) -#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i)) -#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i)) -#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i)) -#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i)) -#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i)) - -#define HORNER_1(x, c) HORNER_1_ (x, c, 0) -#define HORNER_2(x, c) HORNER_2_ (x, c, 0) -#define HORNER_3(x, c) HORNER_3_ (x, c, 0) -#define HORNER_4(x, c) HORNER_4_ (x, c, 0) -#define HORNER_5(x, c) HORNER_5_ (x, c, 0) -#define HORNER_6(x, c) HORNER_6_ (x, c, 0) -#define HORNER_7(x, c) HORNER_7_ (x, c, 0) -#define HORNER_8(x, c) HORNER_8_ (x, c, 0) -#define HORNER_9(x, c) HORNER_9_ (x, c, 0) -#define HORNER_10(x, c) HORNER_10_(x, c, 0) -#define HORNER_11(x, c) HORNER_11_(x, c, 0) -#define HORNER_12(x, c) HORNER_12_(x, c, 0) -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/hornerf.h b/contrib/arm-optimized-routines/pl/math/hornerf.h deleted file mode 100644 index 0703817b0fbb..000000000000 --- a/contrib/arm-optimized-routines/pl/math/hornerf.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for double-precision Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f32 -#else -#define FMA fmaf -#endif - -#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/include/mathlib.h b/contrib/arm-optimized-routines/pl/math/include/mathlib.h index af5f9f9c6afb..f886e7f8c07a 100644 --- a/contrib/arm-optimized-routines/pl/math/include/mathlib.h +++ b/contrib/arm-optimized-routines/pl/math/include/mathlib.h @@ -1,244 +1,206 @@ -// clang-format off /* * Public API. * * Copyright (c) 2015-2023, Arm Limited. 
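The HORNER_* macros deleted above are replaced by horner_N_f32/horner_N_f64 functions from the new poly_scalar_*.h headers (as seen in the expm1f and log1pf changes in this import). What the degree-4 case expands to, sketched with an explicit coefficient array (illustrative name):

#include <math.h>

/* Degree-4 Horner evaluation: one FMA per coefficient in a single
   serial dependency chain - minimal operations, maximal latency.  */
static float
horner_4 (float x, const float c[5])
{
  float p = fmaf (c[4], x, c[3]);
  p = fmaf (p, x, c[2]);
  p = fmaf (p, x, c[1]);
  return fmaf (p, x, c[0]);
}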
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H #define _MATHLIB_H +float acosf (float); float acoshf (float); +float asinf (float); float asinhf (float); float atan2f (float, float); float atanf (float); float atanhf (float); float cbrtf (float); float coshf (float); +float cospif (float); float erfcf (float); float erff (float); +float erfinvf (float); +float exp10f (float); float expm1f (float); float log10f (float); float log1pf (float); float sinhf (float); +float sinpif (float); float tanf (float); float tanhf (float); +double acos (double); double acosh (double); +double asin (double); double asinh (double); double atan (double); double atan2 (double, double); double atanh (double); double cbrt (double); double cosh (double); +double cospi (double); double erfc (double); +double erfinv (double); +double exp10 (double); double expm1 (double); double log10 (double); double log1p (double); double sinh (double); +double sinpi (double); double tanh (double); -float __s_acoshf (float); -float __s_asinhf (float); -float __s_atanf (float); -float __s_atan2f (float, float); -float __s_atanhf (float); -float __s_cbrtf (float); -float __s_coshf (float); -float __s_erfcf (float); -float __s_erff (float); -float __s_expm1f (float); -float __s_log10f (float); -float __s_log1pf (float); -float __s_log2f (float); -float __s_sinhf (float); -float __s_tanf (float); -float __s_tanhf (float); - -double __s_acosh (double); -double __s_asinh (double); -double __s_atan (double); -double __s_atan2 (double, double); -double __s_atanh (double); -double __s_cbrt (double); -double __s_cosh (double); -double __s_erf (double); -double __s_erfc (double); -double __s_expm1 (double); -double __s_log10 (double); -double __s_log1p (double); -double __s_log2 (double); -double __s_sinh (double); -double __s_tan (double); -double __s_tanh (double); +long double cospil (long double); +long double erfinvl (long double); +long double exp10l (long double); +long double sinpil (long double); #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 -typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; -typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif +# elif __clang_major__ * 100 + __clang_minor__ >= 305 +typedef __attribute__ ((__neon_vector_type__ (4))) float __f32x4_t; +typedef __attribute__ ((__neon_vector_type__ (2))) double __f64x2_t; +# else +# error Unsupported compiler +# endif -/* Vector functions following the base PCS. 
*/ -__f32x4_t __v_acoshf (__f32x4_t); -__f64x2_t __v_acosh (__f64x2_t); -__f32x4_t __v_asinhf (__f32x4_t); -__f64x2_t __v_asinh (__f64x2_t); -__f32x4_t __v_atanf (__f32x4_t); -__f64x2_t __v_atan (__f64x2_t); -__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); -__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); -__f32x4_t __v_atanhf (__f32x4_t); -__f64x2_t __v_atanh (__f64x2_t); -__f32x4_t __v_cbrtf (__f32x4_t); -__f64x2_t __v_cbrt (__f64x2_t); -__f32x4_t __v_coshf (__f32x4_t); -__f64x2_t __v_cosh (__f64x2_t); -__f32x4_t __v_erff (__f32x4_t); -__f64x2_t __v_erf (__f64x2_t); -__f32x4_t __v_erfcf (__f32x4_t); -__f64x2_t __v_erfc (__f64x2_t); -__f32x4_t __v_expm1f (__f32x4_t); -__f64x2_t __v_expm1 (__f64x2_t); -__f32x4_t __v_log10f (__f32x4_t); -__f64x2_t __v_log10 (__f64x2_t); -__f32x4_t __v_log1pf (__f32x4_t); -__f64x2_t __v_log1p (__f64x2_t); -__f32x4_t __v_log2f (__f32x4_t); -__f64x2_t __v_log2 (__f64x2_t); -__f32x4_t __v_sinhf (__f32x4_t); -__f64x2_t __v_sinh (__f64x2_t); -__f32x4_t __v_tanf (__f32x4_t); -__f64x2_t __v_tan (__f64x2_t); -__f32x4_t __v_tanhf (__f32x4_t); -__f64x2_t __v_tanh (__f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# define __vpcs __attribute__ ((__aarch64_vector_pcs__)) -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) +typedef struct __f32x4x2_t +{ + __f32x4_t val[2]; +} __f32x4x2_t; -/* Vector functions following the vector PCS. */ -__vpcs __f32x4_t __vn_acoshf (__f32x4_t); -__vpcs __f64x2_t __vn_acosh (__f64x2_t); -__vpcs __f32x4_t __vn_asinhf (__f32x4_t); -__vpcs __f64x2_t __vn_asinh (__f64x2_t); -__vpcs __f32x4_t __vn_atanf (__f32x4_t); -__vpcs __f64x2_t __vn_atan (__f64x2_t); -__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); -__vpcs __f32x4_t __vn_atanhf (__f32x4_t); -__vpcs __f64x2_t __vn_atanh (__f64x2_t); -__vpcs __f32x4_t __vn_cbrtf (__f32x4_t); -__vpcs __f64x2_t __vn_cbrt (__f64x2_t); -__vpcs __f32x4_t __vn_coshf (__f32x4_t); -__vpcs __f64x2_t __vn_cosh (__f64x2_t); -__vpcs __f32x4_t __vn_erff (__f32x4_t); -__vpcs __f64x2_t __vn_erf (__f64x2_t); -__vpcs __f32x4_t __vn_erfcf (__f32x4_t); -__vpcs __f64x2_t __vn_erfc (__f64x2_t); -__vpcs __f32x4_t __vn_expm1f (__f32x4_t); -__vpcs __f64x2_t __vn_expm1 (__f64x2_t); -__vpcs __f32x4_t __vn_log10f (__f32x4_t); -__vpcs __f64x2_t __vn_log10 (__f64x2_t); -__vpcs __f32x4_t __vn_log1pf (__f32x4_t); -__vpcs __f64x2_t __vn_log1p (__f64x2_t); -__vpcs __f32x4_t __vn_log2f (__f32x4_t); -__vpcs __f64x2_t __vn_log2 (__f64x2_t); -__vpcs __f32x4_t __vn_sinhf (__f32x4_t); -__vpcs __f64x2_t __vn_sinh (__f64x2_t); -__vpcs __f32x4_t __vn_tanf (__f32x4_t); -__vpcs __f64x2_t __vn_tan (__f64x2_t); -__vpcs __f32x4_t __vn_tanhf (__f32x4_t); -__vpcs __f64x2_t __vn_tanh (__f64x2_t); +typedef struct __f64x2x2_t +{ + __f64x2_t val[2]; +} __f64x2x2_t; /* Vector functions following the vector PCS using ABI names. 
*/ __vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); +__vpcs __f32x4x2_t _ZGVnN4v_cexpif (__f32x4_t); +__vpcs __f64x2x2_t _ZGVnN2v_cexpi (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_cospif (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cospi (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_erfinvf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_erfinv (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); __vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); +__vpcs void _ZGVnN4vl4l4_sincosf (__f32x4_t, __f32x4_t *, __f32x4_t *); +__vpcs void _ZGVnN2vl8l8_sincos (__f64x2_t, __f64x2_t *, __f64x2_t *); -#endif +# endif -#if WANT_SVE_MATH -#include -svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t); -svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); -svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); -svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); -svfloat32_t __sv_erff_x (svfloat32_t, svbool_t); -svfloat64_t __sv_erf_x (svfloat64_t, svbool_t); -svfloat64_t __sv_erfc_x (svfloat64_t, svbool_t); -svfloat32_t __sv_expf_x (svfloat32_t, svbool_t); -svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_log_x (svfloat64_t, svbool_t); -svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); -svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); -svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t); -svfloat64_t __sv_log2_x (svfloat64_t, svbool_t); -svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); -svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); -svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); -svfloat64_t __sv_sin_x 
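The _ZGVnN* declarations above follow the AArch64 vector function ABI, so they can be called directly on NEON vectors as well as being selected by a compiler when auto-vectorizing scalar calls. A hypothetical usage sketch (the wrapper name is illustrative; in real code the declaration comes from this mathlib.h rather than being repeated):

#include <arm_neon.h>

__attribute__ ((__aarch64_vector_pcs__)) float32x4_t
_ZGVnN4v_expm1f (float32x4_t);

/* Compute expm1f on four lanes with one vector-PCS call.  */
float32x4_t
expm1f_x4 (float32x4_t x)
{
  return _ZGVnN4v_expm1f (x);
}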
(svfloat64_t, svbool_t); -svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); -/* SVE ABI names. */ +# if WANT_SVE_MATH +# include +svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t); +svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t); +svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_coshf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t); svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); -svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t); +svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t); svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_sinpi (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t); svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); -#endif +svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t); +void _ZGVsMxvl4l4_sincosf (svfloat32_t, 
float *, float *, svbool_t); +void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, svbool_t); +# endif #endif #endif -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/include/pl_test.h b/contrib/arm-optimized-routines/pl/math/include/pl_test.h index 6a81360ba287..3a3407e337b8 100644 --- a/contrib/arm-optimized-routines/pl/math/include/pl_test.h +++ b/contrib/arm-optimized-routines/pl/math/include/pl_test.h @@ -1,26 +1,24 @@ /* * PL macros to aid testing. This version of this file is used for building the * routine, not the tests. Separate definitions are found in test/pl_test.h * which emit test parameters. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. */ /* Emit max ULP threshold - silenced for building the routine. */ #define PL_TEST_ULP(f, l) -/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of - strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is - also added to the test suite. */ -#define PL_ALIAS(a, b) strong_alias (a, b) - /* Emit routine name if e == 1 and f is expected to correctly trigger fenv exceptions. e allows declaration to be emitted conditionally upon certain build flags - defer expansion by one pass to allow those flags to be expanded properly. */ #define PL_TEST_EXPECT_FENV(f, e) #define PL_TEST_EXPECT_FENV_ALWAYS(f) #define PL_TEST_INTERVAL(f, lo, hi, n) +#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) #define PL_TEST_INTERVAL_C(f, lo, hi, n, c) +#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) +#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) diff --git a/contrib/arm-optimized-routines/pl/math/log1p_2u.c b/contrib/arm-optimized-routines/pl/math/log1p_2u.c index 23c8ed4a1914..f9491ce52b44 100644 --- a/contrib/arm-optimized-routines/pl/math/log1p_2u.c +++ b/contrib/arm-optimized-routines/pl/math/log1p_2u.c @@ -1,136 +1,131 @@ /* * Double-precision log(1+x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "estrin.h" +#include "poly_scalar_f64.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 #define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ #define OneMHfRt2Top \ 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ #define OneTop12 0x3ff #define BottomMask 0xffffffff #define OneMHfRt2 0x3fd2bec333018866 #define Rt2MOne 0x3fda827999fcef32 #define AbsMask 0x7fffffffffffffff #define ExpM63 0x3c00 -#define C(i) __log1p_data.coeffs[i] static inline double eval_poly (double f) { double f2 = f * f; double f4 = f2 * f2; double f8 = f4 * f4; - return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); + return estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs); } /* log1p approximation using polynomial on reduced interval. Largest observed errors are near the lower boundary of the region where k is 0. Maximum measured error: 1.75ULP. log1p(-0x1.2e1aea97b3e5cp-2) got -0x1.65fb8659a2f9p-2 want -0x1.65fb8659a2f92p-2. */ double log1p (double x) { uint64_t ix = asuint64 (x); uint64_t ia = ix & AbsMask; uint32_t ia16 = ia >> 48; /* Handle special cases first. */ if (unlikely (ia16 >= 0x7ff0 || ix >= 0xbff0000000000000 || ix == 0x8000000000000000)) { if (ix == 0x8000000000000000 || ix == 0x7ff0000000000000) { /* x == -0 => log1p(x) = -0. x == Inf => log1p(x) = Inf. */ return x; } if (ix == 0xbff0000000000000) { /* x == -1 => log1p(x) = -Inf. 
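PL_TEST_SYM_INTERVAL, introduced in pl_test.h above, lets one annotation stand for the former +lo..hi/-lo..-hi pair, which is why the interval lists shrink throughout this import. In this build-side header it expands to nothing; on the test-emitting side (test/pl_test.h, not shown here) a plausible expansion, given purely as an assumption for illustration, would be:

/* Hypothetical sketch: one symmetric annotation emitting both signed
   intervals.  The real test/pl_test.h definition may differ.  */
#define PL_TEST_SYM_INTERVAL(f, lo, hi, n)                                    \
  PL_TEST_INTERVAL (f, lo, hi, n)                                             \
  PL_TEST_INTERVAL (f, -lo, -hi, n)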
*/ return __math_divzero (-1); ; } if (ia16 >= 0x7ff0) { /* x == +/-NaN => log1p(x) = NaN. */ return __math_invalid (asdouble (ia)); } /* x < -1 => log1p(x) = NaN. x == -Inf => log1p(x) = NaN. */ return __math_invalid (x); } /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f is in [sqrt(2)/2, sqrt(2)]): log1p(x) = k*log(2) + log1p(f). f may not be representable exactly, so we need a correction term: let m = round(1 + x), c = (1 + x) - m. c << m: at very small x, log1p(x) ~ x, hence: log(1+x) - log(m) ~ c/m. We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ uint64_t sign = ix & ~AbsMask; if (ia <= OneMHfRt2 || (!sign && ia <= Rt2MOne)) { if (unlikely (ia16 <= ExpM63)) { /* If exponent of x <= -63 then shortcut the polynomial and avoid underflow by just returning x, which is exactly rounded in this region. */ return x; } /* If x is in [sqrt(2)/2 - 1, sqrt(2) - 1] then we can shortcut all the logic below, as k = 0 and f = x and therefore representable exactly. All we need is to return the polynomial. */ return fma (x, eval_poly (x) * x, x); } /* Obtain correctly scaled k by manipulation in the exponent. */ double m = x + 1; uint64_t mi = asuint64 (m); uint32_t u = (mi >> 32) + OneMHfRt2Top; int32_t k = (int32_t) (u >> 20) - OneTop12; /* Correction term c/m. */ double cm = (x - (m - 1)) / m; /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ uint32_t utop = (u & 0x000fffff) + HfRt2Top; uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); double f = asdouble (u_red) - 1; /* Approximate log1p(x) on the reduced input using a polynomial. Because log1p(0)=0 we choose an approximation of the form: x + C0*x^2 + C1*x^3 + C2x^4 + ... Hence approximation has the form f + f^2 * P(f) where P(x) = C0 + C1*x + C2x^2 + ... */ double p = fma (f, eval_poly (f) * f, f); double kd = k; double y = fma (Ln2Lo, kd, cm); return y + fma (Ln2Hi, kd, p); } PL_SIG (S, D, 1, log1p, -0.9, 10.0) PL_TEST_ULP (log1p, 1.26) -PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000) -PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000) -PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000) -PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000) -PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000) -PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000) -PL_TEST_INTERVAL (log1p, -1.0, inf, 5000) +PL_TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000) +PL_TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c index fcfd05a6fcb7..e99174853720 100644 --- a/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c +++ b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c @@ -1,165 +1,161 @@ /* * Single-precision log(1+x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "hornerf.h" +#include "poly_scalar_f32.h" #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define Ln2 (0x1.62e43p-1f) #define SignMask (0x80000000) /* Biased exponent of the largest float m for which m^8 underflows. */ #define M8UFLOW_BOUND_BEXP 112 /* Biased exponent of the largest float for which we just return x. */ #define TINY_BOUND_BEXP 103 #define C(i) __log1pf_data.coeffs[i] static inline float eval_poly (float m, uint32_t e) { #ifdef LOG1PF_2U5 /* 2.5 ulp variant. 
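A note on the correction term in log1p above: m = x + 1 rounds away the low bits of x whenever the exponents differ, and c = x - (m - 1) recovers that rounding error exactly (both subtractions are exact by Sterbenz' lemma in this range). Since log(1+x) - log(m) = log(1 + c/m) ~= c/m for tiny c/m, adding c/m repairs the result. A small demonstration:

#include <stdio.h>

int
main (void)
{
  double x = 0x1.0000001p-30;
  double m = x + 1;	    /* rounded value of 1 + x */
  double c = x - (m - 1);   /* exact residual of that addition */
  printf ("c = %a, correction c/m = %a\n", c, c / m);
  return 0;
}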
Approximate log(1+m) on [-0.25, 0.5] using slightly modified Estrin scheme (no x^0 term, and x term is just x). */ float p_12 = fmaf (m, C (1), C (0)); float p_34 = fmaf (m, C (3), C (2)); float p_56 = fmaf (m, C (5), C (4)); float p_78 = fmaf (m, C (7), C (6)); float m2 = m * m; float p_02 = fmaf (m2, p_12, m); float p_36 = fmaf (m2, p_56, p_34); float p_79 = fmaf (m2, C (8), p_78); float m4 = m2 * m2; float p_06 = fmaf (m4, p_36, p_02); if (unlikely (e < M8UFLOW_BOUND_BEXP)) return p_06; float m8 = m4 * m4; return fmaf (m8, p_79, p_06); #elif defined(LOG1PF_1U3) /* 1.3 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. Our polynomial approximation for log1p has the form x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... Hence approximation has the form m + m^2 * P(m) where P(x) = C1 + C2 * x + C3 * x^2 + ... . */ - return fmaf (m, m * HORNER_8 (m, C), m); + return fmaf (m, m * horner_8_f32 (m, __log1pf_data.coeffs), m); #else #error No log1pf approximation exists with the requested precision. Options are 13 or 25. #endif } static inline uint32_t biased_exponent (uint32_t ix) { return (ix & 0x7f800000) >> 23; } /* log1pf approximation using polynomial on reduced interval. Worst-case error when using Estrin is roughly 2.02 ULP: log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ float log1pf (float x) { uint32_t ix = asuint (x); uint32_t ia = ix & ~SignMask; uint32_t ia12 = ia >> 20; uint32_t e = biased_exponent (ix); /* Handle special cases first. */ if (unlikely (ia12 >= 0x7f8 || ix >= 0xbf800000 || ix == 0x80000000 || e <= TINY_BOUND_BEXP)) { if (ix == 0xff800000) { /* x == -Inf => log1pf(x) = NaN. */ return NAN; } if ((ix == 0x7f800000 || e <= TINY_BOUND_BEXP) && ia12 <= 0x7f8) { /* |x| < TinyBound => log1p(x) = x. x == Inf => log1pf(x) = Inf. */ return x; } if (ix == 0xbf800000) { /* x == -1.0 => log1pf(x) = -Inf. */ return __math_divzerof (-1); } if (ia12 >= 0x7f8) { /* x == +/-NaN => log1pf(x) = NaN. */ return __math_invalidf (asfloat (ia)); } /* x < -1.0 => log1pf(x) = NaN. */ return __math_invalidf (x); } /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m is in [-0.25, 0.5]): log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). We approximate log1p(m) with a polynomial, then scale by k*log(2). Instead of doing this directly, we use an intermediate scale factor s = 4*k*log(2) to ensure the scale is representable as a normalised fp32 number. */ if (ix <= 0x3f000000 || ia <= 0x3e800000) { /* If x is in [-0.25, 0.5] then we can shortcut all the logic below, as k = 0 and m = x. All we need is to return the polynomial. */ return eval_poly (x, e); } float m = x + 1.0f; /* k is used to scale the input. 0x3f400000 is chosen as we are trying to reduce x to the range [-0.25, 0.5]. Inside this range, k is 0. Outside this range, if k is reinterpreted as (NOT CONVERTED TO) float: let k = sign * 2^p where sign = -1 if x < 0 1 otherwise and p is a negative integer whose magnitude increases with the magnitude of x. */ int k = (asuint (m) - 0x3f400000) & 0xff800000; /* By using integer arithmetic, we obtain the necessary scaling by subtracting the unbiased exponent of k from the exponent of x. */ float m_scale = asfloat (asuint (x) - k); /* Scale up to ensure that the scale factor is representable as normalised fp32 number (s in [2**-126,2**26]), and scale m down accordingly.
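The integer-exponent trick in log1pf above generalizes: k captures only exponent bits (mask 0xff800000), so subtracting k from a float's bit pattern multiplies it by an exact power of two, with no rounding and no ldexpf call. Sketched in isolation (illustrative name):

#include <stdint.h>
#include <string.h>

/* Multiply v by 2^-(k >> 23) exactly, where k holds only exponent bits
   (as produced by (asuint (m) - 0x3f400000) & 0xff800000 above).
   Assumes the result stays within the normalised float range.  */
static float
scale_down (float v, uint32_t k)
{
  uint32_t iv;
  memcpy (&iv, &v, 4);
  iv -= k;		/* subtract k from the exponent field */
  float r;
  memcpy (&r, &iv, 4);
  return r;
}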
*/ float s = asfloat (asuint (4.0f) - k); m_scale = m_scale + fmaf (0.25f, s, -1.0f); float p = eval_poly (m_scale, biased_exponent (asuint (m_scale))); /* The scale factor to be applied back at the end - by multiplying float(k) by 2^-23 we get the unbiased exponent of k. */ float scale_back = (float) k * 0x1.0p-23f; /* Apply the scaling back. */ return fmaf (scale_back, Ln2, p); } PL_SIG (S, F, 1, log1p, -0.9, 10.0) PL_TEST_ULP (log1pf, 1.52) -PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000) -PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) -PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000) -PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000) -PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000) -PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000) -PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000) +PL_TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000) +PL_TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/math_config.h b/contrib/arm-optimized-routines/pl/math/math_config.h index dccb3ce4c775..c3dd8f2db8c7 100644 --- a/contrib/arm-optimized-routines/pl/math/math_config.h +++ b/contrib/arm-optimized-routines/pl/math/math_config.h @@ -1,572 +1,624 @@ /* * Configuration for math routines. * * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H #define _MATH_CONFIG_H #include #include #ifndef WANT_ROUNDING /* If defined to 1, return correct results for special cases in non-nearest - rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). - This may be set to 0 if there is no fenv support or if math functions only - get called in round to nearest mode. */ + rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than + -0.0f). This may be set to 0 if there is no fenv support or if math + functions only get called in round to nearest mode. */ # define WANT_ROUNDING 1 #endif #ifndef WANT_ERRNO /* If defined to 1, set errno in math functions according to ISO C. Many math libraries do not set errno, so this is 0 by default. It may need to be set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ # define WANT_ERRNO 0 #endif #ifndef WANT_SIMD_EXCEPT /* If defined to 1, trigger fp exceptions in vector routines, consistently with behaviour expected from the corresponding scalar routine. */ -#define WANT_SIMD_EXCEPT 0 +# define WANT_SIMD_EXCEPT 0 #endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND # if __aarch64__ -# define HAVE_FAST_ROUND 1 +# define HAVE_FAST_ROUND 1 # else -# define HAVE_FAST_ROUND 0 +# define HAVE_FAST_ROUND 0 # endif #endif /* Compiler can inline lround, but not (long)round(x). */ #ifndef HAVE_FAST_LROUND -# if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ -# define HAVE_FAST_LROUND 1 +# if __aarch64__ && (100 * __GNUC__ + __GNUC_MINOR__) >= 408 \ + && __NO_MATH_ERRNO__ +# define HAVE_FAST_LROUND 1 # else -# define HAVE_FAST_LROUND 0 +# define HAVE_FAST_LROUND 0 # endif #endif /* Compiler can inline fma as a single instruction. 
*/ #ifndef HAVE_FAST_FMA # if defined FP_FAST_FMA || __aarch64__ -# define HAVE_FAST_FMA 1 +# define HAVE_FAST_FMA 1 # else -# define HAVE_FAST_FMA 0 +# define HAVE_FAST_FMA 0 # endif #endif /* Provide *_finite symbols and some of the glibc hidden symbols so libmathlib can be used with binaries compiled against glibc to interpose math functions with both static and dynamic linking. */ #ifndef USE_GLIBC_ABI # if __GNUC__ -# define USE_GLIBC_ABI 1 +# define USE_GLIBC_ABI 1 # else -# define USE_GLIBC_ABI 0 +# define USE_GLIBC_ABI 0 # endif #endif /* Optionally used extensions. */ #ifdef __GNUC__ # define HIDDEN __attribute__ ((__visibility__ ("hidden"))) # define NOINLINE __attribute__ ((noinline)) # define UNUSED __attribute__ ((unused)) # define likely(x) __builtin_expect (!!(x), 1) # define unlikely(x) __builtin_expect (x, 0) # if __GNUC__ >= 9 -# define attribute_copy(f) __attribute__ ((copy (f))) +# define attribute_copy(f) __attribute__ ((copy (f))) # else -# define attribute_copy(f) +# define attribute_copy(f) # endif -# define strong_alias(f, a) \ - extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); -# define hidden_alias(f, a) \ - extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ - attribute_copy (f); +# define strong_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); +# define hidden_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ + attribute_copy (f); #else # define HIDDEN # define NOINLINE # define UNUSED # define likely(x) (x) # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. */ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ +#define __math_oflowf arm_math_oflowf +#define __math_uflowf arm_math_uflowf +#define __math_may_uflowf arm_math_may_uflowf +#define __math_divzerof arm_math_divzerof +#define __math_oflow arm_math_oflow +#define __math_uflow arm_math_uflow +#define __math_may_uflow arm_math_may_uflow +#define __math_divzero arm_math_divzero +#define __math_invalidf arm_math_invalidf +#define __math_invalid arm_math_invalid +#define __math_check_oflow arm_math_check_oflow +#define __math_check_uflow arm_math_check_uflow +#define __math_check_oflowf arm_math_check_oflowf +#define __math_check_uflowf arm_math_check_uflowf + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ # define TOINT_INTRINSICS 1 /* Round x to nearest int in all rounding modes, ties have to be rounded consistently with converttoint so the results match. If the result would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ static inline double_t roundtoint (double_t x) { return round (x); } /* Convert x to nearest int in all rounding modes, ties have to be rounded consistently with roundtoint. If the result is not representible in an int32_t then the semantics is unspecified. 
*/ static inline int32_t converttoint (double_t x) { # if HAVE_FAST_LROUND return lround (x); # else return (long) round (x); # endif } #endif static inline uint32_t asuint (float f) { union { float f; uint32_t i; - } u = {f}; + } u = { f }; return u.i; } static inline float asfloat (uint32_t i) { union { uint32_t i; float f; - } u = {i}; + } u = { i }; return u.f; } static inline uint64_t asuint64 (double f) { union { double f; uint64_t i; - } u = {f}; + } u = { f }; return u.i; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; - } u = {i}; + } u = { i }; return u.f; } #ifndef IEEE_754_2008_SNAN # define IEEE_754_2008_SNAN 1 #endif static inline int issignalingf_inline (float x) { uint32_t ix = asuint (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7fc00000) == 0x7fc00000; return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; } static inline int issignaling_inline (double x) { uint64_t ix = asuint64 (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } #if __aarch64__ && __GNUC__ /* Prevent the optimization of a floating-point expression. */ static inline float opt_barrier_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } static inline double opt_barrier_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } /* Force the evaluation of a floating-point expression for its side-effect. */ static inline void force_eval_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); } static inline void force_eval_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); } #else static inline float opt_barrier_float (float x) { volatile float y = x; return y; } static inline double opt_barrier_double (double x) { volatile double y = x; return y; } static inline void force_eval_float (float x) { volatile float y UNUSED = x; } static inline void force_eval_double (double x) { volatile double y UNUSED = x; } #endif /* Evaluate an expression as the specified type, normally a type cast should be enough, but compilers implement non-standard excess-precision handling, so when FLT_EVAL_METHOD != 0 then these functions may need to be customized. */ static inline float eval_as_float (float x) { return x; } static inline double eval_as_double (double x) { return x; } /* Error handling tail calls for special cases, with a sign argument. The sign of the return value is set if the argument is non-zero. */ /* The result overflows. */ HIDDEN float __math_oflowf (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN float __math_uflowf (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN float __math_may_uflowf (uint32_t); /* Division by zero. */ HIDDEN float __math_divzerof (uint32_t); /* The result overflows. */ HIDDEN double __math_oflow (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN double __math_uflow (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN double __math_may_uflow (uint32_t); /* Division by zero. */ HIDDEN double __math_divzero (uint32_t); /* Error handling using input checking. */ /* Invalid input unless it is a quiet NaN. */ HIDDEN float __math_invalidf (float); /* Invalid input unless it is a quiet NaN. */ HIDDEN double __math_invalid (double); /* Error handling using output checking, only for errno setting. */ /* Check if the result overflowed to infinity. 
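The opt_barrier/force_eval helpers above exist so that exception-raising operations cannot be reordered or hoisted by the compiler. The pow fallback earlier in this diff shows the pattern in context; a condensed sketch of the division case (illustrative wrapper names, AArch64 "w" register constraint as in the source):

/* Keep 1/x2 behind the branch: without the barrier some compilers
   hoist the division and raise a spurious divide-by-zero.  */
static inline double
opt_barrier_double_demo (double x)
{
  __asm__ __volatile__ ("" : "+w" (x));
  return x;
}

static double
recip_if_neg_y (double x2, int y_is_negative)
{
  return y_is_negative ? opt_barrier_double_demo (1 / x2) : x2;
}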
*/ HIDDEN double __math_check_oflow (double); /* Check if the result underflowed to 0. */ HIDDEN double __math_check_uflow (double); /* Check if the result overflowed to infinity. */ static inline double check_oflow (double x) { return WANT_ERRNO ? __math_check_oflow (x) : x; } /* Check if the result underflowed to 0. */ static inline double check_uflow (double x) { return WANT_ERRNO ? __math_check_uflow (x) : x; } /* Check if the result overflowed to infinity. */ HIDDEN float __math_check_oflowf (float); /* Check if the result underflowed to 0. */ HIDDEN float __math_check_uflowf (float); /* Check if the result overflowed to infinity. */ static inline float check_oflowf (float x) { return WANT_ERRNO ? __math_check_oflowf (x) : x; } /* Check if the result underflowed to 0. */ static inline float check_uflowf (float x) { return WANT_ERRNO ? __math_check_uflowf (x) : x; } extern const struct erff_data { - float erff_poly_A[6]; - float erff_poly_B[7]; + struct + { + float erf, scale; + } tab[513]; } __erff_data HIDDEN; +extern const struct sv_erff_data +{ + float erf[513]; + float scale[513]; +} __sv_erff_data HIDDEN; + +extern const struct erfcf_data +{ + struct + { + float erfc, scale; + } tab[645]; +} __erfcf_data HIDDEN; + /* Data for logf and log10f. */ #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 extern const struct logf_data { struct { double invc, logc; } tab[1 << LOGF_TABLE_BITS]; double ln2; double invln10; double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ } __logf_data HIDDEN; /* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */ #define LOG10_TABLE_BITS 7 #define LOG10_POLY_ORDER 6 #define LOG10_POLY1_ORDER 12 extern const struct log10_data { double ln2hi; double ln2lo; double invln10; double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */ double poly1[LOG10_POLY1_ORDER - 1]; - struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS]; + struct + { + double invc, logc; + } tab[1 << LOG10_TABLE_BITS]; #if !HAVE_FAST_FMA - struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS]; + struct + { + double chi, clo; + } tab2[1 << LOG10_TABLE_BITS]; #endif } __log10_data HIDDEN; #define EXP_TABLE_BITS 7 #define EXP_POLY_ORDER 5 /* Use polynomial that is optimized for a wider input range. This may be needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */ #define EXP_POLY_WIDE 0 /* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be needed for good precision in non-nearest rouning and !EXP_POLY_WIDE. */ #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 extern const struct exp_data { double invln2N; double shift; double negln2hiN; double negln2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; - uint64_t tab[2*(1 << EXP_TABLE_BITS)]; + uint64_t tab[2 * (1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; -#define ERFC_NUM_INTERVALS 20 -#define ERFC_POLY_ORDER 12 -extern const struct erfc_data -{ - double interval_bounds[ERFC_NUM_INTERVALS + 1]; - double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1]; -} __erfc_data HIDDEN; -extern const struct v_erfc_data -{ - double interval_bounds[ERFC_NUM_INTERVALS + 1]; - double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1]; -} __v_erfc_data HIDDEN; - -#define ERFCF_POLY_NCOEFFS 16 -extern const struct erfcf_poly_data -{ - double poly[4][ERFCF_POLY_NCOEFFS]; -} __erfcf_poly_data HIDDEN; - +/* Copied from math/v_exp.h for use in vector exp_tail. 
*/ #define V_EXP_TAIL_TABLE_BITS 8 extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; -#define V_ERF_NINTS 49 -#define V_ERF_NCOEFFS 10 -extern const struct v_erf_data +/* Copied from math/v_exp.h for use in vector exp2. */ +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; + +extern const struct erf_data +{ + struct + { + double erf, scale; + } tab[769]; +} __erf_data HIDDEN; + +extern const struct sv_erf_data { - double shifts[V_ERF_NINTS]; - double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS]; -} __v_erf_data HIDDEN; + double erf[769]; + double scale[769]; +} __sv_erf_data HIDDEN; -#define V_ERFF_NCOEFFS 7 -extern const struct v_erff_data +extern const struct erfc_data { - float coeffs[V_ERFF_NCOEFFS][2]; -} __v_erff_data HIDDEN; + struct + { + double erfc, scale; + } tab[3488]; +} __erfc_data HIDDEN; #define ATAN_POLY_NCOEFFS 20 extern const struct atan_poly_data { double poly[ATAN_POLY_NCOEFFS]; } __atan_poly_data HIDDEN; #define ATANF_POLY_NCOEFFS 8 extern const struct atanf_poly_data { float poly[ATANF_POLY_NCOEFFS]; } __atanf_poly_data HIDDEN; #define ASINHF_NCOEFFS 8 extern const struct asinhf_data { float coeffs[ASINHF_NCOEFFS]; } __asinhf_data HIDDEN; #define LOG_TABLE_BITS 7 #define LOG_POLY_ORDER 6 #define LOG_POLY1_ORDER 12 extern const struct log_data { double ln2hi; double ln2lo; double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ double poly1[LOG_POLY1_ORDER - 1]; struct { double invc, logc; } tab[1 << LOG_TABLE_BITS]; #if !HAVE_FAST_FMA struct { double chi, clo; } tab2[1 << LOG_TABLE_BITS]; #endif } __log_data HIDDEN; #define ASINH_NCOEFFS 18 extern const struct asinh_data { double poly[ASINH_NCOEFFS]; } __asinh_data HIDDEN; #define LOG1P_NCOEFFS 19 extern const struct log1p_data { double coeffs[LOG1P_NCOEFFS]; } __log1p_data HIDDEN; #define LOG1PF_2U5 -#define V_LOG1PF_2U5 #define LOG1PF_NCOEFFS 9 extern const struct log1pf_data { float coeffs[LOG1PF_NCOEFFS]; } __log1pf_data HIDDEN; #define TANF_P_POLY_NCOEFFS 6 /* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. */ #define TANF_Q_POLY_NCOEFFS 4 extern const struct tanf_poly_data { float poly_tan[TANF_P_POLY_NCOEFFS]; float poly_cotan[TANF_Q_POLY_NCOEFFS]; } __tanf_poly_data HIDDEN; -#define V_LOG2F_POLY_NCOEFFS 9 -extern const struct v_log2f_data -{ - float poly[V_LOG2F_POLY_NCOEFFS]; -} __v_log2f_data HIDDEN; - #define V_LOG2_TABLE_BITS 7 -#define V_LOG2_POLY_ORDER 6 extern const struct v_log2_data { - double poly[V_LOG2_POLY_ORDER - 1]; + double poly[5]; + double invln2; struct { double invc, log2c; - } tab[1 << V_LOG2_TABLE_BITS]; + } table[1 << V_LOG2_TABLE_BITS]; } __v_log2_data HIDDEN; -#define V_SINF_NCOEFFS 4 -extern const struct sv_sinf_data -{ - float coeffs[V_SINF_NCOEFFS]; -} __sv_sinf_data HIDDEN; - #define V_LOG10_TABLE_BITS 7 -#define V_LOG10_POLY_ORDER 6 extern const struct v_log10_data { + double poly[5]; + double invln10, log10_2; struct { double invc, log10c; - } tab[1 << V_LOG10_TABLE_BITS]; - double poly[V_LOG10_POLY_ORDER - 1]; - double invln10, log10_2; + } table[1 << V_LOG10_TABLE_BITS]; } __v_log10_data HIDDEN; -#define V_LOG10F_POLY_ORDER 9 -extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; - -#define SV_LOGF_POLY_ORDER 8 -extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN; - -#define SV_LOG_POLY_ORDER 6 -#define SV_LOG_TABLE_BITS 7 -extern const struct sv_log_data +/* Some data for SVE powf's internal exp and log. 
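The invc/log2c (and invc/log10c) pairs in the 128-entry tables above implement the standard table-driven log reduction: the top mantissa bits of x select a c with precomputed 1/c and log2(c), so that r = x/c - 1 is small and log2(x) = log2(c) + log2(1 + r), with log2(1 + r) approximated by the short polynomial. A schematic scalar sketch, first order only (the index handling is simplified here; the real routines bias the index and handle subnormals and special cases):

    static double
    toy_log2 (double x)
    {
      uint64_t ix = asuint64 (x);
      int64_t e = (int64_t) (ix >> 52) - 1023;
      uint64_t i = (ix >> (52 - V_LOG2_TABLE_BITS))
                   & ((1 << V_LOG2_TABLE_BITS) - 1);
      /* Normalize the significand into [1, 2).  */
      double z = asdouble ((ix & 0xfffffffffffff) | 0x3ff0000000000000);
      double r = z * __v_log2_data.table[i].invc - 1.0; /* |r| small */
      /* log2(x) = e + log2(c) + log2(1 + r); higher poly terms omitted.  */
      return (double) e + __v_log2_data.table[i].log2c
             + r * __v_log2_data.invln2;
    }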
*/ +#define V_POWF_EXP2_TABLE_BITS 5 +#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS) +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS) +extern const struct v_powf_data { - double invc[1 << SV_LOG_TABLE_BITS]; - double logc[1 << SV_LOG_TABLE_BITS]; - double poly[SV_LOG_POLY_ORDER - 1]; -} __sv_log_data HIDDEN; + double invc[V_POWF_LOG2_N]; + double logc[V_POWF_LOG2_N]; + uint64_t scale[V_POWF_EXP2_N]; +} __v_powf_data HIDDEN; -#ifndef SV_EXPF_USE_FEXPA -#define SV_EXPF_USE_FEXPA 0 -#endif -#define SV_EXPF_POLY_ORDER 6 -extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; +#define V_LOG_POLY_ORDER 6 +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + /* Shared data for vector log and log-derived routines (e.g. asinh). */ + double poly[V_LOG_POLY_ORDER - 1]; + double ln2; + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; #define EXPM1F_POLY_ORDER 5 extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; #define EXPF_TABLE_BITS 5 #define EXPF_POLY_ORDER 3 extern const struct expf_data { uint64_t tab[1 << EXPF_TABLE_BITS]; double invln2_scaled; double poly_scaled[EXPF_POLY_ORDER]; } __expf_data HIDDEN; #define EXPM1_POLY_ORDER 11 extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; extern const struct cbrtf_data { float poly[4]; float table[5]; } __cbrtf_data HIDDEN; extern const struct cbrt_data { double poly[4]; double table[5]; } __cbrt_data HIDDEN; -extern const struct v_tan_data +#define ASINF_POLY_ORDER 4 +extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN; + +#define ASIN_POLY_ORDER 11 +extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN; + +/* Some data for AdvSIMD and SVE pow's internal exp and log. */ +#define V_POW_EXP_TABLE_BITS 8 +extern const struct v_pow_exp_data { - double neg_half_pi_hi, neg_half_pi_lo; - double poly[9]; -} __v_tan_data HIDDEN; + double poly[3]; + double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift; + uint64_t sbits[1 << V_POW_EXP_TABLE_BITS]; +} __v_pow_exp_data HIDDEN; + +#define V_POW_LOG_TABLE_BITS 7 +extern const struct v_pow_log_data +{ + double poly[7]; /* First coefficient is 1. */ + double ln2_hi, ln2_lo; + double invc[1 << V_POW_LOG_TABLE_BITS]; + double logc[1 << V_POW_LOG_TABLE_BITS]; + double logctail[1 << V_POW_LOG_TABLE_BITS]; +} __v_pow_log_data HIDDEN; + #endif diff --git a/contrib/arm-optimized-routines/pl/math/math_err.c b/contrib/arm-optimized-routines/pl/math/math_err.c index d246a89982de..74db54a5b2cd 100644 --- a/contrib/arm-optimized-routines/pl/math/math_err.c +++ b/contrib/arm-optimized-routines/pl/math/math_err.c @@ -1,78 +1,78 @@ /* * Double-precision math error handling. * * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #if WANT_ERRNO -#include <errno.h> +# include <errno.h> /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static double with_errno (double y, int e) { errno = e; return y; } #else -#define with_errno(x, e) (x) +# define with_errno(x, e) (x) #endif /* NOINLINE reduces code size. */ NOINLINE static double xflow (uint32_t sign, double y) { y = eval_as_double (opt_barrier_double (sign ? -y : y) * y); return with_errno (y, ERANGE); } HIDDEN double __math_uflow (uint32_t sign) { return xflow (sign, 0x1p-767); } /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range.
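Note why xflow squares a barriered operand rather than returning a literal: the run-time multiply is what raises the IEEE overflow or underflow exception, and opt_barrier_double keeps the compiler from constant-folding it away; errno is then set explicitly by with_errno. A standalone illustration of the arithmetic (volatile stands in for the barrier here):

    #include <stdio.h>

    int
    main (void)
    {
      volatile double huge = 0x1p769, tiny = 0x1p-767;
      double y = huge * huge; /* 2^1538: overflows to inf, raises FE_OVERFLOW */
      double z = tiny * tiny; /* 2^-1534: underflows to 0, raises FE_UNDERFLOW */
      printf ("%g %g\n", y, z); /* inf 0 */
      return 0;
    }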
*/ HIDDEN double __math_may_uflow (uint32_t sign) { return xflow (sign, 0x1.8p-538); } HIDDEN double __math_oflow (uint32_t sign) { return xflow (sign, 0x1p769); } HIDDEN double __math_divzero (uint32_t sign) { double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0; return with_errno (y, ERANGE); } HIDDEN double __math_invalid (double x) { double y = (x - x) / (x - x); return isnan (x) ? y : with_errno (y, EDOM); } /* Check result and set errno if necessary. */ HIDDEN double __math_check_uflow (double y) { return y == 0.0 ? with_errno (y, ERANGE) : y; } HIDDEN double __math_check_oflow (double y) { return isinf (y) ? with_errno (y, ERANGE) : y; } diff --git a/contrib/arm-optimized-routines/pl/math/math_errf.c b/contrib/arm-optimized-routines/pl/math/math_errf.c index 96271ff18bc1..2b8c6bd25753 100644 --- a/contrib/arm-optimized-routines/pl/math/math_errf.c +++ b/contrib/arm-optimized-routines/pl/math/math_errf.c @@ -1,78 +1,78 @@ /* * Single-precision math error handling. * * Copyright (c) 2017-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #if WANT_ERRNO -#include <errno.h> +# include <errno.h> /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static float with_errnof (float y, int e) { errno = e; return y; } #else -#define with_errnof(x, e) (x) +# define with_errnof(x, e) (x) #endif /* NOINLINE reduces code size. */ NOINLINE static float xflowf (uint32_t sign, float y) { y = eval_as_float (opt_barrier_float (sign ? -y : y) * y); return with_errnof (y, ERANGE); } HIDDEN float __math_uflowf (uint32_t sign) { return xflowf (sign, 0x1p-95f); } /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range. */ HIDDEN float __math_may_uflowf (uint32_t sign) { return xflowf (sign, 0x1.4p-75f); } HIDDEN float __math_oflowf (uint32_t sign) { return xflowf (sign, 0x1p97f); } HIDDEN float __math_divzerof (uint32_t sign) { float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f; return with_errnof (y, ERANGE); } HIDDEN float __math_invalidf (float x) { float y = (x - x) / (x - x); return isnan (x) ? y : with_errnof (y, EDOM); } /* Check result and set errno if necessary. */ HIDDEN float __math_check_uflowf (float y) { return y == 0.0f ? with_errnof (y, ERANGE) : y; } HIDDEN float __math_check_oflowf (float y) { return isinf (y) ? with_errnof (y, ERANGE) : y; } diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner.h deleted file mode 100644 index 6ad98dccd6aa..000000000000 --- a/contrib/arm-optimized-routines/pl/math/pairwise_horner.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for double-precision pairwise Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f64 -#else -#define FMA fma -#endif - -#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h deleted file mode 100644 index e56f059514ad..000000000000 --- a/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Helper macros for pairwise Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited.
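The (x - x) / (x - x) idiom in __math_invalid and __math_invalidf is chosen deliberately: for a finite or infinite x it evaluates 0/0 (or inf - inf), which raises FE_INVALID and yields a quiet NaN, while for a quiet NaN input it merely propagates the NaN without raising anything, so the isnan check ensures errno = EDOM is set only for genuinely out-of-domain arguments. In sketch form (demo_invalid is illustrative):

    static double
    demo_invalid (double x)
    {
      /* Finite or infinite x: 0.0/0.0 or inf - inf -> quiet NaN,
         FE_INVALID raised.  Quiet NaN x: propagates, no flag.  */
      return (x - x) / (x - x);
    }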
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -// clang-format off -#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i)) -#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) - -#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0) -#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0) -#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0) -#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0) -#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0) -#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0) -#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0) -#define PAIRWISE_HORNER_15(x, x2, c) PW_HORNER_15_(x, x2, c, 0) -#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0) - -#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) -#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) - -#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0) -#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0) -#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0) -#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0) -#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0) -#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0) -#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0) -#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0) -#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0) -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h b/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h deleted file mode 100644 index 784750cde0b6..000000000000 --- a/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Helper macros for single-precision pairwise Horner polynomial evaluation. - * - * Copyright (c) 2022-2023, Arm Limited. 
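For reference, the deleted macros expand to the pairwise-Horner form p(x) = (c0 + c1 x) + x^2 ((c2 + c3 x) + ...): the even/odd FMA pairs are independent, so the dependency chain is roughly half that of plain Horner. PAIRWISE_HORNER_3 (x, x2, c), for instance, is FMA (x2, FMA (x, c (3), c (2)), FMA (x, c (1), c (0))), or spelled out as a scalar function (cubic_pairwise is an illustrative name):

    #include <math.h>

    static double
    cubic_pairwise (double x, const double *c)
    {
      double x2 = x * x;
      double lo = fma (x, c[1], c[0]); /* c0 + c1 x */
      double hi = fma (x, c[3], c[2]); /* c2 + c3 x */
      return fma (x2, hi, lo);         /* lo + x2 * hi */
    }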
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if V_SUPPORTED -#define FMA v_fma_f32 -#else -#define FMA fmaf -#endif - -#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pl_sig.h b/contrib/arm-optimized-routines/pl/math/pl_sig.h index 686d24f0d9a5..52d988f0e1ce 100644 --- a/contrib/arm-optimized-routines/pl/math/pl_sig.h +++ b/contrib/arm-optimized-routines/pl/math/pl_sig.h @@ -1,43 +1,59 @@ /* * PL macros for emitting various ulp/bench entries based on function signature * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. */ + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun + +#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f +#define SV_NAME_D1(fun) _ZGVsMxv_##fun +#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f +#define SV_NAME_D2(fun) _ZGVsMxvv_##fun + #define PL_DECL_SF1(fun) float fun##f (float); #define PL_DECL_SF2(fun) float fun##f (float, float); #define PL_DECL_SD1(fun) double fun (double); #define PL_DECL_SD2(fun) double fun (double, double); -#if V_SUPPORTED -#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t); -#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t); -#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t); -#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t); +#if WANT_VMATH +# define PL_DECL_VF1(fun) \ + VPCS_ATTR float32x4_t V_NAME_F1 (fun##f) (float32x4_t); +# define PL_DECL_VF2(fun) \ + VPCS_ATTR float32x4_t V_NAME_F2 (fun##f) (float32x4_t, float32x4_t); +# define PL_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t); +# define PL_DECL_VD2(fun) \ + VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t); #else -#define PL_DECL_VF1(fun) -#define PL_DECL_VF2(fun) -#define PL_DECL_VD1(fun) -#define PL_DECL_VD2(fun) +# define PL_DECL_VF1(fun) +# define PL_DECL_VF2(fun) +# define PL_DECL_VD1(fun) +# define PL_DECL_VD2(fun) #endif -#if SV_SUPPORTED -#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t); -#define PL_DECL_SVF2(fun) \ - sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t); -#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t); -#define PL_DECL_SVD2(fun) \ - sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t); +#if WANT_SVE_MATH +# define PL_DECL_SVF1(fun) \ + svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t); +# define PL_DECL_SVF2(fun) \ + svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t); +# define PL_DECL_SVD1(fun) \ + svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t); +# define PL_DECL_SVD2(fun) \ + svfloat64_t SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t); #else -#define PL_DECL_SVF1(fun) -#define PL_DECL_SVF2(fun) -#define PL_DECL_SVD1(fun) -#define PL_DECL_SVD2(fun) +# define PL_DECL_SVF1(fun) +# define PL_DECL_SVF2(fun) +# define PL_DECL_SVD1(fun) +# define PL_DECL_SVD2(fun) #endif /* For building the routines, emit function prototype from PL_SIG. This ensures that the correct signature has been chosen (wrong one will be a compile error). PL_SIG is defined differently by various components of the build system to emit entries in the wrappers and entries for mathbench and ulp. */ #define PL_SIG(v, t, a, f, ...) 
PL_DECL_##v##t##a (f) diff --git a/contrib/arm-optimized-routines/pl/math/poly_advsimd_f32.h b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f32.h new file mode 100644 index 000000000000..438e153dff90 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f32.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on single-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_ADVSIMD_F32_H +#define PL_MATH_POLY_ADVSIMD_F32_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f32. */ +#define VTYPE float32x4_t +#define FMA(x, y, z) vfmaq_f32 (z, x, y) +#define VWRAP(f) v_##f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_advsimd_f64.h b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f64.h new file mode 100644 index 000000000000..7ea249a91225 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_advsimd_f64.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on double-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_ADVSIMD_F64_H +#define PL_MATH_POLY_ADVSIMD_F64_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f64. */ +#define VTYPE float64x2_t +#define FMA(x, y, z) vfmaq_f64 (z, x, y) +#define VWRAP(f) v_##f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_generic.h b/contrib/arm-optimized-routines/pl/math/poly_generic.h new file mode 100644 index 000000000000..3fc25f8762f2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_generic.h @@ -0,0 +1,277 @@ +/* + * Generic helpers for evaluating polynomials with various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef VTYPE +# error Cannot use poly_generic without defining VTYPE +#endif +#ifndef VWRAP +# error Cannot use poly_generic without defining VWRAP +#endif +#ifndef FMA +# error Cannot use poly_generic without defining FMA +#endif + +static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2, + const VTYPE *poly) +{ + /* At order 3, Estrin and Pairwise Horner are identical.
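These wrapper headers replace the token-pasting FMA macros above: each one instantiates poly_generic.h once with concrete VTYPE/FMA/VWRAP definitions, producing ordinary typed inline helpers such as v_pairwise_poly_3_f32 or v_estrin_7_f64 that the routines call directly. An illustrative caller (eval_cubic is not library code):

    #include <arm_neon.h>
    #include "poly_advsimd_f32.h"

    /* Evaluate c[0] + c[1] x + c[2] x^2 + c[3] x^3 on four lanes at once.  */
    static float32x4_t
    eval_cubic (float32x4_t x, const float32x4_t *c)
    {
      float32x4_t x2 = vmulq_f32 (x, x);
      return v_pairwise_poly_3_f32 (x, x2, c);
    }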
*/ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + return FMA (p23, x2, p01); +} + +static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + return FMA (poly[4], x4, p03); +} +static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p45 = FMA (poly[5], x, poly[4]); + return FMA (p45, x4, p03); +} +static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p45 = FMA (poly[5], x, poly[4]); + VTYPE p46 = FMA (poly[6], x2, p45); + return FMA (p46, x4, p03); +} +static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4); + return FMA (p47, x4, p03); +} +static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p89 = FMA (poly[9], x, poly[8]); + return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p89 = FMA (poly[9], x, poly[8]); + VTYPE p8_10 = FMA (poly[10], x2, p89); + return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8); + return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_17 = FMA (poly[17], x, poly[16]); + return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_17 = FMA (poly[17], x, poly[16]); + VTYPE p16_18 = FMA (poly[18], x2, p16_17); + return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16); + return 
FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} + +static inline VTYPE VWRAP (horner_2) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[2], x, poly[1]); + return FMA (x, p, poly[0]); +} +static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[3], x, poly[2]); + p = FMA (x, p, poly[1]); + p = FMA (x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[4], x, poly[3]); + p = FMA (x, p, poly[2]); + p = FMA (x, p, poly[1]); + p = FMA (x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]); +} + +static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + VTYPE p; + p = FMA (x2, poly[4], p23); + p = FMA (x2, p, p01); + return p; +} +static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + VTYPE p45 = FMA (poly[5], x, poly[4]); + VTYPE p; + p = FMA (x2, p45, p23); + p = FMA (x2, p, p01); + return p; +} +static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p26, p01); +} +static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p27, p01); +} +static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p28, p01); +} +static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p29, p01); +} +static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_10, p01); +} +static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_11, p01); +} +static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_12 = VWRAP 
(pw_horner_10) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_12, p01); +} +static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_13, p01); +} +static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_14, p01); +} +static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_15, p01); +} +static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_16, p01); +} +static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_17, p01); +} +static inline VTYPE VWRAP (pw_horner_18) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_18 = VWRAP (pw_horner_16) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_18, p01); +} diff --git a/contrib/arm-optimized-routines/pl/math/poly_scalar_f32.h b/contrib/arm-optimized-routines/pl/math/poly_scalar_f32.h new file mode 100644 index 000000000000..a9b1c5544494 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_scalar_f32.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on single-precision scalar input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SCALAR_F32_H +#define PL_MATH_POLY_SCALAR_F32_H + +#include <math.h> + +/* Wrap scalar f32 helpers: evaluation of some scheme/order has form: + [scheme]_[order]_f32. */ +#define VTYPE float +#define FMA fmaf +#define VWRAP(f) f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_scalar_f64.h b/contrib/arm-optimized-routines/pl/math/poly_scalar_f64.h new file mode 100644 index 000000000000..207dccee30ad --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_scalar_f64.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on double-precision scalar input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SCALAR_F64_H +#define PL_MATH_POLY_SCALAR_F64_H + +#include <math.h> + +/* Wrap scalar f64 helpers: evaluation of some scheme/order has form: + [scheme]_[order]_f64. */ +#define VTYPE double +#define FMA fma +#define VWRAP(f) f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_sve_f32.h b/contrib/arm-optimized-routines/pl/math/poly_sve_f32.h new file mode 100644 index 000000000000..a97e2ced027a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_sve_f32.h @@ -0,0 +1,26 @@ +/* + * Helpers for evaluating polynomials on single-precision SVE input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited.
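The scalar headers reuse the exact same generic body, so scalar and vector variants of a routine share one polynomial-evaluation scheme; with VTYPE = double and FMA = fma, VWRAP (horner_4) becomes horner_4_f64, and so on. Illustrative use (quartic is not library code):

    #include "poly_scalar_f64.h"

    static double
    quartic (double x, const double *c)
    {
      return horner_4_f64 (x, c); /* c[0] + c[1] x + ... + c[4] x^4 */
    }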
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SVE_F32_H +#define PL_MATH_POLY_SVE_F32_H + +#include <arm_sve.h> + +/* Wrap SVE f32 helpers: evaluation of some scheme/order has form: + sv_[scheme]_[order]_f32_x. */ +#define VTYPE svfloat32_t +#define STYPE float +#define VWRAP(f) sv_##f##_f32_x +#define DUP svdup_f32 +#include "poly_sve_generic.h" +#undef DUP +#undef VWRAP +#undef STYPE +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_sve_f64.h b/contrib/arm-optimized-routines/pl/math/poly_sve_f64.h new file mode 100644 index 000000000000..5fb14b3c1700 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_sve_f64.h @@ -0,0 +1,26 @@ +/* + * Helpers for evaluating polynomials on double-precision SVE input, using + * various schemes. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_POLY_SVE_F64_H +#define PL_MATH_POLY_SVE_F64_H + +#include <arm_sve.h> + +/* Wrap SVE f64 helpers: evaluation of some scheme/order has form: + sv_[scheme]_[order]_f64_x. */ +#define VTYPE svfloat64_t +#define STYPE double +#define VWRAP(f) sv_##f##_f64_x +#define DUP svdup_f64 +#include "poly_sve_generic.h" +#undef DUP +#undef VWRAP +#undef STYPE +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/poly_sve_generic.h b/contrib/arm-optimized-routines/pl/math/poly_sve_generic.h new file mode 100644 index 000000000000..b568e4cddff3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/poly_sve_generic.h @@ -0,0 +1,301 @@ +/* + * Helpers for evaluating polynomials with various schemes - specific to SVE + * but precision-agnostic. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef VTYPE +# error Cannot use poly_sve_generic without defining VTYPE +#endif +#ifndef STYPE +# error Cannot use poly_sve_generic without defining STYPE +#endif +#ifndef VWRAP +# error Cannot use poly_sve_generic without defining VWRAP +#endif +#ifndef DUP +# error Cannot use poly_sve_generic without defining DUP +#endif + +static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + /* At order 3, Estrin and Pairwise Horner are identical.
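The SVE instantiation differs from the AdvSIMD one in two visible ways: every helper threads the governing predicate pg through svmla_x/svmad_x, and coefficients are passed as scalar STYPE pointers that are broadcast with DUP on demand rather than pre-splatted vectors, which keeps the coefficient tables precision-agnostic. Illustrative caller, assuming SVE is enabled at compile time (sv_cubic is not library code):

    #include <arm_sve.h>
    #include "poly_sve_f32.h"

    static svfloat32_t
    sv_cubic (svbool_t pg, svfloat32_t x, const float *c)
    {
      svfloat32_t x2 = svmul_x (pg, x, x);
      return sv_pairwise_poly_3_f32_x (pg, x, x2, c);
    }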
*/ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + return svmla_x (pg, p01, p23, x2); +} + +static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + return svmla_x (pg, p03, x4, poly[4]); +} +static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + return svmla_x (pg, p03, p45, x4); +} +static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + VTYPE p46 = svmla_x (pg, p45, x, poly[6]); + return svmla_x (pg, p03, p46, x4); +} +static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4); + return svmla_x (pg, p03, p47, x4); +} +static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]); +} +static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + VTYPE x8, const STYPE *poly) +{ + VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8); +} +static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); + VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8); +} +static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8); +} +static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16, + poly[16]); +} +static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); + return svmla_x (pg, VWRAP (estrin_15) (pg, 
x, x2, x4, x8, poly), p16_17, + x16); +} +static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); + VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]); + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18, + x16); +} +static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), + VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16); +} + +static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]); + p = svmad_x (pg, x, p, poly[1]); + p = svmad_x (pg, x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]); + p = svmad_x (pg, x, p, poly[2]); + p = svmad_x (pg, x, p, poly[1]); + p = svmad_x (pg, x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_10) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_11) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_10) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_12) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_11) (pg, x, poly + 1), poly[0]); +} + +static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + VTYPE p; + p = svmla_x (pg, p23, x2, poly[4]); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + VTYPE p; + p = svmla_x (pg, p23, x2, p45); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p26); +} +static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p27); +} +static inline VTYPE
VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p28); +} +static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p29); +} +static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_10); +} +static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_11); +} +static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_12); +} +static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_13); +} +static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_14); +} +static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_15); +} +static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_16); +} +static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_17); +} +static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_18); +} diff --git a/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c deleted file mode 100644 index f62cbd6b53f0..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_acosh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c deleted file mode 100644 index 374066622a0f..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_acoshf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c deleted file mode 100644 index ab8fbd9c3d69..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_asinh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c deleted file mode 100644 index 13e1a5fd314a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_asinhf_2u7.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c deleted file mode 100644 index 4603e5f72615..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c deleted file mode 100644 index 894d843273ea..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan2f_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c deleted file mode 100644 index 4b61bc4d1460..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atan_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c deleted file mode 100644 index 6b6571927195..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atanf_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c deleted file mode 100644 index f6a5f75b1779..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atanh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c deleted file mode 100644 index e7e5c6197406..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_atanhf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c deleted file mode 100644 index 435e74a546c6..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cbrt_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c deleted file mode 100644 index 5c793704b62a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cbrtf_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c deleted file mode 100644 index cdf352cf5793..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_cosh_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c deleted file mode 100644 index 8f7d5da6e6ef..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_coshf_2u4.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erf_2u.c b/contrib/arm-optimized-routines/pl/math/s_erf_2u.c deleted file mode 100644 index 839535c3897f..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erf_2u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erf_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c deleted file mode 100644 index bf9e3e62bd31..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erfc_4u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c deleted file mode 100644 index 024d22498ff5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erfcf_1u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c deleted file mode 100644 index a5b9bf9afa72..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_erff_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_exp_tail.c b/contrib/arm-optimized-routines/pl/math/s_exp_tail.c deleted file mode 100644 index 20b1b41a9689..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_exp_tail.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_exp_tail.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expf.c b/contrib/arm-optimized-routines/pl/math/s_expf.c deleted file mode 100644 index 557a2e3d36af..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c deleted file mode 100644 index da2d6e7ebf82..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expm1_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c deleted file mode 100644 index eea8089da989..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_expm1f_1u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c deleted file mode 100644 index 2480e5aa2cf1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log10_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c deleted file mode 100644 index 173e0fdc3400..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log10f_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c deleted file mode 100644 index 20b395a5a2d0..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log1p_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c deleted file mode 100644 index 013ec4c1d903..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log1pf_2u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2_3u.c b/contrib/arm-optimized-routines/pl/math/s_log2_3u.c deleted file mode 100644 index d46f3f998190..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log2_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c deleted file mode 100644 index e76c67dceb62..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_log2f_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c deleted file mode 100644 index 27e5e65db178..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sinh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c deleted file mode 100644 index 607f94298a79..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_sinhf_2u3.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c deleted file mode 100644 index adb807c5beb8..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tan_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c deleted file mode 100644 index fa64c8aef697..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tanf_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c deleted file mode 100644 index a4d7bce649f1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tanh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c deleted file mode 100644 index 896fc62ebe9b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#define SCALAR 1 -#include "v_tanhf_2u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/sinh_3u.c b/contrib/arm-optimized-routines/pl/math/sinh_3u.c index f534815c6674..1d86629ee2a3 100644 --- a/contrib/arm-optimized-routines/pl/math/sinh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/sinh_3u.c @@ -1,66 +1,63 @@ /* * Double-precision sinh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 #define OFlowBound \ 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ in NaN. */ double __exp_dd (double, double); /* Approximation for double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. The greatest observed error is 2.57 ULP: __v_sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 want 0x1.ab34e59d678d9p-2. */ double sinh (double x) { uint64_t ix = asuint64 (x); uint64_t iax = ix & AbsMask; double ax = asdouble (iax); uint64_t sign = ix & ~AbsMask; double halfsign = asdouble (Half | sign); if (unlikely (iax >= OFlowBound)) { /* Special values and overflow. */ if (unlikely (iax > 0x7ff0000000000000)) return __math_invalidf (x); /* expm1 overflows a little before sinh. We have to fill this gap by using a different algorithm, in this case we use a double-precision exp helper. For large x sinh(x) is dominated by exp(x), however we cannot compute exp without overflow either. We use the identity: exp(a) = (exp(a / 2)) ^ 2 to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. */ double e = __exp_dd (ax / 2, 0); return (e * halfsign) * e; } /* Use expm1f to retain acceptable precision for small numbers. Let t = e^(|x|) - 1. */ double t = expm1 (ax); /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 (t + t / (t + 1)) / -2 for x < 0. */ return (t + t / (t + 1)) * halfsign; } PL_SIG (S, D, 1, sinh, -10.0, 10.0) PL_TEST_ULP (sinh, 2.08) -PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100) -PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100) -PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) -PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000) -PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) -PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000) +PL_TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100) +PL_TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) +PL_TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c index de944288a02b..aa7aadcf67c5 100644 --- a/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c +++ b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c @@ -1,76 +1,73 @@ /* * Single-precision sinh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 #define Expm1OFlowLimit \ 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ overflows. */ #define OFlowLimit \ 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \ overflow. */ float optr_aor_exp_f32 (float); /* Approximation for single-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. 
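A minimal scalar sketch of this expm1-based scheme, using standard libm calls in place of the optimized helpers (the overflow bound is the double-precision one quoted above; everything else is plain C99 and purely illustrative):

#include <math.h>
#include <stdio.h>

/* Sketch of the expm1-based sinh scheme: t = expm1(|x|) for moderate
   inputs, and the exp(|x|/2) squaring trick in the gap where expm1
   would overflow before sinh does.  */
static double
sinh_sketch (double x)
{
  double ax = fabs (x);
  double halfsign = copysign (0.5, x);
  if (ax >= 0x1.62e42fefa39fp+9) /* expm1 overflows a little before sinh.  */
    {
      double e = exp (ax / 2);
      return (e * halfsign) * e;
    }
  double t = expm1 (ax);
  return (t + t / (t + 1)) * halfsign;
}

int
main (void)
{
  printf ("%a %a\n", sinh_sketch (0.75), sinh (0.75));
  return 0;
}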
The maximum error is 2.26 ULP: sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. */ float sinhf (float x) { uint32_t ix = asuint (x); uint32_t iax = ix & AbsMask; float ax = asfloat (iax); uint32_t sign = ix & ~AbsMask; float halfsign = asfloat (Half | sign); if (unlikely (iax >= Expm1OFlowLimit)) { /* Special values and overflow. */ if (iax >= 0x7fc00001 || iax == 0x7f800000) return x; if (iax >= 0x7f800000) return __math_invalidf (x); if (iax >= OFlowLimit) return __math_oflowf (sign); /* expm1f overflows a little before sinhf, (~88.7 vs ~89.4). We have to fill this gap by using a different algorithm, in this case we use a double-precision exp helper. For large x sinh(x) is dominated by exp(x), however we cannot compute exp without overflow either. We use the identity: exp(a) = (exp(a / 2)) ^ 2 to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. Greatest error in this region is 1.89 ULP: sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */ float e = optr_aor_exp_f32 (ax / 2); return (e * halfsign) * e; } /* Use expm1f to retain acceptable precision for small numbers. Let t = e^(|x|) - 1. */ float t = expm1f (ax); /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 (t + t / (t + 1)) / -2 for x < 0. */ return (t + t / (t + 1)) * halfsign; } PL_SIG (S, F, 1, sinh, -10.0, 10.0) PL_TEST_ULP (sinhf, 1.76) -PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) -PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000) -PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) -PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100) -PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) -PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100) +PL_TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) +PL_TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) +PL_TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/sinpi_3u.c b/contrib/arm-optimized-routines/pl/math/sinpi_3u.c new file mode 100644 index 000000000000..a04a352a62e6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sinpi_3u.c @@ -0,0 +1,90 @@ +/* + * Double-precision scalar sinpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include <math.h> +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_scalar_f64.h" + +/* Taylor series coefficients for sin(pi * x). + C2 coefficient (originally ~=5.16771278) has been split into two parts: + C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278) + This change in magnitude reduces floating point rounding errors. + C2_hi is then reintroduced after the polynomial approximation. */ +static const double poly[] + = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21, + -0x1.012a9870eeb7dp-25 }; + +#define Shift 0x1.8p+52 + +/* Approximation for scalar double-precision sinpi(x). + Maximum error: 3.03 ULP: + sinpi(0x1.a90da2818f8b5p+7) got 0x1.fe358f255a4b3p-1 + want 0x1.fe358f255a4b6p-1. */ +double +sinpi (double x) +{ + if (isinf (x)) + return __math_invalid (x); + + double r = asdouble (asuint64 (x) & ~0x8000000000000000); + uint64_t sign = asuint64 (x) & 0x8000000000000000; + + /* Edge cases for when sinpi should be exactly 0. (Integers) + 0x1p53 is the limit for double precision to store any decimal places.
*/ + if (r >= 0x1p53) + return 0; + + /* If x is an integer, return 0. */ + uint64_t m = (uint64_t) r; + if (r == m) + return 0; + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via sinpi(x) ≈ pi*x. */ + if (r < 0x1p-63) + return M_PI * x; + + /* Any non-integer values >= 0x1p51 will be int + 0.5. + These values should return exactly 1 or -1. */ + if (r >= 0x1p51) + { + uint64_t iy = ((m & 1) << 63) ^ asuint64 (1.0); + return asdouble (sign ^ iy); + } + + /* n = rint(|x|). */ + double n = r + Shift; + sign ^= (asuint64 (n) << 63); + n = n - Shift; + + /* r = |x| - n (range reduction into -1/2 .. 1/2). */ + r = r - n; + + /* y = sin(pi * r). */ + double r2 = r * r; + double y = horner_9_f64 (r2, poly); + y = y * r; + + /* Reintroduce C2_hi. */ + y = fma (-4 * r2, r, y); + + /* Copy sign of x to sin(|x|). */ + return asdouble (asuint64 (y) ^ sign); +} + +PL_SIG (S, D, 1, sinpi, -0.9, 0.9) +PL_TEST_ULP (sinpi, 2.53) +PL_TEST_SYM_INTERVAL (sinpi, 0, 0x1p-63, 5000) +PL_TEST_SYM_INTERVAL (sinpi, 0x1p-63, 0.5, 10000) +PL_TEST_SYM_INTERVAL (sinpi, 0.5, 0x1p51, 10000) +PL_TEST_SYM_INTERVAL (sinpi, 0x1p51, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sinpif_2u5.c b/contrib/arm-optimized-routines/pl/math/sinpif_2u5.c new file mode 100644 index 000000000000..af9ca0573b37 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sinpif_2u5.c @@ -0,0 +1,83 @@ +/* + * Single-precision scalar sinpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Taylor series coefficients for sin(pi * x). */ +#define C0 0x1.921fb6p1f +#define C1 -0x1.4abbcep2f +#define C2 0x1.466bc6p1f +#define C3 -0x1.32d2ccp-1f +#define C4 0x1.50783p-4f +#define C5 -0x1.e30750p-8f + +#define Shift 0x1.0p+23f + +/* Approximation for scalar single-precision sinpi(x) - sinpif. + Maximum error: 2.48 ULP: + sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1 + want 0x1.fa8c02p-1. */ +float
sinpif (float x) { + if (isinf (x)) + return __math_invalidf (x); + + float r = asfloat (asuint (x) & ~0x80000000); + uint32_t sign = asuint (x) & 0x80000000; + + /* Edge cases for when sinpif should be exactly 0. (Integers) + 0x1p23 is the limit for single precision to store any decimal places. */ + if (r >= 0x1p23f) + return 0; + + int32_t m = roundf (r); + if (m == r) + return 0; + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via sinpi(x) ~= pi*x. */ + if (r < 0x1p-31f) + return C0 * x; + + /* Any non-integer values >= 0x1p22f will be int + 0.5. + These values should return exactly 1 or -1. */ + if (r >= 0x1p22f) + { + uint32_t iy = ((m & 1) << 31) ^ asuint (-1.0f); + return asfloat (sign ^ iy); + } + + /* n = rint(|x|). */ + float n = r + Shift; + sign ^= (asuint (n) << 31); + n = n - Shift; + + /* r = |x| - n (range reduction into -1/2 .. 1/2). */ + r = r - n; + + /* y = sin(pi * r). */ + float r2 = r * r; + float y = fmaf (C5, r2, C4); + y = fmaf (y, r2, C3); + y = fmaf (y, r2, C2); + y = fmaf (y, r2, C1); + y = fmaf (y, r2, C0); + + /* Copy sign of x to sin(|x|).
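The Shift-based range reduction that both sinpi variants rely on is worth a standalone illustration. This sketch assumes the default round-to-nearest mode, which is what makes r + 0x1.8p52 - 0x1.8p52 behave like rint for r below 2^51 (plain C99, illustrative only):

#include <stdio.h>

int
main (void)
{
  /* Adding and subtracting Shift = 0x1.8p52 forces the fraction bits
     out of a double smaller than 2^51, so n is r rounded to the
     nearest integer and r - n lands in [-1/2, 1/2] (round-to-nearest
     mode assumed).  */
  volatile double shift = 0x1.8p52; /* volatile: keep it a runtime op.  */
  double r = 27.3;
  double n = (r + shift) - shift;
  printf ("n = %g, reduced = %g\n", n, r - n); /* 27 and 0.3.  */
  return 0;
}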
*/ + return asfloat (asuint (y * r) ^ sign); +} + +PL_SIG (S, F, 1, sinpi, -0.9, 0.9) +PL_TEST_ULP (sinpif, 1.99) +PL_TEST_SYM_INTERVAL (sinpif, 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (sinpif, 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (sinpif, 0.5, 0x1p22f, 10000) +PL_TEST_SYM_INTERVAL (sinpif, 0x1p22f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acos_2u.c b/contrib/arm-optimized-routines/pl/math/sv_acos_2u.c new file mode 100644 index 000000000000..e06db6cae6af --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acos_2u.c @@ -0,0 +1,91 @@ +/* + * Double-precision SVE acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[12]; + float64_t pi, pi_over_2; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, + 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, + 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, + .pi = 0x1.921fb54442d18p+1, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; + +/* Double-precision SVE implementation of vector acos(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 + want 0x1.0d4d0f55667f7p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 + want 0x1.ed82df4243f0bp-1. */ +svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + + svbool_t a_gt_half = svacgt (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
*/ + svfloat64_t y + = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); + + svbool_t is_neg = svcmplt (pg, x, 0.0); + svfloat64_t off = svdup_f64_z (is_neg, d->pi); + svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); + svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); + + return svmla_x (pg, add, mul, y); +} + +PL_SIG (SV, D, 1, acos, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_D1 (acos), 1.02) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acosf_1u4.c b/contrib/arm-optimized-routines/pl/math/sv_acosf_1u4.c new file mode 100644 index 000000000000..7ac59ceedfbd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acosf_1u4.c @@ -0,0 +1,84 @@ +/* + * Single-precision SVE acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32_t poly[5]; + float32_t pi, pi_over_2; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, + 0x1.3af7d8p-5, }, + .pi = 0x1.921fb6p+1f, + .pi_over_2 = 0x1.921fb6p+0f, +}; + +/* Single-precision SVE implementation of vector acos(x). + + For |x| in [0, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.16 ulps, + _ZGVsMxv_acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 + want 0x1.0c27f6p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVsMxv_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_gt_half = svacgt (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
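These three cases can be exercised in scalar code, with libm's asin standing in for the shared polynomial Q (a hedged sketch of the reconstruction, not the routine's actual kernel):

#define _GNU_SOURCE /* M_PI.  */
#include <math.h>
#include <stdio.h>

/* acos reconstructed from an asin-like kernel Q, mirroring the three
   cases above; here Q(|x|) is played by asin itself.  */
static double
acos_sketch (double x)
{
  double ax = fabs (x);
  if (ax < 0.5)
    return M_PI / 2 - asin (x);
  double q = asin (sqrt ((1 - ax) / 2)); /* z = sqrt((1-|x|)/2).  */
  return x > 0 ? 2 * q : M_PI - 2 * q;
}

int
main (void)
{
  for (double x = -1.0; x <= 1.0; x += 0.25)
    printf ("% .2f: %a vs %a\n", x, acos_sketch (x), acos (x));
  return 0;
}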
*/ + svfloat32_t y + = svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (p), sign)); + + svbool_t is_neg = svcmplt (pg, x, 0.0); + svfloat32_t off = svdup_f32_z (is_neg, d->pi); + svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0), sv_f32 (-1.0)); + svfloat32_t add = svsel (a_gt_half, off, sv_f32 (d->pi_over_2)); + + return svmla_x (pg, add, mul, y); +} + +PL_SIG (SV, F, 1, acos, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_F1 (acos), 0.82) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_acosh_3u5.c new file mode 100644 index 000000000000..faf351331464 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acosh_3u5.c @@ -0,0 +1,50 @@ +/* + * Double-precision SVE acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 1 +#include "sv_log1p_inline.h" + +#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ +#define OneTop 0x3ff + +static NOINLINE svfloat64_t +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (acosh, x, y, special); +} + +/* SVE approximation for double-precision acosh, based on log1p. + The largest observed error is 3.19 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 + want 0x1.ed23399f51373p-2. */ +svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) +{ + svuint64_t itop = svlsr_x (pg, svreinterpret_u64 (x), 52); + /* (itop - OneTop) >= (BigBoundTop - OneTop). */ + svbool_t special = svcmpge (pg, svsub_x (pg, itop, OneTop), sv_u64 (0x1ff)); + + svfloat64_t xm1 = svsub_x (pg, x, 1); + svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1)); + svfloat64_t y = sv_log1p_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); + + /* Fall back to scalar routine for special lanes. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + + return y; +} + +PL_SIG (SV, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (acosh), 2.69) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_acoshf_2u8.c b/contrib/arm-optimized-routines/pl/math/sv_acoshf_2u8.c new file mode 100644 index 000000000000..f527083af40a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_acoshf_2u8.c @@ -0,0 +1,47 @@ +/* + * Single-precision SVE acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define One 0x3f800000 +#define Thres 0x20000000 /* asuint(0x1p64) - One. */ + +#include "sv_log1pf_inline.h" + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (acoshf, x, y, special); +} + +/* Single-precision SVE acosh(x) routine. 
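Both the double-precision routine above and the single-precision one that follows feed log1p with (x - 1) + sqrt((x - 1) * (x + 1)); a scalar sketch of that reduction against libm (illustrative only):

#include <math.h>
#include <stdio.h>

/* acosh(x) = log(x + sqrt(x^2 - 1))
            = log1p((x - 1) + sqrt((x - 1) * (x + 1))),
   which stays accurate for x close to 1, where forming x^2 - 1
   directly would cancel.  */
static double
acosh_sketch (double x)
{
  double xm1 = x - 1;
  double u = xm1 * (x + 1);
  return log1p (xm1 + sqrt (u));
}

int
main (void)
{
  printf ("%a %a\n", acosh_sketch (1.25), acosh (1.25));
  return 0;
}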
Implements the same algorithm as + vector acoshf and log1p. + + Maximum error is 2.78 ULPs: + SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4 + want 0x1.f45b3cp-4. */ +svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) +{ + svuint32_t ix = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); + + svfloat32_t xm1 = svsub_x (pg, x, 1.0f); + svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); + svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + return y; +} + +PL_SIG (SV, F, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (acosh), 2.29) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000) +PL_TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_asin_3u.c b/contrib/arm-optimized-routines/pl/math/sv_asin_3u.c new file mode 100644 index 000000000000..c3dd37b145ae --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_asin_3u.c @@ -0,0 +1,84 @@ +/* + * Double-precision SVE asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[12]; + float64_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, + 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, + 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, + 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, + -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, + .pi_over_2f = 0x1.921fb54442d18p+0, +}; + +#define P(i) sv_f64 (d->poly[i]) + +/* Double-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.52 ulps, + _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 + want 0x1.ec13757305f26p-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 + want 0x1.110d7e85fdd53p-1. */ +svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. 
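The Estrin-style evaluation referred to here pairs coefficients with fma and then combines the pairs using precomputed even powers, which shortens the dependency chain relative to Horner. A scalar degree-3 sketch of the idea (the SVE helpers below apply the same scheme at degree 11):

#include <math.h>
#include <stdio.h>

/* Estrin evaluation of c0 + c1*x + c2*x^2 + c3*x^3 as
   (c0 + c1*x) + x^2*(c2 + c3*x): the two fma pairs are independent
   and can issue in parallel.  */
static double
estrin_3 (double x, const double c[4])
{
  double x2 = x * x;
  double p01 = fma (c[1], x, c[0]);
  double p23 = fma (c[3], x, c[2]);
  return fma (p23, x2, p01);
}

int
main (void)
{
  const double c[4] = { 1, 2, 3, 4 };
  printf ("%g\n", estrin_3 (0.5, c)); /* 1 + 1 + 0.75 + 0.5 = 3.25.  */
  return 0;
}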
*/ + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); + + /* Copy sign. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} + +PL_SIG (SV, D, 1, asin, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_D1 (asin), 2.19) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_asinf_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_asinf_2u5.c new file mode 100644 index 000000000000..8e9edc2439f5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_asinf_2u5.c @@ -0,0 +1,76 @@ +/* + * Single-precision SVE asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32_t poly[5]; + float32_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, + 0x1.3af7d8p-5, }, + .pi_over_2f = 0x1.921fb6p+0f, +}; + +/* Single-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVsMxv_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 + want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVsMxv_asinf (-0x1.00203ep-1) got -0x1.0c3a64p-1 + want -0x1.0c3a6p-1. */ +svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat32_t y = svmad_m (a_ge_half, p, sv_f32 (-2.0), d->pi_over_2f); + + /* Copy sign. 
*/ + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} + +PL_SIG (SV, F, 1, asin, -1.0, 1.0) +PL_TEST_ULP (SV_NAME_F1 (asin), 1.91) +PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000) \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_asinh_3u0.c b/contrib/arm-optimized-routines/pl/math/sv_asinh_3u0.c new file mode 100644 index 000000000000..711f0dfdbedc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_asinh_3u0.c @@ -0,0 +1,129 @@ +/* + * Double-precision SVE asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define OneTop sv_u64 (0x3ff) /* top12(asuint64(1.0)). */ +#define HugeBound sv_u64 (0x5fe) /* top12(asuint64(0x1p511)). */ +#define TinyBound (0x3e5) /* top12(asuint64(0x1p-26)). */ +#define SignMask (0x8000000000000000) + +/* Constants & data for log. */ +#define A(i) __v_log_data.poly[i] +#define Ln2 (0x1.62e42fefa39efp-1) +#define N (1 << V_LOG_TABLE_BITS) +#define OFF (0x3fe6900900000000) + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (asinh, x, y, special); +} + +static inline svfloat64_t +__sv_log_inline (svfloat64_t x, const svbool_t pg) +{ + /* Double-precision SVE log, copied from pl/math/sv_log_2u5.c with some cosmetic modification and special-cases removed. See that file for details of the algorithm used. */ + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t tmp = svsub_x (pg, ix, OFF); + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); + svfloat64_t kd = svcvt_f64_x (pg, k); + svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, Ln2); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = svmla_x (pg, sv_f64 (A (2)), r, A (3)); + svfloat64_t p = svmla_x (pg, sv_f64 (A (0)), r, A (1)); + y = svmla_x (pg, y, r2, A (4)); + y = svmla_x (pg, p, r2, y); + y = svmla_x (pg, hi, r2, y); + return y; +} + +/* Double-precision implementation of SVE asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error is 2.51 ULP, in + |x| >= 1: + _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1 + want 0x1.e3181c43b0f39p-1. */ +svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) +{ + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iax = svbic_x (pg, ix, SignMask); + svuint64_t sign = svand_x (pg, ix, SignMask); + svfloat64_t ax = svreinterpret_f64 (iax); + svuint64_t top12 = svlsr_x (pg, iax, 52); + + svbool_t ge1 = svcmpge (pg, top12, OneTop); + svbool_t special = svcmpge (pg, top12, HugeBound); + + /* Option 1: |x| >= 1. + Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). */ + svfloat64_t option_1 = sv_f64 (0); + if (likely (svptest_any (pg, ge1))) + { + svfloat64_t axax = svmul_x (pg, ax, ax); + option_1 = __sv_log_inline ( + svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, axax, 1))), pg); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + The largest observed error in this region is 1.51 ULPs: + _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 + want 0x1.c1e649ee2681dp-1. */ + svfloat64_t option_2 = sv_f64 (0); + if (likely (svptest_any (pg, svnot_z (pg, ge1)))) + { + svfloat64_t x2 = svmul_x (pg, ax, ax); + svfloat64_t z2 = svmul_x (pg, x2, x2); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p + = sv_estrin_17_f64_x (pg, x2, z2, z4, z8, z16, __asinh_data.poly); + option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax)); + } + + /* Choose the right option for each lane. */ + svfloat64_t y = svsel (ge1, option_1, option_2); + + /* Apply sign of x to y. */ + y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + return y; +} + +PL_SIG (SV, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (asinh), 2.52) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the svsel is choosing the right option in all cases. */ +#define SV_ASINH_INTERVAL(lo, hi, n) \ PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0.5) \ PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 2) \ PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0x1p600) +SV_ASINH_INTERVAL (0, 0x1p-26, 50000) +SV_ASINH_INTERVAL (0x1p-26, 1, 50000) +SV_ASINH_INTERVAL (1, 0x1p511, 50000) +SV_ASINH_INTERVAL (0x1p511, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_asinhf_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_asinhf_2u5.c new file mode 100644 index 000000000000..1f1f6e5c846f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_asinhf_2u5.c @@ -0,0 +1,55 @@ +/* + * Single-precision SVE asinh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#include "sv_log1pf_inline.h" + +#define BigBound (0x5f800000) /* asuint(0x1p64). */ + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (asinhf, x, y, special); +} + +/* Single-precision SVE asinh(x) routine. Implements the same algorithm as + vector asinhf and log1p. + + Maximum error is 2.48 ULPs: + SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4 + want 0x1.ffbbb8p-4.
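A scalar sketch of the two-option split used by the double-precision routine above; the small-|x| branch is stood in for by the leading Taylor terms rather than the shared 17-term polynomial, so it is illustrative only and far less accurate:

#include <math.h>
#include <stdio.h>

static double
asinh_sketch (double x)
{
  double ax = fabs (x);
  /* Option 1: the log identity; option 2: leading Taylor terms as a
     stand-in for the real polynomial (assumption for illustration).  */
  double y = ax >= 1 ? log (ax + sqrt (ax * ax + 1))
		     : ax - ax * ax * ax / 6;
  return copysign (y, x);
}

int
main (void)
{
  printf ("%a %a\n", asinh_sketch (2.0), asinh (2.0));
  return 0;
}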
*/ +svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) +{ + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svbool_t special = svcmpge (pg, iax, BigBound); + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + svfloat32_t ax2 = svmul_x (pg, ax, ax); + svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f); + svfloat32_t y + = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))), + special); + return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); +} + +PL_SIG (SV, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (asinh), 1.98) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c index a4bea1dcba09..00530a324a76 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c @@ -1,93 +1,116 @@ /* * Double-precision vector atan2(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#include "sv_atan_common.h" +static const struct data +{ + float64_t poly[20]; + float64_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; /* Useful constants. */ -#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) #define SignMask sv_u64 (0x8000000000000000) /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ -__attribute__ ((noinline)) static sv_f64_t -specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, + const svbool_t cmp) { return sv_call2_f64 (atan2, y, x, ret, cmp); } -/* Returns a predicate indicating true if the input is the bit representation of - 0, infinity or nan. */ +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ static inline svbool_t -zeroinfnan (sv_u64_t i, const svbool_t pg) +zeroinfnan (svuint64_t i, const svbool_t pg) { - return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), - sv_u64 (2 * asuint64 (INFINITY) - 1)); + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); } /* Fast implementation of SVE atan2. Errors are greatest when y and x are reasonably close together. 
The greatest observed error is 2.28 ULP: - sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ -sv_f64_t -__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t iy = sv_as_u64_f64 (y); + const struct data *data_ptr = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); - svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); - sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); - sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); + svuint64_t sign_x = svand_x (pg, ix, SignMask); + svuint64_t sign_y = svand_x (pg, iy, SignMask); + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); - sv_f64_t ax = svabs_f64_x (pg, x); - sv_f64_t ay = svabs_f64_x (pg, y); + svfloat64_t ax = svabs_x (pg, x); + svfloat64_t ay = svabs_x (pg, y); - svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); - svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); + svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ - sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); - sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); - sv_f64_t z = svdiv_f64_x (pg, n, d); + svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t d = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, n, d); /* Work out the correct shift. */ - sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); - shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); - shift = svmul_f64_x (pg, shift, PiOver2); + svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); + shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + shift = svmul_x (pg, shift, data_ptr->pi_over_2); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t x2 = svmul_x (pg, z2, z2); + svfloat64_t x4 = svmul_x (pg, x2, x2); + svfloat64_t x8 = svmul_x (pg, x4, x4); - sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); + svfloat64_t ret = svmla_x ( + pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t z3 = svmul_x (pg, z2, z); + ret = svmla_x (pg, z, z3, ret); + + ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. */ - ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); + ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)); if (unlikely (svptest_any (pg, cmp_xy))) - { - return specialcase (y, x, ret, cmp_xy); - } + return special_case (y, x, ret, cmp_xy); return ret; } -PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2) - /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
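The shift and sign bookkeeping above can be replayed in scalar code: reduce to a ratio z with |z| <= 1, add a multiple of pi/2, then fold in sign(x) XOR sign(y). This sketch leans on libm atan and skips the zero/inf/nan lanes that the routine sends to the scalar fallback (illustrative only):

#define _GNU_SOURCE /* M_PI.  */
#include <math.h>
#include <stdio.h>

static double
atan2_sketch (double y, double x)
{
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  double z = aygtax ? -ax / ay : ay / ax;
  double shift = ((x < 0 ? -2.0 : 0.0) + (aygtax ? 1.0 : 0.0)) * M_PI / 2;
  double ret = shift + atan (z);
  /* sign_xy: flip the result if x and y have opposite signs.  */
  return !!signbit (x) != !!signbit (y) ? -ret : ret;
}

int
main (void)
{
  printf ("%a %a\n", atan2_sketch (-1.0, -2.0), atan2 (-1.0, -2.0));
  return 0;
}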
*/ PL_SIG (SV, D, 2, atan2) -PL_TEST_ULP (__sv_atan2, 1.78) -PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_D2 (atan2), 1.78) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c index f7674c441f2f..9ff73ecb74ba 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c @@ -1,94 +1,108 @@ /* * Single-precision vector atan2f(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f32.h" -#if SV_SUPPORTED - -#include "sv_atanf_common.h" +static const struct data +{ + float32_t poly[8]; + float32_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, + .pi_over_2 = 0x1.921fb6p+0f, +}; -/* Useful constants. */ -#define PiOver2 sv_f32 (0x1.921fb6p+0f) #define SignMask sv_u32 (0x80000000) /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ -static inline sv_f32_t -specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp) +static inline svfloat32_t +special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret, + const svbool_t cmp) { return sv_call2_f32 (atan2f, y, x, ret, cmp); } -/* Returns a predicate indicating true if the input is the bit representation of - 0, infinity or nan. */ +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ static inline svbool_t -zeroinfnan (sv_u32_t i, const svbool_t pg) +zeroinfnan (svuint32_t i, const svbool_t pg) { - return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1), - sv_u32 (2 * 0x7f800000lu - 1)); + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u32 (2 * 0x7f800000lu - 1)); } -/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) - with reduction to [0,1] using z=1/x and shift = pi/2. - Maximum observed error is 2.95 ULP: - __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -sv_f32_t -__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * + P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum + observed error is 2.95 ULP: + _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. 
*/ +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) { - sv_u32_t ix = sv_as_u32_f32 (x); - sv_u32_t iy = sv_as_u32_f32 (y); + const struct data *data_ptr = ptr_barrier (&data); + + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); - svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask); - sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask); - sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y); + svuint32_t sign_x = svand_x (pg, ix, SignMask); + svuint32_t sign_y = svand_x (pg, iy, SignMask); + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y); - sv_f32_t ax = svabs_f32_x (pg, x); - sv_f32_t ay = svabs_f32_x (pg, y); + svfloat32_t ax = svabs_x (pg, x); + svfloat32_t ay = svabs_x (pg, y); - svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0)); - svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax); + svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ - sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay); - sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax); - sv_f32_t z = svdiv_f32_x (pg, n, d); + svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t d = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (pg, n, d); /* Work out the correct shift. */ - sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); - shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift); - shift = svmul_f32_x (pg, shift, PiOver2); + svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); + shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); + + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); - sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift); + svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); + + /* ret = shift + z + z^3 * P(z^2). */ + svfloat32_t z3 = svmul_x (pg, z2, z); + ret = svmla_x (pg, z, z3, ret); + + ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. */ - ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy)); + ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)); if (unlikely (svptest_any (pg, cmp_xy))) - { - return specialcase (y, x, ret, cmp_xy); - } + return special_case (y, x, ret, cmp_xy); return ret; } -PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f) - /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
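The zeroinfnan predicate shared by both atan2 variants is a branchless filter: shifting out the sign bit and subtracting 1 makes +/-0 wrap around to the largest unsigned value, so a single unsigned compare catches 0, infinity and nan together. A scalar rendering of the same trick:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* True iff x is +/-0, +/-inf or nan: 2*i - 1 wraps +/-0 to UINT32_MAX
   and everything from inf upwards lands at or above
   2 * 0x7f800000 - 1.  */
static int
zeroinfnan (float x)
{
  uint32_t i;
  memcpy (&i, &x, sizeof i); /* asuint (x).  */
  return 2 * i - 1 >= 2u * 0x7f800000 - 1;
}

int
main (void)
{
  printf ("%d %d %d %d\n", zeroinfnan (0.0f), zeroinfnan (-0.0f),
	  zeroinfnan (1.0f), zeroinfnan (INFINITY));
  return 0;
}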
*/ PL_SIG (SV, F, 2, atan2) -PL_TEST_ULP (__sv_atan2f, 2.45) -PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_F2 (atan2), 2.45) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c index 02ac331970c9..7ab486a4c9d2 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c @@ -1,62 +1,87 @@ /* * Double-precision vector atan(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#include "sv_atan_common.h" +static const struct data +{ + float64_t poly[20]; + float64_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; /* Useful constants. */ -#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) -#define AbsMask (0x7fffffffffffffff) +#define SignMask (0x8000000000000000) /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed error is 2.27 ulps: - __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ -sv_f64_t -__sv_atan_x (sv_f64_t x, const svbool_t pg) + _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t sign = svand_x (pg, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt_n_f64 (pg, x, 1.0); + svbool_t red = svacgt (pg, x, 1.0); /* Avoid dependency in abs(x) in division (and comparison). */ - sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x); + svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); /* Use absolute value only when needed (odd powers of z). */ - sv_f64_t az = svabs_f64_x (pg, z); - az = svneg_f64_m (az, red, az); + svfloat64_t az = svabs_x (pg, z); + az = svneg_m (az, red, az); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
*/ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t x2 = svmul_x (pg, z2, z2); + svfloat64_t x4 = svmul_x (pg, x2, x2); + svfloat64_t x8 = svmul_x (pg, x4, x4); - sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2); + svfloat64_t y + = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t z3 = svmul_x (pg, z2, az); + y = svmla_x (pg, az, z3, y); + + /* Apply shift as indicated by `red` predicate. */ + y = svadd_m (red, y, d->pi_over_2); /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); return y; } -PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan) - PL_SIG (SV, D, 1, atan, -3.1, 3.1) -PL_TEST_ULP (__sv_atan, 1.78) -PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_D1 (atan), 1.78) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_common.h b/contrib/arm-optimized-routines/pl/math/sv_atan_common.h deleted file mode 100644 index bfe6998d2416..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_atan_common.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Double-precision polynomial evaluation function for SVE atan(x) and - * atan2(y,x). - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" -#include "sv_math.h" - -#define P(i) sv_f64 (__atan_poly_data.poly[i]) - -/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations - The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline sv_f64_t -__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az, - sv_f64_t shift) -{ - /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ - sv_f64_t z2 = svmul_f64_x (pg, z, z); - - /* Level 1. */ - sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0)); - sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2)); - sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4)); - sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6)); - sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8)); - sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10)); - sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12)); - sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14)); - sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16)); - sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18)); - - /* Level 2. */ - sv_f64_t x2 = svmul_f64_x (pg, z2, z2); - sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0); - sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4); - sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8); - sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12); - sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16); - - /* Level 3. */ - sv_f64_t x4 = svmul_f64_x (pg, x2, x2); - sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0); - sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8); - - /* Level 4. 
*/ - sv_f64_t x8 = svmul_f64_x (pg, x4, x4); - sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8); - y = sv_fma_f64_x (pg, y, x8, P_7_0); - - /* Finalize. y = shift + z + z^3 * P(z^2). */ - sv_f64_t z3 = svmul_f64_x (pg, z2, az); - y = sv_fma_f64_x (pg, y, z3, az); - - /* Apply shift as indicated by `red` predicate. */ - y = svadd_f64_m (red, y, shift); - - return y; -} diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c index 8d38e42b2290..4defb356e7f9 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c +++ b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c @@ -1,59 +1,76 @@ /* * Single-precision vector atan(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f32.h" -#if SV_SUPPORTED - -#include "sv_atanf_common.h" +static const struct data +{ + float32_t poly[8]; + float32_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, + .pi_over_2 = 0x1.921fb6p+0f, +}; -#define PiOver2 sv_f32 (0x1.921fb6p+0f) -#define AbsMask (0x7fffffff) +#define SignMask (0x80000000) /* Fast implementation of SVE atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Largest observed error is 2.9 ULP, close to +/-1.0: - __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1 - want -0x1.967fp-1. */ -sv_f32_t -__sv_atanf_x (sv_f32_t x, const svbool_t pg) + _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 + want -0x1.967fp-1. */ +svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ - sv_u32_t ix = sv_as_u32_f32 (x); - sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t sign = svand_x (pg, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt_n_f32 (pg, x, 1.0f); + svbool_t red = svacgt (pg, x, 1.0f); /* Avoid dependency in abs(x) in division (and comparison). */ - sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x); + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); /* Use absolute value only when needed (odd powers of z). */ - sv_f32_t az = svabs_f32_x (pg, z); - az = svneg_f32_m (az, red, az); + svfloat32_t az = svabs_x (pg, z); + az = svneg_m (az, red, az); + + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); - sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2); + svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat32_t z3 = svmul_x (pg, z2, az); + y = svmla_x (pg, az, z3, y); + + /* Apply shift as indicated by 'red' predicate. */ + y = svadd_m (red, y, sv_f32 (d->pi_over_2)); /* y = atan(x) if x>0, -atan(-x) otherwise. 
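The reduction used by both atan routines has a compact scalar analogue: for |x| > 1 evaluate the kernel on 1/|x| and subtract from pi/2, so the polynomial only ever sees arguments in [0, 1]. A sketch with libm atan standing in for the kernel:

#define _GNU_SOURCE /* M_PI.  */
#include <math.h>
#include <stdio.h>

static double
atan_sketch (double x)
{
  double ax = fabs (x);
  /* atan(x) = sign(x) * (pi/2 - atan(1/|x|)) for |x| > 1.  */
  double y = ax > 1 ? M_PI / 2 - atan (1 / ax) : atan (ax);
  return copysign (y, x);
}

int
main (void)
{
  printf ("%a %a\n", atan_sketch (-3.0), atan (-3.0));
  return 0;
}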
*/ - return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf) - PL_SIG (SV, F, 1, atan, -3.1, 3.1) -PL_TEST_ULP (__sv_atanf, 2.9) -PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (SV_NAME_F1 (atan), 2.9) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h b/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h deleted file mode 100644 index dc45effec1cd..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Single-precision polynomial evaluation function for SVE atan(x) and - * atan2(y,x). - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_SV_ATANF_COMMON_H -#define PL_MATH_SV_ATANF_COMMON_H - -#include "math_config.h" -#include "sv_math.h" - -#define P(i) sv_f32 (__atanf_poly_data.poly[i]) - -/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations - The order 7 polynomial P approximates (f(sqrt(x))-sqrt(x))/x^(3/2). */ -static inline sv_f32_t -__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az, - sv_f32_t shift) -{ - /* Use full Estrin scheme for P(z^2) with deg(P)=7. */ - - /* First compute square powers of z. */ - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t z4 = svmul_f32_x (pg, z2, z2); - sv_f32_t z8 = svmul_f32_x (pg, z4, z4); - - /* Then assemble polynomial. */ - sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))), - (sv_fma_f32_x (pg, z2, P (5), P (4)))); - sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))), - (sv_fma_f32_x (pg, z2, P (1), P (0)))); - sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3); - - /* Finalize. y = shift + z + z^3 * P(z^2). */ - sv_f32_t z3 = svmul_f32_x (pg, z2, az); - y = sv_fma_f32_x (pg, y, z3, az); - - /* Apply shift as indicated by 'red' predicate. */ - y = svadd_f32_m (red, y, shift); - - return y; -} - -#endif // PL_MATH_SV_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanh_3u3.c b/contrib/arm-optimized-routines/pl/math/sv_atanh_3u3.c new file mode 100644 index 000000000000..dcc9350b4962 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atanh_3u3.c @@ -0,0 +1,60 @@ +/* + * Double-precision SVE atanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 0 +#include "sv_log1p_inline.h" + +#define One (0x3ff0000000000000) +#define Half (0x3fe0000000000000) + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (atanh, x, y, special); +} + +/* SVE approximation for double-precision atanh, based on log1p. + The greatest observed error is 2.81 ULP: + _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. 
 */
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+  svfloat64_t ax = svabs_x (pg, x);
+  svuint64_t iax = svreinterpret_u64 (ax);
+  svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+  svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+  /* It is special if iax >= 1.  */
+  svbool_t special = svacge (pg, x, 1.0);
+
+  /* Computation is performed based on the identity
+     (1+x)/(1-x) = 1 + 2x/(1-x).  */
+  svfloat64_t y;
+  y = svadd_x (pg, ax, ax);
+  y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+  /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y).  */
+  y = sv_log1p_inline (y, pg);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, halfsign, y), special);
+  return svmul_x (pg, halfsign, y);
+}
+
+PL_SIG (SV, D, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_D1 (atanh), 3.32)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+   -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+   control lane is irrelevant if fp exceptions are disabled).  */
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 1, inf, 100, 0)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanhf_2u8.c b/contrib/arm-optimized-routines/pl/math/sv_atanhf_2u8.c
new file mode 100644
index 000000000000..413c60ce05da
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_atanhf_2u8.c
@@ -0,0 +1,56 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_log1pf_inline.h"
+
+#define One (0x3f800000)
+#define Half (0x3f000000)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+   The maximum error is 2.28 ULP:
+   _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
+				  want 0x1.ffbbb6p-5.  */
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
+{
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
+  svbool_t special = svcmpge (pg, iax, One);
+
+  /* Computation is performed based on the identity
+     (1+x)/(1-x) = 1 + 2x/(1-x).  */
+  svfloat32_t y = svadd_x (pg, ax, ax);
+  y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
+  /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y).  */
+  y = sv_log1pf_inline (y, pg);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, halfsign, y), special);
+
+  return svmul_x (pg, halfsign, y);
+}
+
+PL_SIG (SV, F, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_F1 (atanh), 2.59)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+   -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+   control lane is irrelevant if fp exceptions are disabled).
*/ +PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000, 0) +PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000, 0) +PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 1, inf, 1000, 0) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/sv_cbrt_2u.c new file mode 100644 index 000000000000..192f1cd80d59 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cbrt_2u.c @@ -0,0 +1,122 @@ +/* + * Double-precision SVE cbrt(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f64.h" + +const static struct data +{ + float64_t poly[4]; + float64_t table[5]; + float64_t one_third, two_thirds, shift; + int64_t exp_bias; + uint64_t tiny_bound, thresh; +} data = { + /* Generated with FPMinimax in [0.5, 1]. */ + .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, + 0x1.2c74eaa3ba428p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, }, + .one_third = 0x1.5555555555555p-2, + .two_thirds = 0x1.5555555555555p-1, + .shift = 0x1.8p52, + .exp_bias = 1022, + .tiny_bound = 0x0010000000000000, /* Smallest normal. */ + .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */ +}; + +#define MantissaMask 0x000fffffffffffff +#define HalfExp 0x3fe0000000000000 + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cbrt, x, y, special); +} + +static inline svfloat64_t +shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. Greatest observed error is 1.79 ULP. + Errors repeat according to the exponent, for instance an error observed for + double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i + is an integer. + _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342 + want 0x1.965f53b0e5d95p-342. */ +svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat64_t m = svreinterpret_f64 (svorr_x ( + pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp)); + svint64_t e + = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ + svfloat64_t p + = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* Two iterations of Newton's method for iteratively approximating cbrt. 
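   Each of these iterations is a standard Newton step for the root of
   f(a) = a^3 - m, rearranged into one divide plus one FMA:
   a' = a - f(a)/f'(a) = (2/3) a + (m/3) / a^2. A scalar sketch (the helper
   name is hypothetical):

     static double newton_cbrt_step (double a, double m)
     {
       // In the vector code: svmla_x (pg, svdiv_x (pg, m/3, a*a), a, 2/3).
       return (2.0 / 3.0) * a + (m / 3.0) / (a * a);
     }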
*/ + svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third); + svint64_t ey = svcvt_s64_x (pg, eb3f); + svint64_t em3 = svmls_x (pg, e, ey, 3); + + svfloat64_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexp. */ + svfloat64_t y = svscale_x (pg, my, ey); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)), + special); + + /* Copy sign. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} + +PL_SIG (SV, D, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (cbrt), 1.30) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cbrtf_1u7.c b/contrib/arm-optimized-routines/pl/math/sv_cbrtf_1u7.c new file mode 100644 index 000000000000..5b625f308827 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cbrtf_1u7.c @@ -0,0 +1,116 @@ +/* + * Single-precision SVE cbrt(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f32.h" + +const static struct data +{ + float32_t poly[4]; + float32_t table[5]; + float32_t one_third, two_thirds; +} data = { + /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax. + */ + .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, + 0x1.2c74c2p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = 0x1.555556p-2f, + .two_thirds = 0x1.555556p-1f, +}; + +#define SmallestNormal 0x00800000 +#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal. */ +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (cbrtf, x, y, special); +} + +static inline svfloat32_t +shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. 
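   The same masked-reinterpret decomposition of |x| into m * 2^e recurs in
   the single-precision routine below; in scalar form (positive normal
   input assumed; the helper name frexpf_bits is hypothetical):

     #include <stdint.h>
     #include <string.h>
     static float frexpf_bits (float x, int32_t *e)
     {
       uint32_t ix;
       memcpy (&ix, &x, sizeof ix);
       *e = (int32_t) (ix >> 23) - 126;     // matches the bias used below
       ix = (ix & 0x007fffff) | 0x3f000000; // MantissaMask | HalfExp
       memcpy (&x, &ix, sizeof x);
       return x;                            // m in [0.5, 1), x = m * 2^e
     }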
*/ +svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat32_t m = svreinterpret_f32 (svorr_x ( + pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp)); + svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + svfloat32_t p + = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third); + svint32_t ey = svcvt_s32_x (pg, ef); + svint32_t em3 = svmls_x (pg, e, ey, 3); + + svfloat32_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexpf. */ + svfloat32_t y = svscale_x (pg, my, ey); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)), + special); + + /* Copy sign. */ + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} + +PL_SIG (SV, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (cbrt), 1.15) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cexpi_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_cexpi_3u5.c new file mode 100644 index 000000000000..920acfea5da0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cexpi_3u5.c @@ -0,0 +1,45 @@ +/* + * Double-precision vector cexpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_sincos_common.h" +#include "sv_math.h" +#include "pl_test.h" + +static svfloat64x2_t NOINLINE +special_case (svfloat64_t x, svbool_t special, svfloat64x2_t y) +{ + return svcreate2 (sv_call_f64 (sin, x, svget2 (y, 0), special), + sv_call_f64 (cos, x, svget2 (y, 1), special)); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + sv_cexpi_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. 
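   A possible usage sketch for the combined routine (the prototype is
   assumed to come from this patch's headers; loop bounds and array names
   are purely illustrative):

     #include <arm_sve.h>
     svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t);
     void sincos_array (const double *x, double *s, double *c, int n)
     {
       for (int i = 0; i < n; i += (int) svcntd ())
         {
           svbool_t pg = svwhilelt_b64 (i, n);
           svfloat64x2_t sc = _ZGVsMxv_cexpi (svld1 (pg, x + i), pg);
           svst1 (pg, s + i, svget2 (sc, 0)); // sin results
           svst1 (pg, c + i, svget2 (sc, 1)); // cos results
         }
     }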
*/ +svfloat64x2_t +_ZGVsMxv_cexpi (svfloat64_t x, svbool_t pg) +{ + const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat64x2_t sc = sv_sincos_inline (pg, x, d); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73) +PL_TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73) +#define SV_CEXPI_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n) +SV_CEXPI_INTERVAL (0, 0x1p23, 500000) +SV_CEXPI_INTERVAL (-0, -0x1p23, 500000) +SV_CEXPI_INTERVAL (0x1p23, inf, 10000) +SV_CEXPI_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cexpif_1u8.c b/contrib/arm-optimized-routines/pl/math/sv_cexpif_1u8.c new file mode 100644 index 000000000000..93f2f998cb38 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cexpif_1u8.c @@ -0,0 +1,47 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_sincosf_common.h" +#include "sv_math.h" +#include "pl_test.h" + +static svfloat32x2_t NOINLINE +special_case (svfloat32_t x, svbool_t special, svfloat32x2_t y) +{ + return svcreate2 (sv_call_f32 (sinf, x, svget2 (y, 0), special), + sv_call_f32 (cosf, x, svget2 (y, 1), special)); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +svfloat32x2_t +_ZGVsMxv_cexpif (svfloat32_t x, svbool_t pg) +{ + const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat32x2_t sc = sv_sincosf_inline (pg, x, d); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17) +PL_TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31) +#define SV_CEXPIF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n) +SV_CEXPIF_INTERVAL (0, 0x1p20, 500000) +SV_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +SV_CEXPIF_INTERVAL (0x1p20, inf, 10000) +SV_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c index 194034802452..76af3459b3f2 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c @@ -1,84 +1,86 @@ /* * Double-precision SVE cos(x) function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +static const struct data +{ + double inv_pio2, pio2_1, pio2_2, pio2_3, shift; +} data = { + /* Polynomial coefficients are hardwired in FTMAD instructions. */ + .inv_pio2 = 0x1.45f306dc9c882p-1, + .pio2_1 = 0x1.921fb50000000p+0, + .pio2_2 = 0x1.110b460000000p-26, + .pio2_3 = 0x1.1a62633145c07p-54, + /* Original shift used in AdvSIMD cos, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. 
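   The shift constant implements the usual round-to-nearest-integer trick:
   for |t| < 2^51, (t + 1.5 * 2^52) - 1.5 * 2^52 rounds t to the nearest
   integer, and the rounded value is also readable from the low mantissa
   bits of the intermediate sum, which is what FTSSEL/FTMAD consume. A
   scalar sketch with the plain shift, without the extra ulp (assumes the
   default round-to-nearest mode; helper name hypothetical):

     #include <stdint.h>
     #include <string.h>
     static double round_and_parity (double t, uint64_t *bit0)
     {
       double q = t + 0x1.8p52; // low mantissa bits of q now hold round(t)
       uint64_t iq;
       memcpy (&iq, &q, sizeof iq);
       *bit0 = iq & 1;          // parity of round(t)
       return q - 0x1.8p52;     // round(t) as a double
     }

   The extra ulp in 0x1.8000000000001p52 simply biases bit #0 of q's
   representation, as the comment above says the trigonometric
   instructions expect.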
*/ + .shift = 0x1.8000000000001p52 +}; -#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) -#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) -#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) -#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) -/* Original shift used in Neon cos, - plus a contribution to set the bit #0 of q - as expected by trigonometric instructions. */ -#define Shift (sv_f64 (0x1.8000000000001p52)) -#define RangeVal (sv_f64 (0x1p23)) -#define AbsMask (0x7fffffffffffffff) +#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ -static NOINLINE sv_f64_t -__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t oob) { - return sv_call_f64 (cos, x, y, cmp); + return sv_call_f64 (cos, x, y, oob); } /* A fast SVE implementation of cos based on trigonometric instructions (FTMAD, FTSSEL, FTSMUL). Maximum measured error: 2.108 ULPs. - __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 - want -0x1.fddd4c65c7f05p-3. */ -sv_f64_t -__sv_cos_x (sv_f64_t x, const svbool_t pg) + SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 + want -0x1.fddd4c65c7f05p-3. */ +svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) { - sv_f64_t n, r, r2, y; - svbool_t cmp; + const struct data *d = ptr_barrier (&data); + + svfloat64_t r = svabs_x (pg, x); + svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal); - r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); - cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + /* Load some constants in quad-word chunks to minimise memory access. */ + svbool_t ptrue = svptrue_b64 (); + svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2); + svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2); /* n = rint(|x|/(pi/2)). */ - sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); - n = svsub_f64_x (pg, q, Shift); + svfloat64_t q = svmla_lane (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0); + svfloat64_t n = svsub_x (pg, q, d->shift); /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ - r = sv_fma_f64_x (pg, NegPio2_1, n, r); - r = sv_fma_f64_x (pg, NegPio2_2, n, r); - r = sv_fma_f64_x (pg, NegPio2_3, n, r); + r = svmls_lane (r, n, invpio2_and_pio2_1, 1); + r = svmls_lane (r, n, pio2_23, 0); + r = svmls_lane (r, n, pio2_23, 1); /* cos(r) poly approx. */ - r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); - y = sv_f64 (0.0); - y = svtmad_f64 (y, r2, 7); - y = svtmad_f64 (y, r2, 6); - y = svtmad_f64 (y, r2, 5); - y = svtmad_f64 (y, r2, 4); - y = svtmad_f64 (y, r2, 3); - y = svtmad_f64 (y, r2, 2); - y = svtmad_f64 (y, r2, 1); - y = svtmad_f64 (y, r2, 0); + svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q)); + svfloat64_t y = sv_f64 (0.0); + y = svtmad (y, r2, 7); + y = svtmad (y, r2, 6); + y = svtmad (y, r2, 5); + y = svtmad (y, r2, 4); + y = svtmad (y, r2, 3); + y = svtmad (y, r2, 2); + y = svtmad (y, r2, 1); + y = svtmad (y, r2, 0); /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); - /* Apply factor. */ - y = svmul_f64_x (pg, f, y); + svfloat64_t f = svtssel (r, svreinterpret_u64 (q)); - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. 
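   The three-stage subtraction in the new code is a Cody-Waite style
   reduction: pi/2 is split into pio2_1 + pio2_2 + pio2_3, with the leading
   part truncated so that n * pio2_1 is exact for moderate n, keeping
   r = |x| - n * pi/2 accurate across the whole reduction range. A scalar
   sketch using the same constants as the data struct above (helper name
   hypothetical):

     #include <math.h>
     static double reduce_pio2 (double ax, double n)
     {
       double r = ax;
       r = fma (-n, 0x1.921fb50000000p+0, r);  // pio2_1, truncated head
       r = fma (-n, 0x1.110b460000000p-26, r); // pio2_2, truncated middle
       r = fma (-n, 0x1.1a62633145c07p-54, r); // pio2_3, tail
       return r;
     }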
*/ - if (unlikely (svptest_any (pg, cmp))) - return __sv_cos_specialcase (x, y, cmp); - return y; -} + if (unlikely (svptest_any (pg, oob))) + return special_case (x, svmul_x (svnot_z (pg, oob), y, f), oob); -PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos) + /* Apply factor. */ + return svmul_x (pg, f, y); +} PL_SIG (SV, D, 1, cos, -3.1, 3.1) -PL_TEST_ULP (__sv_cos, 1.61) -PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (SV_NAME_D1 (cos), 1.61) +PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c index 8f138bcba7af..4bdb0dd146bb 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c +++ b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c @@ -1,82 +1,80 @@ /* * Single-precision SVE cos(x) function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +static const struct data +{ + float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift; +} data = { + /* Polynomial coefficients are hard-wired in FTMAD instructions. */ + .neg_pio2_1 = -0x1.921fb6p+0f, + .neg_pio2_2 = 0x1.777a5cp-25f, + .neg_pio2_3 = 0x1.ee59dap-50f, + .inv_pio2 = 0x1.45f306p-1f, + /* Original shift used in AdvSIMD cosf, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ + .shift = 0x1.800002p+23f +}; -#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) -#define RangeVal (sv_f32 (0x1p20f)) -#define InvPio2 (sv_f32 (0x1.45f306p-1f)) -/* Original shift used in Neon cosf, - plus a contribution to set the bit #0 of q - as expected by trigonometric instructions. */ -#define Shift (sv_f32 (0x1.800002p+23f)) -#define AbsMask (0x7fffffff) +#define RangeVal 0x49800000 /* asuint32(0x1p20f). */ -static NOINLINE sv_f32_t -__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t oob) { - return sv_call_f32 (cosf, x, y, cmp); + return sv_call_f32 (cosf, x, y, oob); } /* A fast SVE implementation of cosf based on trigonometric instructions (FTMAD, FTSSEL, FTSMUL). Maximum measured error: 2.06 ULPs. - __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6 - want 0x1.fffe76p-6. */ -sv_f32_t -__sv_cosf_x (sv_f32_t x, const svbool_t pg) + SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6 + want 0x1.fffe76p-6. */ +svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg) { - sv_f32_t n, r, r2, y; - svbool_t cmp; + const struct data *d = ptr_barrier (&data); + + svfloat32_t r = svabs_x (pg, x); + svbool_t oob = svcmpge (pg, svreinterpret_u32 (r), RangeVal); - r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); - cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + /* Load some constants in quad-word chunks to minimise memory access. */ + svfloat32_t negpio2_and_invpio2 = svld1rq (svptrue_b32 (), &d->neg_pio2_1); /* n = rint(|x|/(pi/2)). */ - sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift); - n = svsub_f32_x (pg, q, Shift); + svfloat32_t q = svmla_lane (sv_f32 (d->shift), r, negpio2_and_invpio2, 3); + svfloat32_t n = svsub_x (pg, q, d->shift); /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). 
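   Loading the constants with svld1rq and then using lane-indexed
   multiply-accumulate is how both cos routines keep the reduction
   constants in one quad-word register instead of four separate
   broadcasts. A minimal sketch of the pattern (the constant array and
   helper name are illustrative):

     #include <arm_sve.h>
     static svfloat32_t fold_constants (svfloat32_t r, svfloat32_t n,
                                        const float consts[4])
     {
       // One load broadcasts 4 adjacent floats to every 128-bit chunk.
       svfloat32_t c = svld1rq (svptrue_b32 (), consts);
       r = svmla_lane (r, n, c, 0); // r += n * consts[0]
       r = svmla_lane (r, n, c, 1); // r += n * consts[1]
       r = svmla_lane (r, n, c, 2); // r += n * consts[2]
       return r;
     }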
*/ - r = sv_fma_f32_x (pg, NegPio2_1, n, r); - r = sv_fma_f32_x (pg, NegPio2_2, n, r); - r = sv_fma_f32_x (pg, NegPio2_3, n, r); + r = svmla_lane (r, n, negpio2_and_invpio2, 0); + r = svmla_lane (r, n, negpio2_and_invpio2, 1); + r = svmla_lane (r, n, negpio2_and_invpio2, 2); /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q)); + svfloat32_t f = svtssel (r, svreinterpret_u32 (q)); /* cos(r) poly approx. */ - r2 = svtsmul_f32 (r, sv_as_u32_f32 (q)); - y = sv_f32 (0.0f); - y = svtmad_f32 (y, r2, 4); - y = svtmad_f32 (y, r2, 3); - y = svtmad_f32 (y, r2, 2); - y = svtmad_f32 (y, r2, 1); - y = svtmad_f32 (y, r2, 0); + svfloat32_t r2 = svtsmul (r, svreinterpret_u32 (q)); + svfloat32_t y = sv_f32 (0.0f); + y = svtmad (y, r2, 4); + y = svtmad (y, r2, 3); + y = svtmad (y, r2, 2); + y = svtmad (y, r2, 1); + y = svtmad (y, r2, 0); + if (unlikely (svptest_any (pg, oob))) + return special_case (x, svmul_x (svnot_z (pg, oob), f, y), oob); /* Apply factor. */ - y = svmul_f32_x (pg, f, y); - - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. */ - if (unlikely (svptest_any (pg, cmp))) - return __sv_cosf_specialcase (x, y, cmp); - return y; + return svmul_x (pg, f, y); } -PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf) - PL_SIG (SV, F, 1, cos, -3.1, 3.1) -PL_TEST_ULP (__sv_cosf, 1.57) -PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (SV_NAME_F1 (cos), 1.57) +PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/sv_cosh_2u.c new file mode 100644 index 000000000000..a6d743fb9b96 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cosh_2u.c @@ -0,0 +1,100 @@ +/* + * Double-precision SVE cosh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[3]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; + uint64_t index_mask, special_bound; +} data = { + .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, + 0x1.5555576a59599p-5, }, + + .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ + /* -ln2/N. */ + .ln2_hi = -0x1.62e42fefa39efp-9, + .ln2_lo = -0x1.abc9e3b39803f3p-64, + .shift = 0x1.8p+52, + .thres = 704.0, + + .index_mask = 0xff, + /* 0x1.6p9, above which exp overflows. */ + .special_bound = 0x4086000000000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from sv_exp_tail, with no + special-case handling or tail. */ +static inline svfloat64_t +exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +{ + /* Calculate exp(x). 
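   The decomposition implemented here is the standard table-driven exp:
   with N = 256 (inv_ln2 above is N/ln2), write x = n * (ln2/N) + r, so
   that exp(x) = 2^(n/N) * exp(r) with |r| <= ln2/(2N), and 2^(n/N) splits
   into a table entry 2^(i/N) plus an exponent adjustment. A scalar sketch
   (libm calls stand in for the table and polynomial; M_LN2 from math.h is
   assumed):

     #include <math.h>
     static double exp_sketch (double x)
     {
       const double N = 256.0;
       double n = round (x * N / M_LN2);
       double r = x - n * (M_LN2 / N); // |r| <= ln2 / (2N)
       double scale = exp2 (n / N);    // table + scale in the vector code
       return scale * exp (r);         // exp(r) ~= short polynomial in r
     }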
*/ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); + r = svmla_x (pg, r, n, d->ln2_lo); + + svuint64_t u = svreinterpret_u64 (z); + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + svuint64_t i = svand_x (pg, u, d->index_mask); + + svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); + y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); + y = svmla_x (pg, sv_f64 (1.0), r, y); + y = svmul_x (pg, r, y); + + /* s = 2^(n/N). */ + u = svld1_gather_index (pg, __v_exp_tail_data, i); + svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); + + return svmla_x (pg, s, s, y); +} + +/* Approximation for SVE double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 + want 0x1.fd774e958236fp+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 + want 0x1.f5e2bb8d5c991p+8. */ +svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + svfloat64_t t = exp_inline (ax, pg, d); + svfloat64_t half_t = svmul_x (pg, t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + + /* Fall back to scalar for any special cases. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, svadd_x (pg, half_t, half_over_t), special); + + return svadd_x (pg, half_t, half_over_t); +} + +PL_SIG (SV, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (cosh), 1.43) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_coshf_2u.c b/contrib/arm-optimized-routines/pl/math/sv_coshf_2u.c new file mode 100644 index 000000000000..81680fef318e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_coshf_2u.c @@ -0,0 +1,56 @@ +/* + * Single-precision SVE cosh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#include "sv_expf_inline.h" + +static const struct data +{ + struct sv_expf_data expf_consts; + uint32_t special_bound; +} data = { + .expf_consts = SV_EXPF_DATA, + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = 0x42ad496c, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) +{ + return sv_call_f32 (coshf, x, y, pg); +} + +/* Single-precision vector cosh, using vector expf. + Maximum error is 1.89 ULP: + _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 + want 0x1.f00adcp+127. */ +svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
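   In scalar terms the computation below is (sketch only; the helper name
   is hypothetical):

     #include <math.h>
     static double cosh_sketch (double x)
     {
       double t = exp (fabs (x));
       // exp(-|x|) = 1 / exp(|x|), so one exp call serves both halves;
       // svdivr_x computes the reversed division 0.5 / t in the vector
       // code.
       return 0.5 * t + 0.5 / t;
     }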
*/ + svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); + svfloat32_t half_t = svmul_x (pg, t, 0.5); + svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svadd_x (pg, half_t, half_over_t), special); + + return svadd_x (pg, half_t, half_over_t); +} + +PL_SIG (SV, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (cosh), 1.39) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cospi_3u2.c b/contrib/arm-optimized-routines/pl/math/sv_cospi_3u2.c new file mode 100644 index 000000000000..d80f899c41e4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cospi_3u2.c @@ -0,0 +1,63 @@ +/* + * Double-precision SVE cospi(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f64.h" + +static const struct data +{ + double poly[10]; + double range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p53, +}; + +/* A fast SVE implementation of cospi. + Maximum error 3.20 ULP: + _ZGVsMxv_cospi(0x1.f18ba32c63159p-6) got 0x1.fdabf595f9763p-1 + want 0x1.fdabf595f9766p-1. */ +svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Using cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + svfloat64_t n = svrinta_x (pg, x); + svfloat64_t r = svsub_x (pg, x, n); + r = svsub_x (pg, sv_f64 (0.5), svabs_x (pg, r)); + + /* Result should be negated based on if n is odd or not. + If ax >= 2^53, the result will always be positive. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n)); + svuint64_t sign = svlsl_z (cmp, intn, 63); + + /* y = sin(r). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} + +PL_SIG (SV, D, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (SV_NAME_D1 (cospi), 2.71) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_cospif_2u6.c b/contrib/arm-optimized-routines/pl/math/sv_cospif_2u6.c new file mode 100644 index 000000000000..fb2922d0533a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cospif_2u6.c @@ -0,0 +1,59 @@ +/* + * Single-precision SVE cospi(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+  float poly[6];
+  float range_val;
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+	    0x1.50783p-4f, -0x1.e30750p-8f },
+  .range_val = 0x1p31f,
+};
+
+/* A fast SVE implementation of cospif.
+   Maximum error: 2.60 ULP:
+   _ZGVsMxv_cospif(+/-0x1.cae664p-4) got 0x1.e09c9ep-1
+				     want 0x1.e09c98p-1.  */
+svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Using cospi(x) = sinpi(0.5 - x)
+     range reduction and offset into sinpi range -1/2 .. 1/2
+     r = 0.5 - |x - rint(x)|.  */
+  svfloat32_t n = svrinta_x (pg, x);
+  svfloat32_t r = svsub_x (pg, x, n);
+  r = svsub_x (pg, sv_f32 (0.5f), svabs_x (pg, r));
+
+  /* Result should be negated depending on whether n is odd.
+     If ax >= 2^31, the result will always be positive.  */
+  svbool_t cmp = svaclt (pg, x, d->range_val);
+  svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+  svuint32_t sign = svlsl_z (cmp, intn, 31);
+
+  /* y = sin(r).  */
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_F1 (cospi), 2.08)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_erf_2u5.c
new file mode 100644
index 000000000000..cbf9718e5bb0
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_erf_2u5.c
@@ -0,0 +1,111 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  double third;
+  double tenth, two_over_five, two_over_fifteen;
+  double two_over_nine, two_over_fortyfive;
+  double max, shift;
+} data = {
+  .third = 0x1.5555555555556p-2, /* used to compute 2/3 and 1/6 too.  */
+  .two_over_fifteen = 0x1.1111111111111p-3,
+  .tenth = -0x1.999999999999ap-4,
+  .two_over_five = -0x1.999999999999ap-2,
+  .two_over_nine = -0x1.c71c71c71c71cp-3,
+  .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
+  .max = 5.9921875, /* 6 - 1/128.  */
+  .shift = 0x1p45,
+};
+
+#define SignMask (0x8000000000000000)
+
+/* Double-precision implementation of vector erf(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+   erf(x) ~ erf(r) + scale * d * [
+       + 1
+       - r d
+       + 1/3 (2 r^2 - 1) d^2
+       - 1/6 (r (2 r^2 - 3)) d^3
+       + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+       - 1/90 (4 r^4 - 20 r^2 + 15) d^5
+     ]
+
+   Maximum measured error: 2.29 ULP
+   _ZGVsMxv_erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+				       want -0x1.20dd59132ebafp-8.  */
+svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  /* |x| >= 6.0 - 1/128.
Opposite conditions except none of them catch NaNs so + they can be used in lookup and BSLs to yield the expected results. */ + svbool_t a_ge_max = svacge (pg, x, dat->max); + svbool_t a_lt_max = svaclt (pg, x, dat->max); + + /* Set r to multiple of 1/128 nearest to |x|. */ + svfloat64_t a = svabs_x (pg, x); + svfloat64_t shift = sv_f64 (dat->shift); + svfloat64_t z = svadd_x (pg, a, shift); + svuint64_t i + = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); + + /* Lookup without shortcut for small values but with predicate to avoid + segfault for large values and NaNs. */ + svfloat64_t r = svsub_x (pg, z, shift); + svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); + svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ + svfloat64_t d = svsub_x (pg, a, r); + svfloat64_t d2 = svmul_x (pg, d, d); + svfloat64_t r2 = svmul_x (pg, r, r); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + svfloat64_t p1 = r; + svfloat64_t third = sv_f64 (dat->third); + svfloat64_t twothird = svmul_x (pg, third, 2.0); + svfloat64_t sixth = svmul_x (pg, third, 0.5); + svfloat64_t p2 = svmls_x (pg, third, r2, twothird); + svfloat64_t p3 = svmad_x (pg, r2, third, -0.5); + p3 = svmul_x (pg, r, p3); + svfloat64_t p4 + = svmla_x (pg, sv_f64 (dat->two_over_five), r2, dat->two_over_fifteen); + p4 = svmls_x (pg, sv_f64 (dat->tenth), r2, p4); + svfloat64_t p5 + = svmla_x (pg, sv_f64 (dat->two_over_nine), r2, dat->two_over_fortyfive); + p5 = svmla_x (pg, sixth, r2, p5); + p5 = svmul_x (pg, r, p5); + + svfloat64_t p34 = svmla_x (pg, p3, d, p4); + svfloat64_t p12 = svmla_x (pg, p1, d, p2); + svfloat64_t y = svmla_x (pg, p34, d2, p5); + y = svmla_x (pg, p12, d2, y); + + y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y)); + + /* Solves the |x| = inf and NaN cases. */ + y = svsel (a_ge_max, sv_f64 (1.0), y); + + /* Copy sign. */ + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); + svuint64_t sign = svand_x (pg, ix, SignMask); + return svreinterpret_f64 (svorr_x (pg, sign, iy)); +} + +PL_SIG (SV, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (SV_NAME_D1 (erf), 1.79) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c b/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c deleted file mode 100644 index bec7f8a819d2..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Double-precision SVE erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -#define Scale (8.0) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) -{ - return sv_call_f64 (erf, x, y, cmp); -} - -/* Optimized double precision SVE error function erf. - Maximum observed error is 2.62 ULP: - __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0 - want 0x1.fffffffffffffp-1. */ -sv_f64_t -__sv_erf_x (sv_f64_t x, const svbool_t pg) -{ - /* Use top 16 bits to test for special cases and small values. 
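   For contrast with this deleted implementation, the replacement
   algorithm above can be sketched in scalar C (libm erf/exp stand in for
   the __sv_erf_data tables, M_PI from math.h is assumed, and only the
   first-order term of the expansion is shown):

     #include <math.h>
     static double erf_sketch (double x)
     {
       double a = fabs (x);
       double r = round (a * 128.0) / 128.0; // the shift trick, in effect
       double d = a - r;
       double scale = 2.0 / sqrt (M_PI) * exp (-r * r); // table entry
       // Degree-2..5 corrections in d (coefficients in r) are elided.
       return copysign (erf (r) + scale * d, x);
     }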
*/ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); - - /* Handle both inf/nan as well as small values (|x|<2^-28). */ - svbool_t cmp - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); - - /* Get sign and absolute value. */ - sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); - sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); - - /* i = trunc(Scale*x). */ - sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); - /* Saturate index of intervals. */ - svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); - sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); - - /* Load polynomial coefficients. */ - sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); - sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); - sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); - sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); - sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); - sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); - sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); - sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); - sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); - sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); - - /* Get shift and scale. */ - sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); - - /* Transform polynomial variable. - Set z = 0 in the boring domain to avoid overflow. */ - sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); - - /* Evaluate polynomial P(z) using level-2 Estrin. */ - sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); - sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); - sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); - sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); - sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); - - sv_f64_t z2 = svmul_f64_x (pg, z, z); - sv_f64_t z4 = svmul_f64_x (pg, z2, z2); - - sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); - sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); - - sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); - y = sv_fma_f64_x (pg, z4, y, q1); - - /* y = erf(x) if x > 0, -erf(-x) otherwise. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); - - if (unlikely (svptest_any (pg, cmp))) - return __sv_erf_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) - -PL_SIG (SV, D, 1, erf, -4.0, 4.0) -PL_TEST_ULP (__sv_erf, 2.13) -PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) -PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) -PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) -PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_data.c b/contrib/arm-optimized-routines/pl/math/sv_erf_data.c new file mode 100644 index 000000000000..7244aceda5a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erf_data.c @@ -0,0 +1,1558 @@ +/* + * Data for approximation of erf. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in vector erf. 
+ For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 6.0 (769 values): + - the first entry __erf_data.tab.erf contains the values of erf(r), + - the second entry __erf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. */ +const struct sv_erf_data __sv_erf_data = { + .erf = { 0x0.0000000000000p+0, + 0x1.20dbf3deb1340p-7, + 0x1.20d77083f17a0p-6, + 0x1.b137e0cf584dcp-6, + 0x1.20c5645dd2538p-5, + 0x1.68e5d3bbc9526p-5, + 0x1.b0fafef135745p-5, + 0x1.f902a77bd3821p-5, + 0x1.207d480e90658p-4, + 0x1.44703e87e8593p-4, + 0x1.68591a1e83b5dp-4, + 0x1.8c36beb8a8d23p-4, + 0x1.b0081148a873ap-4, + 0x1.d3cbf7e70a4b3p-4, + 0x1.f78159ec8bb50p-4, + 0x1.0d939005f65e5p-3, + 0x1.1f5e1a35c3b89p-3, + 0x1.311fc15f56d14p-3, + 0x1.42d7fc2f64959p-3, + 0x1.548642321d7c6p-3, + 0x1.662a0bdf7a89fp-3, + 0x1.77c2d2a765f9ep-3, + 0x1.895010fdbdbfdp-3, + 0x1.9ad142662e14dp-3, + 0x1.ac45e37fe2526p-3, + 0x1.bdad72110a648p-3, + 0x1.cf076d1233237p-3, + 0x1.e05354b96ff36p-3, + 0x1.f190aa85540e2p-3, + 0x1.015f78a3dcf3dp-2, + 0x1.09eed6982b948p-2, + 0x1.127631eb8de32p-2, + 0x1.1af54e232d609p-2, + 0x1.236bef825d9a2p-2, + 0x1.2bd9db0f7827fp-2, + 0x1.343ed6989b7d9p-2, + 0x1.3c9aa8b84bedap-2, + 0x1.44ed18d9f6462p-2, + 0x1.4d35ef3e5372ep-2, + 0x1.5574f4ffac98ep-2, + 0x1.5da9f415ff23fp-2, + 0x1.65d4b75b00471p-2, + 0x1.6df50a8dff772p-2, + 0x1.760aba57a76bfp-2, + 0x1.7e15944d9d3e4p-2, + 0x1.861566f5fd3c0p-2, + 0x1.8e0a01cab516bp-2, + 0x1.95f3353cbb146p-2, + 0x1.9dd0d2b721f39p-2, + 0x1.a5a2aca209394p-2, + 0x1.ad68966569a87p-2, + 0x1.b522646bbda68p-2, + 0x1.bccfec24855b8p-2, + 0x1.c4710406a65fcp-2, + 0x1.cc058392a6d2dp-2, + 0x1.d38d4354c3bd0p-2, + 0x1.db081ce6e2a48p-2, + 0x1.e275eaf25e458p-2, + 0x1.e9d68931ae650p-2, + 0x1.f129d471eabb1p-2, + 0x1.f86faa9428f9dp-2, + 0x1.ffa7ea8eb5fd0p-2, + 0x1.03693a371519cp-1, + 0x1.06f794ab2cae7p-1, + 0x1.0a7ef5c18edd2p-1, + 0x1.0dff4f247f6c6p-1, + 0x1.1178930ada115p-1, + 0x1.14eab43841b55p-1, + 0x1.1855a5fd3dd50p-1, + 0x1.1bb95c3746199p-1, + 0x1.1f15cb50bc4dep-1, + 0x1.226ae840d4d70p-1, + 0x1.25b8a88b6dd7fp-1, + 0x1.28ff0240d52cdp-1, + 0x1.2c3debfd7d6c1p-1, + 0x1.2f755ce9a21f4p-1, + 0x1.32a54cb8db67bp-1, + 0x1.35cdb3a9a144dp-1, + 0x1.38ee8a84beb71p-1, + 0x1.3c07ca9cb4f9ep-1, + 0x1.3f196dcd0f135p-1, + 0x1.42236e79a5fa6p-1, + 0x1.4525c78dd5966p-1, + 0x1.4820747ba2dc2p-1, + 0x1.4b13713ad3513p-1, + 0x1.4dfeba47f63ccp-1, + 0x1.50e24ca35fd2cp-1, + 0x1.53be25d016a4fp-1, + 0x1.569243d2b3a9bp-1, + 0x1.595ea53035283p-1, + 0x1.5c2348ecc4dc3p-1, + 0x1.5ee02e8a71a53p-1, + 0x1.61955607dd15dp-1, + 0x1.6442bfdedd397p-1, + 0x1.66e86d0312e82p-1, + 0x1.69865ee075011p-1, + 0x1.6c1c9759d0e5fp-1, + 0x1.6eab18c74091bp-1, + 0x1.7131e5f496a5ap-1, + 0x1.73b1021fc0cb8p-1, + 0x1.762870f720c6fp-1, + 0x1.78983697dc96fp-1, + 0x1.7b00578c26037p-1, + 0x1.7d60d8c979f7bp-1, + 0x1.7fb9bfaed8078p-1, + 0x1.820b1202f27fbp-1, + 0x1.8454d5f25760dp-1, + 0x1.8697120d92a4ap-1, + 0x1.88d1cd474a2e0p-1, + 0x1.8b050ef253c37p-1, + 0x1.8d30debfc572ep-1, + 0x1.8f5544bd00c04p-1, + 0x1.91724951b8fc6p-1, + 0x1.9387f53df5238p-1, + 0x1.959651980da31p-1, + 0x1.979d67caa6631p-1, + 0x1.999d4192a5715p-1, + 0x1.9b95e8fd26abap-1, + 0x1.9d8768656cc42p-1, + 0x1.9f71ca72cffb6p-1, + 0x1.a1551a16aaeafp-1, + 0x1.a331628a45b92p-1, + 0x1.a506af4cc00f4p-1, + 0x1.a6d50c20fa293p-1, + 0x1.a89c850b7d54dp-1, + 0x1.aa5d265064366p-1, + 0x1.ac16fc7143263p-1, + 0x1.adca142b10f98p-1, + 0x1.af767a741088bp-1, + 0x1.b11c3c79bb424p-1, + 
0x1.b2bb679ead19cp-1, + 0x1.b4540978921eep-1, + 0x1.b5e62fce16095p-1, + 0x1.b771e894d602ep-1, + 0x1.b8f741ef54f83p-1, + 0x1.ba764a2af2b78p-1, + 0x1.bbef0fbde6221p-1, + 0x1.bd61a1453ab44p-1, + 0x1.bece0d82d1a5cp-1, + 0x1.c034635b66e23p-1, + 0x1.c194b1d49a184p-1, + 0x1.c2ef0812fc1bdp-1, + 0x1.c443755820d64p-1, + 0x1.c5920900b5fd1p-1, + 0x1.c6dad2829ec62p-1, + 0x1.c81de16b14cefp-1, + 0x1.c95b455cce69dp-1, + 0x1.ca930e0e2a825p-1, + 0x1.cbc54b476248dp-1, + 0x1.ccf20ce0c0d27p-1, + 0x1.ce1962c0e0d8bp-1, + 0x1.cf3b5cdaf0c39p-1, + 0x1.d0580b2cfd249p-1, + 0x1.d16f7dbe41ca0p-1, + 0x1.d281c49d818d0p-1, + 0x1.d38eefdf64fddp-1, + 0x1.d4970f9ce00d9p-1, + 0x1.d59a33f19ed42p-1, + 0x1.d6986cfa798e7p-1, + 0x1.d791cad3eff01p-1, + 0x1.d8865d98abe01p-1, + 0x1.d97635600bb89p-1, + 0x1.da61623cb41e0p-1, + 0x1.db47f43b2980dp-1, + 0x1.dc29fb60715afp-1, + 0x1.dd0787a8bb39dp-1, + 0x1.dde0a90611a0dp-1, + 0x1.deb56f5f12d28p-1, + 0x1.df85ea8db188ep-1, + 0x1.e0522a5dfda73p-1, + 0x1.e11a3e8cf4eb8p-1, + 0x1.e1de36c75ba58p-1, + 0x1.e29e22a89d766p-1, + 0x1.e35a11b9b61cep-1, + 0x1.e4121370224ccp-1, + 0x1.e4c6372cd8927p-1, + 0x1.e5768c3b4a3fcp-1, + 0x1.e62321d06c5e0p-1, + 0x1.e6cc0709c8a0dp-1, + 0x1.e7714aec96534p-1, + 0x1.e812fc64db369p-1, + 0x1.e8b12a44944a8p-1, + 0x1.e94be342e6743p-1, + 0x1.e9e335fb56f87p-1, + 0x1.ea7730ed0bbb9p-1, + 0x1.eb07e27a133aap-1, + 0x1.eb9558e6b42cep-1, + 0x1.ec1fa258c4beap-1, + 0x1.eca6ccd709544p-1, + 0x1.ed2ae6489ac1ep-1, + 0x1.edabfc7453e63p-1, + 0x1.ee2a1d004692cp-1, + 0x1.eea5557137ae0p-1, + 0x1.ef1db32a2277cp-1, + 0x1.ef93436bc2daap-1, + 0x1.f006135426b26p-1, + 0x1.f0762fde45ee6p-1, + 0x1.f0e3a5e1a1788p-1, + 0x1.f14e8211e8c55p-1, + 0x1.f1b6d0fea5f4dp-1, + 0x1.f21c9f12f0677p-1, + 0x1.f27ff89525acfp-1, + 0x1.f2e0e9a6a8b09p-1, + 0x1.f33f7e43a706bp-1, + 0x1.f39bc242e43e6p-1, + 0x1.f3f5c1558b19ep-1, + 0x1.f44d870704911p-1, + 0x1.f4a31ebcd47dfp-1, + 0x1.f4f693b67bd77p-1, + 0x1.f547f10d60597p-1, + 0x1.f59741b4b97cfp-1, + 0x1.f5e4907982a07p-1, + 0x1.f62fe80272419p-1, + 0x1.f67952cff6282p-1, + 0x1.f6c0db3c34641p-1, + 0x1.f7068b7b10fd9p-1, + 0x1.f74a6d9a38383p-1, + 0x1.f78c8b812d498p-1, + 0x1.f7cceef15d631p-1, + 0x1.f80ba18636f07p-1, + 0x1.f848acb544e95p-1, + 0x1.f88419ce4e184p-1, + 0x1.f8bdf1fb78370p-1, + 0x1.f8f63e416ebffp-1, + 0x1.f92d077f8d56dp-1, + 0x1.f96256700da8ep-1, + 0x1.f99633a838a57p-1, + 0x1.f9c8a7989af0dp-1, + 0x1.f9f9ba8d3c733p-1, + 0x1.fa2974addae45p-1, + 0x1.fa57ddfe27376p-1, + 0x1.fa84fe5e05c8dp-1, + 0x1.fab0dd89d1309p-1, + 0x1.fadb831a9f9c3p-1, + 0x1.fb04f6868a944p-1, + 0x1.fb2d3f20f9101p-1, + 0x1.fb54641aebbc9p-1, + 0x1.fb7a6c834b5a2p-1, + 0x1.fb9f5f4739170p-1, + 0x1.fbc3433260ca5p-1, + 0x1.fbe61eef4cf6ap-1, + 0x1.fc07f907bc794p-1, + 0x1.fc28d7e4f9cd0p-1, + 0x1.fc48c1d033c7ap-1, + 0x1.fc67bcf2d7b8fp-1, + 0x1.fc85cf56ecd38p-1, + 0x1.fca2fee770c79p-1, + 0x1.fcbf5170b578bp-1, + 0x1.fcdacca0bfb73p-1, + 0x1.fcf57607a6e7cp-1, + 0x1.fd0f5317f582fp-1, + 0x1.fd2869270a56fp-1, + 0x1.fd40bd6d7a785p-1, + 0x1.fd58550773cb5p-1, + 0x1.fd6f34f52013ap-1, + 0x1.fd85621b0876dp-1, + 0x1.fd9ae142795e3p-1, + 0x1.fdafb719e6a69p-1, + 0x1.fdc3e835500b3p-1, + 0x1.fdd7790ea5bc0p-1, + 0x1.fdea6e062d0c9p-1, + 0x1.fdfccb62e52d3p-1, + 0x1.fe0e9552ebdd6p-1, + 0x1.fe1fcfebe2083p-1, + 0x1.fe307f2b503d0p-1, + 0x1.fe40a6f70af4bp-1, + 0x1.fe504b1d9696cp-1, + 0x1.fe5f6f568b301p-1, + 0x1.fe6e1742f7cf6p-1, + 0x1.fe7c466dc57a1p-1, + 0x1.fe8a004c19ae6p-1, + 0x1.fe97483db8670p-1, + 0x1.fea4218d6594ap-1, + 0x1.feb08f7146046p-1, + 0x1.febc950b3fa75p-1, + 0x1.fec835695932ep-1, + 0x1.fed37386190fbp-1, + 0x1.fede5248e38f4p-1, + 
0x1.fee8d486585eep-1, + 0x1.fef2fd00af31ap-1, + 0x1.fefcce6813974p-1, + 0x1.ff064b5afffbep-1, + 0x1.ff0f766697c76p-1, + 0x1.ff18520700971p-1, + 0x1.ff20e0a7ba8c2p-1, + 0x1.ff2924a3f7a83p-1, + 0x1.ff312046f2339p-1, + 0x1.ff38d5cc4227fp-1, + 0x1.ff404760319b4p-1, + 0x1.ff47772010262p-1, + 0x1.ff4e671a85425p-1, + 0x1.ff55194fe19dfp-1, + 0x1.ff5b8fb26f5f6p-1, + 0x1.ff61cc26c1578p-1, + 0x1.ff67d08401202p-1, + 0x1.ff6d9e943c231p-1, + 0x1.ff733814af88cp-1, + 0x1.ff789eb6130c9p-1, + 0x1.ff7dd41ce2b4dp-1, + 0x1.ff82d9e1a76d8p-1, + 0x1.ff87b1913e853p-1, + 0x1.ff8c5cad200a5p-1, + 0x1.ff90dcaba4096p-1, + 0x1.ff9532f846ab0p-1, + 0x1.ff9960f3eb327p-1, + 0x1.ff9d67f51ddbap-1, + 0x1.ffa14948549a7p-1, + 0x1.ffa506302ebaep-1, + 0x1.ffa89fe5b3625p-1, + 0x1.ffac17988ef4bp-1, + 0x1.ffaf6e6f4f5c0p-1, + 0x1.ffb2a5879f35ep-1, + 0x1.ffb5bdf67fe6fp-1, + 0x1.ffb8b8c88295fp-1, + 0x1.ffbb970200110p-1, + 0x1.ffbe599f4f9d9p-1, + 0x1.ffc10194fcb64p-1, + 0x1.ffc38fcffbb7cp-1, + 0x1.ffc60535dd7f5p-1, + 0x1.ffc862a501fd7p-1, + 0x1.ffcaa8f4c9beap-1, + 0x1.ffccd8f5c66d1p-1, + 0x1.ffcef371ea4d7p-1, + 0x1.ffd0f92cb6ba7p-1, + 0x1.ffd2eae369a07p-1, + 0x1.ffd4c94d29fdbp-1, + 0x1.ffd6951b33686p-1, + 0x1.ffd84ef9009eep-1, + 0x1.ffd9f78c7524ap-1, + 0x1.ffdb8f7605ee7p-1, + 0x1.ffdd1750e1220p-1, + 0x1.ffde8fb314ebfp-1, + 0x1.ffdff92db56e5p-1, + 0x1.ffe1544d01ccbp-1, + 0x1.ffe2a1988857cp-1, + 0x1.ffe3e19349dc7p-1, + 0x1.ffe514bbdc197p-1, + 0x1.ffe63b8c8b5f7p-1, + 0x1.ffe7567b7b5e1p-1, + 0x1.ffe865fac722bp-1, + 0x1.ffe96a78a04a9p-1, + 0x1.ffea645f6d6dap-1, + 0x1.ffeb5415e7c44p-1, + 0x1.ffec39ff380b9p-1, + 0x1.ffed167b12ac2p-1, + 0x1.ffede9e5d3262p-1, + 0x1.ffeeb49896c6dp-1, + 0x1.ffef76e956a9fp-1, + 0x1.fff0312b010b5p-1, + 0x1.fff0e3ad91ec2p-1, + 0x1.fff18ebe2b0e1p-1, + 0x1.fff232a72b48ep-1, + 0x1.fff2cfb0453d9p-1, + 0x1.fff3661e9569dp-1, + 0x1.fff3f634b79f9p-1, + 0x1.fff48032dbe40p-1, + 0x1.fff50456dab8cp-1, + 0x1.fff582dc48d30p-1, + 0x1.fff5fbfc8a439p-1, + 0x1.fff66feee5129p-1, + 0x1.fff6dee89352ep-1, + 0x1.fff7491cd4af6p-1, + 0x1.fff7aebcff755p-1, + 0x1.fff80ff8911fdp-1, + 0x1.fff86cfd3e657p-1, + 0x1.fff8c5f702ccfp-1, + 0x1.fff91b102fca8p-1, + 0x1.fff96c717b695p-1, + 0x1.fff9ba420e834p-1, + 0x1.fffa04a7928b1p-1, + 0x1.fffa4bc63ee9ap-1, + 0x1.fffa8fc0e5f33p-1, + 0x1.fffad0b901755p-1, + 0x1.fffb0ecebee1bp-1, + 0x1.fffb4a210b172p-1, + 0x1.fffb82cd9dcbfp-1, + 0x1.fffbb8f1049c6p-1, + 0x1.fffbeca6adbe9p-1, + 0x1.fffc1e08f25f5p-1, + 0x1.fffc4d3120aa1p-1, + 0x1.fffc7a37857d2p-1, + 0x1.fffca53375ce3p-1, + 0x1.fffcce3b57bffp-1, + 0x1.fffcf564ab6b7p-1, + 0x1.fffd1ac4135f9p-1, + 0x1.fffd3e6d5cd87p-1, + 0x1.fffd607387b07p-1, + 0x1.fffd80e8ce0dap-1, + 0x1.fffd9fdeabccep-1, + 0x1.fffdbd65e5ad0p-1, + 0x1.fffdd98e903b2p-1, + 0x1.fffdf46816833p-1, + 0x1.fffe0e0140857p-1, + 0x1.fffe26683972ap-1, + 0x1.fffe3daa95b18p-1, + 0x1.fffe53d558ae9p-1, + 0x1.fffe68f4fa777p-1, + 0x1.fffe7d156d244p-1, + 0x1.fffe904222101p-1, + 0x1.fffea2860ee1ep-1, + 0x1.fffeb3ebb267bp-1, + 0x1.fffec47d19457p-1, + 0x1.fffed443e2787p-1, + 0x1.fffee34943b15p-1, + 0x1.fffef1960d85dp-1, + 0x1.fffeff32af7afp-1, + 0x1.ffff0c273bea2p-1, + 0x1.ffff187b6bc0ep-1, + 0x1.ffff2436a21dcp-1, + 0x1.ffff2f5fefcaap-1, + 0x1.ffff39fe16963p-1, + 0x1.ffff44178c8d2p-1, + 0x1.ffff4db27f146p-1, + 0x1.ffff56d4d5e5ep-1, + 0x1.ffff5f8435efcp-1, + 0x1.ffff67c604180p-1, + 0x1.ffff6f9f67e55p-1, + 0x1.ffff77154e0d6p-1, + 0x1.ffff7e2c6aea2p-1, + 0x1.ffff84e93cd75p-1, + 0x1.ffff8b500e77cp-1, + 0x1.ffff9164f8e46p-1, + 0x1.ffff972be5c59p-1, + 0x1.ffff9ca891572p-1, + 0x1.ffffa1de8c582p-1, + 0x1.ffffa6d13de73p-1, + 
0x1.ffffab83e54b8p-1, + 0x1.ffffaff99bac4p-1, + 0x1.ffffb43555b5fp-1, + 0x1.ffffb839e52f3p-1, + 0x1.ffffbc09fa7cdp-1, + 0x1.ffffbfa82616bp-1, + 0x1.ffffc316d9ed0p-1, + 0x1.ffffc6586abf6p-1, + 0x1.ffffc96f1165ep-1, + 0x1.ffffcc5cec0c1p-1, + 0x1.ffffcf23ff5fcp-1, + 0x1.ffffd1c637b2bp-1, + 0x1.ffffd4456a10dp-1, + 0x1.ffffd6a3554a1p-1, + 0x1.ffffd8e1a2f22p-1, + 0x1.ffffdb01e8546p-1, + 0x1.ffffdd05a75eap-1, + 0x1.ffffdeee4f810p-1, + 0x1.ffffe0bd3e852p-1, + 0x1.ffffe273c15b7p-1, + 0x1.ffffe41314e06p-1, + 0x1.ffffe59c6698bp-1, + 0x1.ffffe710d565ep-1, + 0x1.ffffe8717232dp-1, + 0x1.ffffe9bf4098cp-1, + 0x1.ffffeafb377d5p-1, + 0x1.ffffec2641a9ep-1, + 0x1.ffffed413e5b7p-1, + 0x1.ffffee4d01cd6p-1, + 0x1.ffffef4a55bd4p-1, + 0x1.fffff039f9e8fp-1, + 0x1.fffff11ca4876p-1, + 0x1.fffff1f302bc1p-1, + 0x1.fffff2bdb904dp-1, + 0x1.fffff37d63a36p-1, + 0x1.fffff43297019p-1, + 0x1.fffff4dde0118p-1, + 0x1.fffff57fc4a95p-1, + 0x1.fffff618c3da6p-1, + 0x1.fffff6a956450p-1, + 0x1.fffff731ee681p-1, + 0x1.fffff7b2f8ed6p-1, + 0x1.fffff82cdcf1bp-1, + 0x1.fffff89ffc4aap-1, + 0x1.fffff90cb3c81p-1, + 0x1.fffff9735b73bp-1, + 0x1.fffff9d446cccp-1, + 0x1.fffffa2fc5015p-1, + 0x1.fffffa8621251p-1, + 0x1.fffffad7a2652p-1, + 0x1.fffffb248c39dp-1, + 0x1.fffffb6d1e95dp-1, + 0x1.fffffbb196132p-1, + 0x1.fffffbf22c1e2p-1, + 0x1.fffffc2f171e3p-1, + 0x1.fffffc688a9cfp-1, + 0x1.fffffc9eb76acp-1, + 0x1.fffffcd1cbc28p-1, + 0x1.fffffd01f36afp-1, + 0x1.fffffd2f57d68p-1, + 0x1.fffffd5a2041fp-1, + 0x1.fffffd8271d12p-1, + 0x1.fffffda86faa9p-1, + 0x1.fffffdcc3b117p-1, + 0x1.fffffdedf37edp-1, + 0x1.fffffe0db6b91p-1, + 0x1.fffffe2ba0ea5p-1, + 0x1.fffffe47ccb60p-1, + 0x1.fffffe62534d4p-1, + 0x1.fffffe7b4c81ep-1, + 0x1.fffffe92ced93p-1, + 0x1.fffffea8ef9cfp-1, + 0x1.fffffebdc2ec6p-1, + 0x1.fffffed15bcbap-1, + 0x1.fffffee3cc32cp-1, + 0x1.fffffef5251c2p-1, + 0x1.ffffff0576917p-1, + 0x1.ffffff14cfb92p-1, + 0x1.ffffff233ee1dp-1, + 0x1.ffffff30d18e8p-1, + 0x1.ffffff3d9480fp-1, + 0x1.ffffff4993c46p-1, + 0x1.ffffff54dab72p-1, + 0x1.ffffff5f74141p-1, + 0x1.ffffff6969fb8p-1, + 0x1.ffffff72c5fb6p-1, + 0x1.ffffff7b91176p-1, + 0x1.ffffff83d3d07p-1, + 0x1.ffffff8b962bep-1, + 0x1.ffffff92dfba2p-1, + 0x1.ffffff99b79d2p-1, + 0x1.ffffffa0248e8p-1, + 0x1.ffffffa62ce54p-1, + 0x1.ffffffabd69b4p-1, + 0x1.ffffffb127525p-1, + 0x1.ffffffb624592p-1, + 0x1.ffffffbad2affp-1, + 0x1.ffffffbf370cdp-1, + 0x1.ffffffc355dfdp-1, + 0x1.ffffffc733572p-1, + 0x1.ffffffcad3626p-1, + 0x1.ffffffce39b67p-1, + 0x1.ffffffd169d0cp-1, + 0x1.ffffffd466fa5p-1, + 0x1.ffffffd7344aap-1, + 0x1.ffffffd9d4aabp-1, + 0x1.ffffffdc4ad7ap-1, + 0x1.ffffffde9964ep-1, + 0x1.ffffffe0c2bf0p-1, + 0x1.ffffffe2c92dbp-1, + 0x1.ffffffe4aed5ep-1, + 0x1.ffffffe675bbdp-1, + 0x1.ffffffe81fc4ep-1, + 0x1.ffffffe9aeb97p-1, + 0x1.ffffffeb24467p-1, + 0x1.ffffffec81ff2p-1, + 0x1.ffffffedc95e7p-1, + 0x1.ffffffeefbc85p-1, + 0x1.fffffff01a8b6p-1, + 0x1.fffffff126e1ep-1, + 0x1.fffffff221f30p-1, + 0x1.fffffff30cd3fp-1, + 0x1.fffffff3e8892p-1, + 0x1.fffffff4b606fp-1, + 0x1.fffffff57632dp-1, + 0x1.fffffff629e44p-1, + 0x1.fffffff6d1e56p-1, + 0x1.fffffff76ef3fp-1, + 0x1.fffffff801c1fp-1, + 0x1.fffffff88af67p-1, + 0x1.fffffff90b2e3p-1, + 0x1.fffffff982fc1p-1, + 0x1.fffffff9f2e9fp-1, + 0x1.fffffffa5b790p-1, + 0x1.fffffffabd229p-1, + 0x1.fffffffb18582p-1, + 0x1.fffffffb6d844p-1, + 0x1.fffffffbbd0aap-1, + 0x1.fffffffc0748fp-1, + 0x1.fffffffc4c96cp-1, + 0x1.fffffffc8d462p-1, + 0x1.fffffffcc9a41p-1, + 0x1.fffffffd01f89p-1, + 0x1.fffffffd36871p-1, + 0x1.fffffffd678edp-1, + 0x1.fffffffd954aep-1, + 0x1.fffffffdbff2ap-1, + 0x1.fffffffde7ba0p-1, + 
0x1.fffffffe0cd16p-1, + 0x1.fffffffe2f664p-1, + 0x1.fffffffe4fa30p-1, + 0x1.fffffffe6daf7p-1, + 0x1.fffffffe89b0cp-1, + 0x1.fffffffea3c9ap-1, + 0x1.fffffffebc1a9p-1, + 0x1.fffffffed2c21p-1, + 0x1.fffffffee7dc8p-1, + 0x1.fffffffefb847p-1, + 0x1.ffffffff0dd2bp-1, + 0x1.ffffffff1ede9p-1, + 0x1.ffffffff2ebdap-1, + 0x1.ffffffff3d843p-1, + 0x1.ffffffff4b453p-1, + 0x1.ffffffff58126p-1, + 0x1.ffffffff63fc3p-1, + 0x1.ffffffff6f121p-1, + 0x1.ffffffff79626p-1, + 0x1.ffffffff82fabp-1, + 0x1.ffffffff8be77p-1, + 0x1.ffffffff94346p-1, + 0x1.ffffffff9bec8p-1, + 0x1.ffffffffa319fp-1, + 0x1.ffffffffa9c63p-1, + 0x1.ffffffffaffa4p-1, + 0x1.ffffffffb5be5p-1, + 0x1.ffffffffbb1a2p-1, + 0x1.ffffffffc014ep-1, + 0x1.ffffffffc4b56p-1, + 0x1.ffffffffc901cp-1, + 0x1.ffffffffccfffp-1, + 0x1.ffffffffd0b56p-1, + 0x1.ffffffffd4271p-1, + 0x1.ffffffffd759dp-1, + 0x1.ffffffffda520p-1, + 0x1.ffffffffdd13cp-1, + 0x1.ffffffffdfa2dp-1, + 0x1.ffffffffe202dp-1, + 0x1.ffffffffe4371p-1, + 0x1.ffffffffe642ap-1, + 0x1.ffffffffe8286p-1, + 0x1.ffffffffe9eb0p-1, + 0x1.ffffffffeb8d0p-1, + 0x1.ffffffffed10ap-1, + 0x1.ffffffffee782p-1, + 0x1.ffffffffefc57p-1, + 0x1.fffffffff0fa7p-1, + 0x1.fffffffff218fp-1, + 0x1.fffffffff3227p-1, + 0x1.fffffffff4188p-1, + 0x1.fffffffff4fc9p-1, + 0x1.fffffffff5cfdp-1, + 0x1.fffffffff6939p-1, + 0x1.fffffffff748ep-1, + 0x1.fffffffff7f0dp-1, + 0x1.fffffffff88c5p-1, + 0x1.fffffffff91c6p-1, + 0x1.fffffffff9a1bp-1, + 0x1.fffffffffa1d2p-1, + 0x1.fffffffffa8f6p-1, + 0x1.fffffffffaf92p-1, + 0x1.fffffffffb5b0p-1, + 0x1.fffffffffbb58p-1, + 0x1.fffffffffc095p-1, + 0x1.fffffffffc56dp-1, + 0x1.fffffffffc9e8p-1, + 0x1.fffffffffce0dp-1, + 0x1.fffffffffd1e1p-1, + 0x1.fffffffffd56cp-1, + 0x1.fffffffffd8b3p-1, + 0x1.fffffffffdbbap-1, + 0x1.fffffffffde86p-1, + 0x1.fffffffffe11dp-1, + 0x1.fffffffffe380p-1, + 0x1.fffffffffe5b6p-1, + 0x1.fffffffffe7c0p-1, + 0x1.fffffffffe9a2p-1, + 0x1.fffffffffeb60p-1, + 0x1.fffffffffecfbp-1, + 0x1.fffffffffee77p-1, + 0x1.fffffffffefd6p-1, + 0x1.ffffffffff11ap-1, + 0x1.ffffffffff245p-1, + 0x1.ffffffffff359p-1, + 0x1.ffffffffff457p-1, + 0x1.ffffffffff542p-1, + 0x1.ffffffffff61bp-1, + 0x1.ffffffffff6e3p-1, + 0x1.ffffffffff79bp-1, + 0x1.ffffffffff845p-1, + 0x1.ffffffffff8e2p-1, + 0x1.ffffffffff973p-1, + 0x1.ffffffffff9f8p-1, + 0x1.ffffffffffa73p-1, + 0x1.ffffffffffae4p-1, + 0x1.ffffffffffb4cp-1, + 0x1.ffffffffffbadp-1, + 0x1.ffffffffffc05p-1, + 0x1.ffffffffffc57p-1, + 0x1.ffffffffffca2p-1, + 0x1.ffffffffffce7p-1, + 0x1.ffffffffffd27p-1, + 0x1.ffffffffffd62p-1, + 0x1.ffffffffffd98p-1, + 0x1.ffffffffffdcap-1, + 0x1.ffffffffffdf8p-1, + 0x1.ffffffffffe22p-1, + 0x1.ffffffffffe49p-1, + 0x1.ffffffffffe6cp-1, + 0x1.ffffffffffe8dp-1, + 0x1.ffffffffffeabp-1, + 0x1.ffffffffffec7p-1, + 0x1.ffffffffffee1p-1, + 0x1.ffffffffffef8p-1, + 0x1.fffffffffff0ep-1, + 0x1.fffffffffff22p-1, + 0x1.fffffffffff34p-1, + 0x1.fffffffffff45p-1, + 0x1.fffffffffff54p-1, + 0x1.fffffffffff62p-1, + 0x1.fffffffffff6fp-1, + 0x1.fffffffffff7bp-1, + 0x1.fffffffffff86p-1, + 0x1.fffffffffff90p-1, + 0x1.fffffffffff9ap-1, + 0x1.fffffffffffa2p-1, + 0x1.fffffffffffaap-1, + 0x1.fffffffffffb1p-1, + 0x1.fffffffffffb8p-1, + 0x1.fffffffffffbep-1, + 0x1.fffffffffffc3p-1, + 0x1.fffffffffffc8p-1, + 0x1.fffffffffffcdp-1, + 0x1.fffffffffffd1p-1, + 0x1.fffffffffffd5p-1, + 0x1.fffffffffffd9p-1, + 0x1.fffffffffffdcp-1, + 0x1.fffffffffffdfp-1, + 0x1.fffffffffffe2p-1, + 0x1.fffffffffffe4p-1, + 0x1.fffffffffffe7p-1, + 0x1.fffffffffffe9p-1, + 0x1.fffffffffffebp-1, + 0x1.fffffffffffedp-1, + 0x1.fffffffffffeep-1, + 0x1.ffffffffffff0p-1, + 0x1.ffffffffffff1p-1, + 
0x1.ffffffffffff3p-1, + 0x1.ffffffffffff4p-1, + 0x1.ffffffffffff5p-1, + 0x1.ffffffffffff6p-1, + 0x1.ffffffffffff7p-1, + 0x1.ffffffffffff7p-1, + 0x1.ffffffffffff8p-1, + 0x1.ffffffffffff9p-1, + 0x1.ffffffffffff9p-1, + 0x1.ffffffffffffap-1, + 0x1.ffffffffffffbp-1, + 0x1.ffffffffffffbp-1, + 0x1.ffffffffffffbp-1, + 0x1.ffffffffffffcp-1, + 0x1.ffffffffffffcp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffdp-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.ffffffffffffep-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.fffffffffffffp-1, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + 0x1.0000000000000p+0, + }, + .scale = { 0x1.20dd750429b6dp+0, + 0x1.20d8f1975c85dp+0, + 0x1.20cb67bd452c7p+0, + 0x1.20b4d8bac36c1p+0, + 0x1.209546ad13ccfp+0, + 0x1.206cb4897b148p+0, + 0x1.203b261cd0052p+0, + 0x1.2000a00ae3804p+0, + 0x1.1fbd27cdc72d3p+0, + 0x1.1f70c3b4f2cc7p+0, + 0x1.1f1b7ae44867fp+0, + 0x1.1ebd5552f795bp+0, + 0x1.1e565bca400d4p+0, + 0x1.1de697e413d28p+0, + 0x1.1d6e14099944ap+0, + 0x1.1cecdb718d61cp+0, + 0x1.1c62fa1e869b6p+0, + 0x1.1bd07cdd189acp+0, + 0x1.1b357141d95d5p+0, + 0x1.1a91e5a748165p+0, + 0x1.19e5e92b964abp+0, + 0x1.19318bae53a04p+0, + 0x1.1874ddcdfce24p+0, + 0x1.17aff0e56ec10p+0, + 0x1.16e2d7093cd8cp+0, + 0x1.160da304ed92fp+0, + 0x1.153068581b781p+0, + 0x1.144b3b337c90cp+0, + 0x1.135e3075d076bp+0, + 0x1.12695da8b5bdep+0, + 0x1.116cd8fd67618p+0, + 0x1.1068b94962e5ep+0, + 0x1.0f5d1602f7e41p+0, + 0x1.0e4a073dc1b91p+0, + 0x1.0d2fa5a70c168p+0, + 0x1.0c0e0a8223359p+0, + 0x1.0ae54fa490722p+0, + 0x1.09b58f724416bp+0, + 0x1.087ee4d9ad247p+0, + 0x1.07416b4fbfe7cp+0, + 0x1.05fd3ecbec297p+0, + 0x1.04b27bc403d30p+0, + 0x1.03613f2812dafp+0, + 0x1.0209a65e29545p+0, + 0x1.00abcf3e187a9p+0, + 0x1.fe8fb01a47307p-1, + 0x1.fbbbbef34b4b2p-1, + 0x1.f8dc092d58ff8p-1, + 0x1.f5f0cdaf15313p-1, + 0x1.f2fa4c16c0019p-1, + 0x1.eff8c4b1375dbp-1, + 0x1.ecec7870ebca7p-1, + 0x1.e9d5a8e4c934ep-1, + 0x1.e6b4982f158b9p-1, + 0x1.e38988fc46e72p-1, + 0x1.e054be79d3042p-1, + 0x1.dd167c4cf9d2ap-1, + 0x1.d9cf06898cdafp-1, + 0x1.d67ea1a8b5368p-1, + 0x1.d325927fb9d89p-1, + 0x1.cfc41e36c7df9p-1, + 0x1.cc5a8a3fbea40p-1, + 0x1.c8e91c4d01368p-1, + 0x1.c5701a484ef9dp-1, + 0x1.c1efca49a5011p-1, + 0x1.be68728e29d5dp-1, + 0x1.bada596f25436p-1, + 0x1.b745c55905bf8p-1, + 0x1.b3aafcc27502ep-1, + 0x1.b00a46237d5bep-1, + 0x1.ac63e7ecc1411p-1, + 0x1.a8b8287ec6a09p-1, + 0x1.a5074e2157620p-1, + 0x1.a1519efaf889ep-1, + 0x1.9d97610879642p-1, + 0x1.99d8da149c13fp-1, + 0x1.96164fafd8de3p-1, + 0x1.925007283d7aap-1, + 0x1.8e86458169af8p-1, + 0x1.8ab94f6caa71dp-1, + 0x1.86e9694134b9ep-1, + 0x1.8316d6f48133dp-1, + 0x1.7f41dc12c9e89p-1, + 0x1.7b6abbb7aaf19p-1, + 0x1.7791b886e7403p-1, + 0x1.73b714a552763p-1, + 0x1.6fdb11b1e0c34p-1, + 0x1.6bfdf0beddaf5p-1, + 0x1.681ff24b4ab04p-1, + 0x1.6441563c665d4p-1, + 0x1.60625bd75d07bp-1, + 0x1.5c8341bb23767p-1, + 0x1.58a445da7c74cp-1, + 0x1.54c5a57629db0p-1, + 0x1.50e79d1749ac9p-1, + 0x1.4d0a6889dfd9fp-1, + 0x1.492e42d78d2c5p-1, + 0x1.4553664273d24p-1, + 0x1.417a0c4049fd0p-1, + 0x1.3da26d759aef5p-1, + 
0x1.39ccc1b136d5ap-1, + 0x1.35f93fe7d1b3dp-1, + 0x1.32281e2fd1a92p-1, + 0x1.2e5991bd4cbfcp-1, + 0x1.2a8dcede3673bp-1, + 0x1.26c508f6bd0ffp-1, + 0x1.22ff727dd6f7bp-1, + 0x1.1f3d3cf9ffe5ap-1, + 0x1.1b7e98fe26217p-1, + 0x1.17c3b626c7a11p-1, + 0x1.140cc3173f007p-1, + 0x1.1059ed7740313p-1, + 0x1.0cab61f084b93p-1, + 0x1.09014c2ca74dap-1, + 0x1.055bd6d32e8d7p-1, + 0x1.01bb2b87c6968p-1, + 0x1.fc3ee5d1524b0p-2, + 0x1.f511a91a67d2ap-2, + 0x1.edeeee0959518p-2, + 0x1.e6d6ffaa65a25p-2, + 0x1.dfca26f5bbf88p-2, + 0x1.d8c8aace11e63p-2, + 0x1.d1d2cfff91594p-2, + 0x1.cae8d93f1d7b6p-2, + 0x1.c40b0729ed547p-2, + 0x1.bd3998457afdap-2, + 0x1.b674c8ffc6283p-2, + 0x1.afbcd3afe8ab6p-2, + 0x1.a911f096fbc26p-2, + 0x1.a27455e14c93cp-2, + 0x1.9be437a7de946p-2, + 0x1.9561c7f23a47bp-2, + 0x1.8eed36b886d93p-2, + 0x1.8886b1e5ecfd1p-2, + 0x1.822e655b417e6p-2, + 0x1.7be47af1f5d89p-2, + 0x1.75a91a7f4d2edp-2, + 0x1.6f7c69d7d3ef8p-2, + 0x1.695e8cd31867ep-2, + 0x1.634fa54fa285fp-2, + 0x1.5d4fd33729015p-2, + 0x1.575f3483021c3p-2, + 0x1.517de540ce2a3p-2, + 0x1.4babff975a04cp-2, + 0x1.45e99bcbb7915p-2, + 0x1.4036d0468a7a2p-2, + 0x1.3a93b1998736cp-2, + 0x1.35005285227f1p-2, + 0x1.2f7cc3fe6f423p-2, + 0x1.2a09153529381p-2, + 0x1.24a55399ea239p-2, + 0x1.1f518ae487dc8p-2, + 0x1.1a0dc51a9934dp-2, + 0x1.14da0a961fd14p-2, + 0x1.0fb6620c550afp-2, + 0x1.0aa2d09497f2bp-2, + 0x1.059f59af7a906p-2, + 0x1.00abff4dec7a3p-2, + 0x1.f79183b101c5bp-3, + 0x1.edeb406d9c824p-3, + 0x1.e4652fadcb6b2p-3, + 0x1.daff4969c0b04p-3, + 0x1.d1b982c501370p-3, + 0x1.c893ce1dcbef7p-3, + 0x1.bf8e1b1ca2279p-3, + 0x1.b6a856c3ed54fp-3, + 0x1.ade26b7fbed95p-3, + 0x1.a53c4135a6526p-3, + 0x1.9cb5bd549b111p-3, + 0x1.944ec2e4f5630p-3, + 0x1.8c07329874652p-3, + 0x1.83deeada4d25ap-3, + 0x1.7bd5c7df3fe9cp-3, + 0x1.73eba3b5b07b7p-3, + 0x1.6c205655be71fp-3, + 0x1.6473b5b15a7a1p-3, + 0x1.5ce595c455b0ap-3, + 0x1.5575c8a468361p-3, + 0x1.4e241e912c305p-3, + 0x1.46f066040a832p-3, + 0x1.3fda6bc016994p-3, + 0x1.38e1fae1d6a9dp-3, + 0x1.3206dceef5f87p-3, + 0x1.2b48d9e5dea1cp-3, + 0x1.24a7b84d38971p-3, + 0x1.1e233d434b813p-3, + 0x1.17bb2c8d41535p-3, + 0x1.116f48a6476ccp-3, + 0x1.0b3f52ce8c383p-3, + 0x1.052b0b1a174eap-3, + 0x1.fe6460fef4680p-4, + 0x1.f2a901ccafb37p-4, + 0x1.e723726b824a9p-4, + 0x1.dbd32ac4c99b0p-4, + 0x1.d0b7a0f921e7cp-4, + 0x1.c5d0497c09e74p-4, + 0x1.bb1c972f23e50p-4, + 0x1.b09bfb7d11a83p-4, + 0x1.a64de673e8837p-4, + 0x1.9c31c6df3b1b8p-4, + 0x1.92470a61b6965p-4, + 0x1.888d1d8e510a3p-4, + 0x1.7f036c0107294p-4, + 0x1.75a96077274bap-4, + 0x1.6c7e64e7281cbp-4, + 0x1.6381e2980956bp-4, + 0x1.5ab342383d177p-4, + 0x1.5211ebf41880bp-4, + 0x1.499d478bca735p-4, + 0x1.4154bc68d75c3p-4, + 0x1.3937b1b319259p-4, + 0x1.31458e6542847p-4, + 0x1.297db960e4f63p-4, + 0x1.21df9981f8e53p-4, + 0x1.1a6a95b1e786fp-4, + 0x1.131e14fa1625dp-4, + 0x1.0bf97e95f2a64p-4, + 0x1.04fc3a0481321p-4, + 0x1.fc4b5e32d6259p-5, + 0x1.eeea8c1b1db93p-5, + 0x1.e1d4cf1e2450ap-5, + 0x1.d508f9a1ea64ep-5, + 0x1.c885df3451a07p-5, + 0x1.bc4a54a84e834p-5, + 0x1.b055303221015p-5, + 0x1.a4a549829587ep-5, + 0x1.993979e14fffdp-5, + 0x1.8e109c4622913p-5, + 0x1.83298d717210ep-5, + 0x1.78832c03aa2b1p-5, + 0x1.6e1c5893c380bp-5, + 0x1.63f3f5c4de13bp-5, + 0x1.5a08e85af27e0p-5, + 0x1.505a174e9c929p-5, + 0x1.46e66be002240p-5, + 0x1.3dacd1a8d8ccdp-5, + 0x1.34ac36ad8dafep-5, + 0x1.2be38b6d92415p-5, + 0x1.2351c2f2d1449p-5, + 0x1.1af5d2e04f3f6p-5, + 0x1.12ceb37ff9bc3p-5, + 0x1.0adb5fcfa8c75p-5, + 0x1.031ad58d56279p-5, + 0x1.f7182a851bca2p-6, + 0x1.e85c449e377f2p-6, + 0x1.da0005e5f28dfp-6, + 0x1.cc0180af00a8bp-6, + 0x1.be5ecd2fcb5f9p-6, + 
0x1.b1160991ff737p-6, + 0x1.a4255a00b9f03p-6, + 0x1.978ae8b55ce1bp-6, + 0x1.8b44e6031383ep-6, + 0x1.7f5188610ddc8p-6, + 0x1.73af0c737bb45p-6, + 0x1.685bb5134ef13p-6, + 0x1.5d55cb54cd53ap-6, + 0x1.529b9e8cf9a1ep-6, + 0x1.482b8455dc491p-6, + 0x1.3e03d891b37dep-6, + 0x1.3422fd6d12e2bp-6, + 0x1.2a875b5ffab56p-6, + 0x1.212f612dee7fbp-6, + 0x1.181983e5133ddp-6, + 0x1.0f443edc5ce49p-6, + 0x1.06ae13b0d3255p-6, + 0x1.fcab1483ea7fcp-7, + 0x1.ec72615a894c4p-7, + 0x1.dcaf3691fc448p-7, + 0x1.cd5ec93c12431p-7, + 0x1.be7e5ac24963bp-7, + 0x1.b00b38d6b3575p-7, + 0x1.a202bd6372dcep-7, + 0x1.94624e78e0fafp-7, + 0x1.87275e3a6869dp-7, + 0x1.7a4f6aca256cbp-7, + 0x1.6dd7fe3358230p-7, + 0x1.61beae53b72b7p-7, + 0x1.56011cc3b036dp-7, + 0x1.4a9cf6bda3f4cp-7, + 0x1.3f8ff5042a88ep-7, + 0x1.34d7dbc76d7e5p-7, + 0x1.2a727a89a3f14p-7, + 0x1.205dac02bd6b9p-7, + 0x1.1697560347b25p-7, + 0x1.0d1d69569b82dp-7, + 0x1.03ede1a45bfeep-7, + 0x1.f60d8aa2a88f2p-8, + 0x1.e4cc4abf7d065p-8, + 0x1.d4143a9dfe965p-8, + 0x1.c3e1a5f5c077cp-8, + 0x1.b430ecf4a83a8p-8, + 0x1.a4fe83fb9db25p-8, + 0x1.9646f35a76623p-8, + 0x1.8806d70b2fc36p-8, + 0x1.7a3ade6c8b3e4p-8, + 0x1.6cdfcbfc1e263p-8, + 0x1.5ff2750fe7820p-8, + 0x1.536fc18f7ce5cp-8, + 0x1.4754abacdf1dcp-8, + 0x1.3b9e3f9d06e3fp-8, + 0x1.30499b503957fp-8, + 0x1.2553ee2a336bfp-8, + 0x1.1aba78ba3af89p-8, + 0x1.107a8c7323a6ep-8, + 0x1.06918b6355624p-8, + 0x1.f9f9cfd9c3035p-9, + 0x1.e77448fb66bb9p-9, + 0x1.d58da68fd1170p-9, + 0x1.c4412bf4b8f0bp-9, + 0x1.b38a3af2e55b4p-9, + 0x1.a3645330550ffp-9, + 0x1.93cb11a30d765p-9, + 0x1.84ba3004a50d0p-9, + 0x1.762d84469c18fp-9, + 0x1.6821000795a03p-9, + 0x1.5a90b00981d93p-9, + 0x1.4d78bba8ca5fdp-9, + 0x1.40d564548fad7p-9, + 0x1.34a305080681fp-9, + 0x1.28de11c5031ebp-9, + 0x1.1d83170fbf6fbp-9, + 0x1.128eb96be8798p-9, + 0x1.07fdb4dafea5fp-9, + 0x1.fb99b8b8279e1p-10, + 0x1.e7f232d9e2630p-10, + 0x1.d4fed7195d7e8p-10, + 0x1.c2b9cf7f893bfp-10, + 0x1.b11d702b3deb1p-10, + 0x1.a024365f771bdp-10, + 0x1.8fc8c794b03b5p-10, + 0x1.8005f08d6f1efp-10, + 0x1.70d6a46e07ddap-10, + 0x1.6235fbd7a4345p-10, + 0x1.541f340697987p-10, + 0x1.468dadf4080abp-10, + 0x1.397ced7af2b15p-10, + 0x1.2ce898809244ep-10, + 0x1.20cc76202c5fap-10, + 0x1.15246dda49d47p-10, + 0x1.09ec86c75d497p-10, + 0x1.fe41cd9bb4eeep-11, + 0x1.e97ba3b77f306p-11, + 0x1.d57f524723822p-11, + 0x1.c245d4b998479p-11, + 0x1.afc85e0f82e12p-11, + 0x1.9e005769dbc1dp-11, + 0x1.8ce75e9f6f8a0p-11, + 0x1.7c7744d9378f7p-11, + 0x1.6caa0d3582fe9p-11, + 0x1.5d79eb71e893bp-11, + 0x1.4ee1429bf7cc0p-11, + 0x1.40daa3c89f5b6p-11, + 0x1.3360ccd23db3ap-11, + 0x1.266ea71d4f71ap-11, + 0x1.19ff4663ae9dfp-11, + 0x1.0e0de78654d1ep-11, + 0x1.0295ef6591848p-11, + 0x1.ef25d37f49fe1p-12, + 0x1.da01102b5f851p-12, + 0x1.c5b5412dcafadp-12, + 0x1.b23a5a23e4210p-12, + 0x1.9f8893d8fd1c1p-12, + 0x1.8d986a4187285p-12, + 0x1.7c629a822bc9ep-12, + 0x1.6be02102b3520p-12, + 0x1.5c0a378c90bcap-12, + 0x1.4cda5374ea275p-12, + 0x1.3e4a23d1f4702p-12, + 0x1.30538fbb77ecdp-12, + 0x1.22f0b496539bdp-12, + 0x1.161be46ad3b50p-12, + 0x1.09cfa445b00ffp-12, + 0x1.fc0d55470cf51p-13, + 0x1.e577bbcd49935p-13, + 0x1.cfd4a5adec5bfp-13, + 0x1.bb1a9657ce465p-13, + 0x1.a740684026555p-13, + 0x1.943d4a1d1ed39p-13, + 0x1.8208bc334a6a5p-13, + 0x1.709a8db59f25cp-13, + 0x1.5feada379d8b7p-13, + 0x1.4ff207314a102p-13, + 0x1.40a8c1949f75ep-13, + 0x1.3207fb7420eb9p-13, + 0x1.2408e9ba3327fp-13, + 0x1.16a501f0e42cap-13, + 0x1.09d5f819c9e29p-13, + 0x1.fb2b792b40a22p-14, + 0x1.e3bcf436a1a95p-14, + 0x1.cd55277c18d05p-14, + 0x1.b7e94604479dcp-14, + 0x1.a36eec00926ddp-14, + 0x1.8fdc1b2dcf7b9p-14, + 
0x1.7d2737527c3f9p-14, + 0x1.6b4702d7d5849p-14, + 0x1.5a329b7d30748p-14, + 0x1.49e17724f4d41p-14, + 0x1.3a4b60ba9aa4dp-14, + 0x1.2b6875310f785p-14, + 0x1.1d312098e9dbap-14, + 0x1.0f9e1b4dd36dfp-14, + 0x1.02a8673a94691p-14, + 0x1.ec929a665b449p-15, + 0x1.d4f4b4c8e09edp-15, + 0x1.be6abbb10a5aap-15, + 0x1.a8e8cc1fadef6p-15, + 0x1.94637d5bacfdbp-15, + 0x1.80cfdc72220cfp-15, + 0x1.6e2367dc27f95p-15, + 0x1.5c540b4936fd2p-15, + 0x1.4b581b8d170fcp-15, + 0x1.3b2652b06c2b2p-15, + 0x1.2bb5cc22e5db6p-15, + 0x1.1cfe010e2052dp-15, + 0x1.0ef6c4c84a0fep-15, + 0x1.01984165a5f36p-15, + 0x1.e9b5e8d00ce76p-16, + 0x1.d16f5716c6c1ap-16, + 0x1.ba4f035d60e02p-16, + 0x1.a447b7b03f045p-16, + 0x1.8f4ccca7fc90dp-16, + 0x1.7b5223dac7336p-16, + 0x1.684c227fcacefp-16, + 0x1.562fac4329b48p-16, + 0x1.44f21e49054f2p-16, + 0x1.34894a5e24657p-16, + 0x1.24eb7254ccf83p-16, + 0x1.160f438c70913p-16, + 0x1.07ebd2a2d2844p-16, + 0x1.f4f12e9ab070ap-17, + 0x1.db5ad0b27805cp-17, + 0x1.c304efa2c6f4ep-17, + 0x1.abe09e9144b5ep-17, + 0x1.95df988e76644p-17, + 0x1.80f439b4ee04bp-17, + 0x1.6d11788a69c64p-17, + 0x1.5a2adfa0b4bc4p-17, + 0x1.4834877429b8fp-17, + 0x1.37231085c7d9ap-17, + 0x1.26eb9daed6f7ep-17, + 0x1.1783ceac28910p-17, + 0x1.08e1badf0fcedp-17, + 0x1.f5f7d88472604p-18, + 0x1.db92b5212fb8dp-18, + 0x1.c282cd3957edap-18, + 0x1.aab7abace48dcp-18, + 0x1.94219bfcb4928p-18, + 0x1.7eb1a2075864dp-18, + 0x1.6a597219a93d9p-18, + 0x1.570b69502f313p-18, + 0x1.44ba864670882p-18, + 0x1.335a62115bce2p-18, + 0x1.22df298214423p-18, + 0x1.133d96ae7e0ddp-18, + 0x1.046aeabcfcdecp-18, + 0x1.ecb9cfe1d8642p-19, + 0x1.d21397ead99cbp-19, + 0x1.b8d094c86d374p-19, + 0x1.a0df0f0c626dcp-19, + 0x1.8a2e269750a39p-19, + 0x1.74adc8f4064d3p-19, + 0x1.604ea819f007cp-19, + 0x1.4d0231928c6f9p-19, + 0x1.3aba85fe22e1fp-19, + 0x1.296a70f414053p-19, + 0x1.1905613b3abf2p-19, + 0x1.097f6156f32c5p-19, + 0x1.f59a20caf6695p-20, + 0x1.d9c73698fb1dcp-20, + 0x1.bf716c6168baep-20, + 0x1.a6852c6b58392p-20, + 0x1.8eefd70594a88p-20, + 0x1.789fb715aae95p-20, + 0x1.6383f726a8e04p-20, + 0x1.4f8c96f26a26ap-20, + 0x1.3caa61607f920p-20, + 0x1.2acee2f5ecdb8p-20, + 0x1.19ec60b1242edp-20, + 0x1.09f5cf4dd2877p-20, + 0x1.f5bd95d8730d8p-21, + 0x1.d9371e2ff7c35p-21, + 0x1.be41de54d155ap-21, + 0x1.a4c89e08ef4f3p-21, + 0x1.8cb738399b12cp-21, + 0x1.75fa8dbc84becp-21, + 0x1.608078a70dcbcp-21, + 0x1.4c37c0394d094p-21, + 0x1.39100d5687bfep-21, + 0x1.26f9df8519bd6p-21, + 0x1.15e6827001f18p-21, + 0x1.05c803e4831c1p-21, + 0x1.ed22548cffd35p-22, + 0x1.d06ad6ecdf971p-22, + 0x1.b551c847fbc96p-22, + 0x1.9bc09f112b494p-22, + 0x1.83a1ff0aa239dp-22, + 0x1.6ce1aa3fd7bddp-22, + 0x1.576c72b514859p-22, + 0x1.43302cc4a0da8p-22, + 0x1.301ba221dc9bbp-22, + 0x1.1e1e857adc568p-22, + 0x1.0d2966b1746f7p-22, + 0x1.fa5b4f49cc6b2p-23, + 0x1.dc3ae30b55c16p-23, + 0x1.bfd7555a3bd68p-23, + 0x1.a517d9e61628ap-23, + 0x1.8be4f8f6c951fp-23, + 0x1.74287ded49339p-23, + 0x1.5dcd669f2cd34p-23, + 0x1.48bfd38302870p-23, + 0x1.34ecf8a3c124ap-23, + 0x1.22430f521cbcfp-23, + 0x1.10b1488aeb235p-23, + 0x1.0027c00a263a6p-23, + 0x1.e12ee004efc37p-24, + 0x1.c3e44ae32b16bp-24, + 0x1.a854ea14102a8p-24, + 0x1.8e6761569f45dp-24, + 0x1.7603bac345f65p-24, + 0x1.5f1353cdad001p-24, + 0x1.4980cb3c80949p-24, + 0x1.3537f00b6ad4dp-24, + 0x1.2225b12bffc68p-24, + 0x1.10380e1adb7e9p-24, + 0x1.febc107d5efaap-25, + 0x1.df0f2a0ee6946p-25, + 0x1.c14b2188bcee4p-25, + 0x1.a553644f7f07dp-25, + 0x1.8b0cfce0579dfp-25, + 0x1.725e7c5dd20f7p-25, + 0x1.5b2fe547a1340p-25, + 0x1.456a974e92e93p-25, + 0x1.30f93c3699078p-25, + 0x1.1dc7b5b978cf8p-25, + 0x1.0bc30c5d52f15p-25, + 
0x1.f5b2be65a0c7fp-26, + 0x1.d5f3a8dea7357p-26, + 0x1.b82915b03515bp-26, + 0x1.9c3517e789488p-26, + 0x1.81fb7df06136ep-26, + 0x1.6961b8d641d06p-26, + 0x1.524ec4d916caep-26, + 0x1.3cab1343d18d1p-26, + 0x1.2860757487a01p-26, + 0x1.155a09065d4f7p-26, + 0x1.0384250e4c9fcp-26, + 0x1.e59890b926c78p-27, + 0x1.c642116a8a9e3p-27, + 0x1.a8e405e651ab6p-27, + 0x1.8d5f98114f872p-27, + 0x1.7397c5a66e307p-27, + 0x1.5b71456c5a4c4p-27, + 0x1.44d26de513197p-27, + 0x1.2fa31d6371537p-27, + 0x1.1bcca373b7b43p-27, + 0x1.0939ab853339fp-27, + 0x1.efac5187b2863p-28, + 0x1.cf1e86235d0e6p-28, + 0x1.b0a68a2128babp-28, + 0x1.9423165bc4444p-28, + 0x1.7974e743dea3cp-28, + 0x1.607e9eacd1050p-28, + 0x1.4924a74dec728p-28, + 0x1.334d19e0c2160p-28, + 0x1.1edfa3c5f5ccap-28, + 0x1.0bc56f1b54701p-28, + 0x1.f3d2185e047d9p-29, + 0x1.d26cb87945e87p-29, + 0x1.b334fac4b9f99p-29, + 0x1.96076f7918d1cp-29, + 0x1.7ac2d72fc2c63p-29, + 0x1.614801550319ep-29, + 0x1.4979ac8b28926p-29, + 0x1.333c68e2d0548p-29, + 0x1.1e767bce37dd7p-29, + 0x1.0b0fc5b6d05a0p-29, + 0x1.f1e3523b41d7dp-30, + 0x1.d00de6608effep-30, + 0x1.b0778b7b3301ap-30, + 0x1.92fb04ec0f6cfp-30, + 0x1.77756ec9f78fap-30, + 0x1.5dc61922d5a06p-30, + 0x1.45ce65699ff6dp-30, + 0x1.2f71a5f159970p-30, + 0x1.1a94ff571654fp-30, + 0x1.071f4bbea09ecp-30, + 0x1.e9f1ff8ddd774p-31, + 0x1.c818223a202c7p-31, + 0x1.a887bd2b4404dp-31, + 0x1.8b1a336c5eb6bp-31, + 0x1.6fab63324088ap-31, + 0x1.56197e30205bap-31, + 0x1.3e44e45301b92p-31, + 0x1.281000bfe4c3fp-31, + 0x1.135f28f2d50b4p-31, + 0x1.00187dded5975p-31, + 0x1.dc479de0ef001p-32, + 0x1.bad4fdad3caa1p-32, + 0x1.9baed3ed27ab8p-32, + 0x1.7ead9ce4285bbp-32, + 0x1.63ac6b4edc88ep-32, + 0x1.4a88be2a6390cp-32, + 0x1.332259185f1a0p-32, + 0x1.1d5b1f3793044p-32, + 0x1.0916f04b6e18bp-32, + 0x1.ec77101de6926p-33, + 0x1.c960bf23153e0p-33, + 0x1.a8bd20fc65ef7p-33, + 0x1.8a61745ec7d1dp-33, + 0x1.6e25d0e756261p-33, + 0x1.53e4f7d1666cbp-33, + 0x1.3b7c27a7ddb0ep-33, + 0x1.24caf2c32af14p-33, + 0x1.0fb3186804d0fp-33, + 0x1.f830c0bb41fd7p-34, + 0x1.d3c0f1a91c846p-34, + 0x1.b1e5acf351d87p-34, + 0x1.92712d259ce66p-34, + 0x1.7538c60a04476p-34, + 0x1.5a14b04b47879p-34, + 0x1.40dfd87456f4cp-34, + 0x1.2977b1172b9d5p-34, + 0x1.13bc07e891491p-34, + 0x1.ff1dbb4300811p-35, + 0x1.d9a880f306bd8p-35, + 0x1.b6e45220b55e0p-35, + 0x1.96a0b33f2c4dap-35, + 0x1.78b07e9e924acp-35, + 0x1.5ce9ab1670dd2p-35, + 0x1.4325167006bb0p-35, + 0x1.2b3e53538ff3fp-35, + 0x1.15137a7f44864p-35, + 0x1.0084ff125639dp-35, + 0x1.daeb0b7311ec7p-36, + 0x1.b7937d1c40c52p-36, + 0x1.96d082f59ab06p-36, + 0x1.7872d9fa10aadp-36, + 0x1.5c4e8e37bc7d0p-36, + 0x1.423ac0df49a40p-36, + 0x1.2a117230ad284p-36, + 0x1.13af4f04f9998p-36, + 0x1.fde703724e560p-37, + 0x1.d77f0c82e7641p-37, + 0x1.b3ee02611d7ddp-37, + 0x1.92ff33023d5bdp-37, + 0x1.7481a9e69f53fp-37, + 0x1.5847eda620959p-37, + 0x1.3e27c1fcc74bdp-37, + 0x1.25f9ee0b923dcp-37, + 0x1.0f9a0686531ffp-37, + 0x1.f5cc7718082afp-38, + 0x1.cf7e53d6a2ca5p-38, + 0x1.ac0f5f3229372p-38, + 0x1.8b498644847eap-38, + 0x1.6cfa9bcca59dcp-38, + 0x1.50f411d4fd2cdp-38, + 0x1.370ab8327af5ep-38, + 0x1.1f167f88c6b6ep-38, + 0x1.08f24085d4597p-38, + 0x1.e8f70e181d619p-39, + 0x1.c324c20e337dcp-39, + 0x1.a03261574b54ep-39, + 0x1.7fe903cdf5855p-39, + 0x1.6215c58da3450p-39, + 0x1.46897d4b69fc6p-39, + 0x1.2d1877d731b7bp-39, + 0x1.159a386b11517p-39, + 0x1.ffd27ae9393cep-40, + 0x1.d7c593130dd0bp-40, + 0x1.b2cd607c79bcfp-40, + 0x1.90ae4d3405651p-40, + 0x1.71312dd1759e2p-40, + 0x1.5422ef5d8949dp-40, + 0x1.39544b0ecc957p-40, + 0x1.20997f73e73ddp-40, + 0x1.09ca0eaacd277p-40, + 0x1.e9810295890ecp-41, + 
0x1.c2b45b5aa4a1dp-41, + 0x1.9eee068fa7596p-41, + 0x1.7df2b399c10a8p-41, + 0x1.5f8b87a31bd85p-41, + 0x1.4385c96e9a2d9p-41, + 0x1.29b2933ef4cbcp-41, + 0x1.11e68a6378f8ap-41, + 0x1.f7f338086a86bp-42, + 0x1.cf8d7d9ce040ap-42, + 0x1.aa577251ae484p-42, + 0x1.8811d739efb5ep-42, + 0x1.68823e52970bep-42, + 0x1.4b72ae68e8b4cp-42, + 0x1.30b14dbe876bcp-42, + 0x1.181012ef86610p-42, + 0x1.01647ba798744p-42, + 0x1.d90e917701675p-43, + 0x1.b2a87e86d0c8ap-43, + 0x1.8f53dcb377293p-43, + 0x1.6ed2f2515e933p-43, + 0x1.50ecc9ed47f19p-43, + 0x1.356cd5ce7799ep-43, + 0x1.1c229a587ab78p-43, + 0x1.04e15ecc7f3f6p-43, + 0x1.deffc7e6a6017p-44, + 0x1.b7b040832f310p-44, + 0x1.938e021f36d76p-44, + 0x1.7258610b3b233p-44, + 0x1.53d3bfc82a909p-44, + 0x1.37c92babdc2fdp-44, + 0x1.1e06010120f6ap-44, + 0x1.065b9616170d4p-44, + 0x1.e13dd96b3753ap-45, + 0x1.b950d32467392p-45, + 0x1.94a72263259a5p-45, + 0x1.72fd93e036cdcp-45, + 0x1.54164576929abp-45, + 0x1.37b83c521fe96p-45, + 0x1.1daf033182e96p-45, + 0x1.05ca50205d26ap-45, + 0x1.dfbb6235639fap-46, + 0x1.b7807e294781fp-46, + 0x1.9298add70a734p-46, + 0x1.70beaf9c7ffb6p-46, + 0x1.51b2cd6709222p-46, + 0x1.353a6cf7f7fffp-46, + 0x1.1b1fa8cbe84a7p-46, + 0x1.0330f0fd69921p-46, + 0x1.da81670f96f9bp-47, + 0x1.b24a16b4d09aap-47, + 0x1.8d6eeb6efdbd6p-47, + 0x1.6ba91ac734785p-47, + 0x1.4cb7966770ab5p-47, + 0x1.305e9721d0981p-47, + 0x1.1667311fff70ap-47, + 0x1.fd3de10d62855p-48, + 0x1.d1aefbcd48d0cp-48, + 0x1.a9cc93c25aca9p-48, + 0x1.85487ee3ea735p-48, + 0x1.63daf8b4b1e0cp-48, + 0x1.45421e69a6ca1p-48, + 0x1.294175802d99ap-48, + 0x1.0fa17bf41068fp-48, + 0x1.f05e82aae2bb9p-49, + 0x1.c578101b29058p-49, + 0x1.9e39dc5dd2f7cp-49, + 0x1.7a553a728bbf2p-49, + 0x1.5982008db1304p-49, + 0x1.3b7e00422e51bp-49, + 0x1.200c898d9ee3ep-49, + 0x1.06f5f7eb65a56p-49, + 0x1.e00e9148a1d25p-50, + 0x1.b623734024e92p-50, + 0x1.8fd4e01891bf8p-50, + 0x1.6cd44c7470d89p-50, + 0x1.4cd9c04158cd7p-50, + 0x1.2fa34bf5c8344p-50, + 0x1.14f4890ff2461p-50, + 0x1.f92c49dfa4df5p-51, + 0x1.ccaaea71ab0dfp-51, + 0x1.a40829f001197p-51, + 0x1.7eef13b59e96cp-51, + 0x1.5d11e1a252bf5p-51, + 0x1.3e296303b2297p-51, + 0x1.21f47009f43cep-51, + 0x1.083768c5e4541p-51, + 0x1.e1777d831265ep-52, + 0x1.b69f10b0191b5p-52, + 0x1.8f8a3a05b5b52p-52, + 0x1.6be573c40c8e7p-52, + 0x1.4b645ba991fdbp-52, + 0x1.2dc119095729fp-52, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfc_1u8.c b/contrib/arm-optimized-routines/pl/math/sv_erfc_1u8.c new file mode 100644 index 000000000000..a91bef96f2e7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erfc_1u8.c @@ -0,0 +1,164 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint64_t off_idx, off_arr; + double max, shift; + double p20, p40, p41, p42; + double p51, p52; + double q5, r5; + double q6, r6; + double q7, r7; + double q8, r8; + double q9, r9; + uint64_t table_scale; +} data = { + /* Set an offset so the range of the index used for lookup is 3487, and it + can be clamped using a saturated add on an offset index. + Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */ + .off_idx = 0xbd3ffffffffff260, + .off_arr = 0xfffffffffffff260, /* 0xffffffffffffffff - 3487. */ + .max = 0x1.b3ep+4, /* 3487/128. */ + .shift = 0x1p45, + .table_scale = 0x37f0000000000000, /* asuint64(0x1p-128). */ + .p20 = 0x1.5555555555555p-2, /* 1/3, used to compute 2/3 and 1/6. 
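+ 2/3 and 1/6 are derived in the routine as 2 * p20 and 0.5 * p20 + (the twothird and sixth temporaries), so a single stored constant + serves all three coefficients.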
*/ + .p40 = -0x1.999999999999ap-4, /* 1/10. */ + .p41 = -0x1.999999999999ap-2, /* 2/5. */ + .p42 = 0x1.1111111111111p-3, /* 2/15. */ + .p51 = -0x1.c71c71c71c71cp-3, /* 2/9. */ + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ + /* Qi = (i+1) / i, for i = 5, ..., 9. */ + .q5 = 0x1.3333333333333p0, + .q6 = 0x1.2aaaaaaaaaaabp0, + .q7 = 0x1.2492492492492p0, + .q8 = 0x1.2p0, + .q9 = 0x1.1c71c71c71c72p0, + /* Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .r5 = -0x1.e79e79e79e79ep-3, + .r6 = -0x1.b6db6db6db6dbp-3, + .r7 = -0x1.8e38e38e38e39p-3, + .r8 = -0x1.6c16c16c16c17p-3, + .r9 = -0x1.4f2094f2094f2p-3, +}; + +/* Optimized double-precision vector erfc(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + _ZGVsMxv_erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + + svfloat64_t a = svabs_x (pg, x); + + /* Clamp input at |x| <= 3487/128. */ + a = svmin_x (pg, a, dat->max); + + /* Reduce x to the nearest multiple of 1/128. */ + svfloat64_t shift = sv_f64 (dat->shift); + svfloat64_t z = svadd_x (pg, a, shift); + + /* Saturate index for the NaN case. */ + svuint64_t i = svqadd (svreinterpret_u64 (z), dat->off_idx); + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ + i = svadd_x (pg, i, i); + const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr; + svfloat64_t erfcr = svld1_gather_index (pg, p, i); + svfloat64_t scale = svld1_gather_index (pg, p + 1, i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + svfloat64_t r = svsub_x (pg, z, shift); + svfloat64_t d = svsub_x (pg, a, r); + svfloat64_t d2 = svmul_x (pg, d, d); + svfloat64_t r2 = svmul_x (pg, r, r); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p9(r) * d^9. */ + svfloat64_t p1 = r; + svfloat64_t third = sv_f64 (dat->p20); + svfloat64_t twothird = svmul_x (pg, third, 2.0); + svfloat64_t sixth = svmul_x (pg, third, 0.5); + svfloat64_t p2 = svmls_x (pg, third, r2, twothird); + svfloat64_t p3 = svmad_x (pg, r2, third, -0.5); + p3 = svmul_x (pg, r, p3); + svfloat64_t p4 = svmla_x (pg, sv_f64 (dat->p41), r2, dat->p42); + p4 = svmls_x (pg, sv_f64 (dat->p40), r2, p4); + svfloat64_t p5 = svmla_x (pg, sv_f64 (dat->p51), r2, dat->p52); + p5 = svmla_x (pg, sixth, r2, p5); + p5 = svmul_x (pg, r, p5); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. 
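+ Here Q_{i+1} = (i+2)/(i+1) and R_{i+1} = -2(i+1)/((i+2)(i+3)) (the + q5..q9 and r5..r9 constants above), obtained by solving the three-term + relation in the header for p_{i+2}. For i = 4 this reads + p6 = (p4 + (6/5) r p5) * (-10/42), the first svmla_x/svmul_lane pair + below.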
*/ + svfloat64_t qr5 = svld1rq (svptrue_b64 (), &dat->q5); + svfloat64_t qr6 = svld1rq (svptrue_b64 (), &dat->q6); + svfloat64_t qr7 = svld1rq (svptrue_b64 (), &dat->q7); + svfloat64_t qr8 = svld1rq (svptrue_b64 (), &dat->q8); + svfloat64_t qr9 = svld1rq (svptrue_b64 (), &dat->q9); + svfloat64_t p6 = svmla_x (pg, p4, p5, svmul_lane (r, qr5, 0)); + p6 = svmul_lane (p6, qr5, 1); + svfloat64_t p7 = svmla_x (pg, p5, p6, svmul_lane (r, qr6, 0)); + p7 = svmul_lane (p7, qr6, 1); + svfloat64_t p8 = svmla_x (pg, p6, p7, svmul_lane (r, qr7, 0)); + p8 = svmul_lane (p8, qr7, 1); + svfloat64_t p9 = svmla_x (pg, p7, p8, svmul_lane (r, qr8, 0)); + p9 = svmul_lane (p9, qr8, 1); + svfloat64_t p10 = svmla_x (pg, p8, p9, svmul_lane (r, qr9, 0)); + p10 = svmul_lane (p10, qr9, 1); + /* Compute polynomial in d using pairwise Horner scheme. */ + svfloat64_t p90 = svmla_x (pg, p9, d, p10); + svfloat64_t p78 = svmla_x (pg, p7, d, p8); + svfloat64_t p56 = svmla_x (pg, p5, d, p6); + svfloat64_t p34 = svmla_x (pg, p3, d, p4); + svfloat64_t p12 = svmla_x (pg, p1, d, p2); + svfloat64_t y = svmla_x (pg, p78, d2, p90); + y = svmla_x (pg, p56, d2, y); + y = svmla_x (pg, p34, d2, y); + y = svmla_x (pg, p12, d2, y); + + y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y)); + + /* Offset equals 2.0 if sign, else 0.0. */ + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t off = svreinterpret_f64 (svlsr_x (pg, sign, 1)); + /* Handle sign and scale back in a single fma. */ + svfloat64_t fac = svreinterpret_f64 (svorr_x (pg, sign, dat->table_scale)); + + return svmla_x (pg, off, fac, y); +} + +PL_SIG (SV, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (SV_NAME_D1 (erfc), 1.21) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -6.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c deleted file mode 100644 index 076b47129862..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Double-precision SVE erfc(x) function. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED -#include "sv_exp_tail.h" - -sv_f64_t __sv_exp_x (sv_f64_t, svbool_t); - -static NOINLINE sv_f64_t -specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) -{ - return sv_call_f64 (erfc, x, y, special); -} - -static inline sv_u64_t -lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x) -{ - /* Interval index is calculated by (((abs(x) + 1)^4) >> 53) - 1023, bounded by - the number of polynomials. 
*/ - sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1); - xp1 = svmul_f64_x (pg, xp1, xp1); - xp1 = svmul_f64_x (pg, xp1, xp1); - sv_u64_t interval_idx - = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023); - return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS), - interval_idx, sv_u64 (ERFC_NUM_INTERVALS)); -} - -static inline sv_f64_t -sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx) -{ - sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1); - const double *base = &__v_erfc_data.poly[0][12]; - sv_f64_t r = sv_lookup_f64_x (pg, base, offset); - for (int i = 0; i < ERFC_POLY_ORDER; i++) - { - base--; - sv_f64_t c = sv_lookup_f64_x (pg, base, offset); - r = sv_fma_f64_x (pg, z, r, c); - } - return r; -} - -static inline sv_f64_t -sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x) -{ - /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding - errors in x^2, so we compute an estimate for the error and use a custom exp - helper which corrects for the calculated error estimate. */ - sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x); - - /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and - a_lo is the 'small' component. */ - const sv_f64_t scale = sv_f64 (0x1.0000002p27); - sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x, - svneg_f64_x (pg, abs_x))); - a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi); - sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi); - - sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi); - sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo); - - /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) - - (a_hi + a_lo) * (a_hi + a_lo). */ - sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2); - e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2); - e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2); - e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2); - - return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2); -} - -/* Optimized double precision vector complementary error function erfc. - Maximum measured error is 3.64 ULP: - __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42 - want 0x1.ff3f4c8e200d9p-42. */ -sv_f64_t -__sv_erfc_x (sv_f64_t x, const svbool_t pg) -{ - sv_u64_t ix = sv_as_u64_f64 (x); - sv_f64_t abs_x = svabs_f64_x (pg, x); - sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52); - - /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes - to 2. As long as the polynomial is 0 in the boring zone, we can assemble - the result correctly. This is dealt with in two ways: - - The 'coarse approach' is that the approximation algorithm is - zero-predicated on in_bounds = |x| < 32, which saves the need to do - coefficient lookup etc for |x| >= 32. - - The coarse approach misses [-32, -6] and [28, 32], which are dealt with in - the polynomial and index calculation, such that the polynomial evaluates to - 0 in these regions. */ - /* in_bounds is true for lanes where |x| < 32. */ - svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404); - /* boring_zone = 2 for x < 0, 0 otherwise. */ - sv_f64_t boring_zone - = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62)); - /* Very small, nan and inf. */ - svbool_t special_cases - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432); - - /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2) - - Where P_i is a polynomial and x_i is an offset, both defined in - v_erfc_data.c. i is chosen based on which interval x falls in. 
*/ - sv_u64_t i = lookup_interval_idx (in_bounds, abs_x); - sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i); - sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i); - /* 'copy' sign of x to p, i.e. negate p if x is negative. */ - sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff); - p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign)); - - sv_f64_t e = sv_eval_gauss (in_bounds, abs_x); - - /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally - select boring_zone because P[V_ERFC_NINTS-1]=0. */ - sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone); - - if (unlikely (svptest_any (pg, special_cases))) - { - return specialcase (x, y, special_cases); - } - return y; -} - -PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc) - -PL_SIG (SV, D, 1, erfc, -4.0, 10.0) -PL_TEST_ULP (__sv_erfc, 3.15) -PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfcf_1u7.c b/contrib/arm-optimized-routines/pl/math/sv_erfcf_1u7.c new file mode 100644 index 000000000000..cda8f0b3752e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erfcf_1u7.c @@ -0,0 +1,111 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint32_t off_idx, off_arr; + float max, shift; + float third, two_thirds, two_over_fifteen, two_over_five, tenth; +} data = { + /* Set an offset so the range of the index used for lookup is 644, and it can + be clamped using a saturated add. */ + .off_idx = 0xb7fffd7b, /* 0xffffffff - asuint(shift) - 644. */ + .off_arr = 0xfffffd7b, /* 0xffffffff - 644. */ + .max = 10.0625f, /* 644/64. */ + .shift = 0x1p17f, + .third = 0x1.555556p-2f, + .two_thirds = 0x1.555556p-1f, + .two_over_fifteen = 0x1.111112p-3f, + .two_over_five = -0x1.99999ap-2f, + .tenth = -0x1.99999ap-4f, +}; + +#define SignMask 0x80000000 +#define TableScale 0x28000000 /* 0x1p-47. */ + +/* Optimized single-precision vector erfcf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + _ZGVsMxv_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + + svfloat32_t a = svabs_x (pg, x); + + /* Clamp input at |x| <= 10.0 + 4/64. */ + a = svmin_x (pg, a, dat->max); + + /* Reduce x to the nearest multiple of 1/64. */ + svfloat32_t shift = sv_f32 (dat->shift); + svfloat32_t z = svadd_x (pg, a, shift); + + /* Saturate index for the NaN case. 
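+ svqadd clamps the unsigned sum at 0xffffffff: in-range lanes map to + [0xffffffff - 644, 0xffffffff], while NaN lanes (for which asuint (z) + exceeds asuint (shift) + 644) saturate to the last table entry instead + of yielding an out-of-range gather index.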
*/ + svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ + i = svmul_x (pg, i, 2); + const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr; + svfloat32_t erfcr = svld1_gather_index (pg, p, i); + svfloat32_t scale = svld1_gather_index (pg, p + 1, i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + svfloat32_t r = svsub_x (pg, z, shift); + svfloat32_t d = svsub_x (pg, a, r); + svfloat32_t d2 = svmul_x (pg, d, d); + svfloat32_t r2 = svmul_x (pg, r, r); + + svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); + svfloat32_t third = svdup_lane (coeffs, 0); + + svfloat32_t p1 = r; + svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1); + svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); + svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); + p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); + + svfloat32_t y = svmla_x (pg, p3, d, p4); + y = svmla_x (pg, p2, d, y); + y = svmla_x (pg, p1, d, y); + + /* Solves the |x| = inf/nan case. */ + y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y)); + + /* Offset equals 2.0f if sign, else 0.0f. */ + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), SignMask); + svfloat32_t off = svreinterpret_f32 (svlsr_x (pg, sign, 1)); + /* Handle sign and scale back in a single fma. */ + svfloat32_t fac = svreinterpret_f32 (svorr_x (pg, sign, TableScale)); + + return svmla_x (pg, off, fac, y); +} + +PL_SIG (SV, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (SV_NAME_F1 (erfc), 1.14) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000) +PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c b/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c deleted file mode 100644 index c7a738c55f7b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Single-precision vector erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -#define AbsMask (0x7fffffff) - -static NOINLINE sv_f32_t -__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) -{ - return sv_call_f32 (erff, x, y, cmp); -} - -sv_f32_t __sv_expf_x (svbool_t, sv_f32_t); - -/* Optimized single precision vector erf. Worst-case error is 1.25 ULP: - __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1 - want 0x1.9f9c8ap-1. */ -sv_f32_t -__sv_erff_x (sv_f32_t x, const svbool_t pg) -{ - sv_u32_t ix = sv_as_u32_f32 (x); - sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff); - /* Handle both inf/nan as well as small values (|x|<2^-28). */ - svbool_t cmp - = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180); - - sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); - /* |x| < 0.921875. */ - svbool_t red = svaclt_n_f32 (pg, x, 0.921875f); - /* |x| > 4.0. */ - svbool_t bor = svacgt_n_f32 (pg, x, 4.0f); - - /* Load polynomial coefficients. 
*/ - sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1)); - sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2); - - const float *base = (float *) __v_erff_data.coeffs; - sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2); - sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6); - sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10); - - /* Do not need to store elem 0 of __v_erff_data as it is not used. */ - sv_f32_t p1 = svtbl (c_2_5, idx_lo); - sv_f32_t p2 = svtbl (c_2_5, idx_hi); - sv_f32_t p3 = svtbl (c_6_9, idx_lo); - sv_f32_t p4 = svtbl (c_6_9, idx_hi); - sv_f32_t p5 = svtbl (c_10_13, idx_lo); - sv_f32_t p6 = svtbl (c_10_13, idx_hi); - - sv_f32_t a = svabs_f32_x (pg, x); - /* Square with merging mul - z is x^2 for reduced, |x| otherwise. */ - sv_f32_t z = svmul_f32_m (red, a, a); - - /* Evaluate polynomial on |x| or x^2. */ - sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5); - r = sv_fma_f32_x (pg, z, r, p4); - r = sv_fma_f32_x (pg, z, r, p3); - r = sv_fma_f32_x (pg, z, r, p2); - r = sv_fma_f32_x (pg, z, r, p1); - /* Use merging svmad for last operation - apply first coefficient if not - reduced, otherwise r is propagated unchanged. This is because the reduced - polynomial has lower order than the non-reduced. */ - r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]); - r = sv_fma_f32_x (pg, a, r, a); - - /* y = |x| + |x| * P(x^2) if |x| < 0.921875 - y = 1 - exp (-(|x| + |x| * P(|x|))) otherwise. */ - sv_f32_t y = __sv_expf_x (pg, svneg_f32_x (pg, r)); - y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0)); - - /* Boring domain (absolute value is required to get the sign of erf(-nan) - right). */ - y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y)); - - /* y = erf(x) if x>0, -erf(-x) otherwise. */ - y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); - - if (unlikely (svptest_any (pg, cmp))) - return __sv_erff_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff) - -PL_SIG (SV, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (__sv_erff, 0.76) -PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000) -PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000) -PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000) -PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000) -PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000) -PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_2u.c b/contrib/arm-optimized-routines/pl/math/sv_erff_2u.c new file mode 100644 index 000000000000..adeee798ee2e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erff_2u.c @@ -0,0 +1,90 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float min, max, scale, shift, third; +} data = { + .min = 0x1.cp-7f, /* 1/64 - 1/512. */ + .max = 3.9375, /* 4 - 8/128. */ + .scale = 0x1.20dd76p+0f, /* 2/sqrt(pi). */ + .shift = 0x1p16f, + .third = 0x1.555556p-2f, /* 1/3. */ +}; + +#define SignMask (0x80000000) + +/* Single-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. 
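+ The lookup index is asuint (|x| + shift) - asuint (shift): one ULP of + shift = 0x1p16f is 2^-7, so the subtraction counts the number of 1/128 + steps in |x|, i.e. 128 * r. +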
+ For |x| < 0x1.cp-7, the algorithm sets r = 0, erf(r) = 0, and scale = 2 / + sqrt(pi), so it simply boils down to a Taylor series expansion near 0. For + |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error on each interval: + - [0, 0x1.cp-7]: 1.93 ULP + _ZGVsMxv_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 want 0x1.fd6868p-9 + - [0x1.cp-7, 4.0]: 1.26 ULP + _ZGVsMxv_erff(0x1.1d002ep+0) got 0x1.c4eb9ap-1 want 0x1.c4eb98p-1. */ +svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + + /* |x| > 1/64 - 1/512. */ + svbool_t a_gt_min = svacgt (pg, x, dat->min); + + /* |x| >= 4.0 - 8/128. */ + svbool_t a_ge_max = svacge (pg, x, dat->max); + svfloat32_t a = svabs_x (pg, x); + + svfloat32_t shift = sv_f32 (dat->shift); + svfloat32_t z = svadd_x (pg, a, shift); + svuint32_t i + = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift)); + + /* Saturate lookup index. */ + i = svsel (a_ge_max, sv_u32 (512), i); + + /* r and erf(r) set to 0 for |x| below min. */ + svfloat32_t r = svsub_z (a_gt_min, z, shift); + svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i); + + /* scale set to 2/sqrt(pi) for |x| below min. */ + svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i); + scale = svsel (a_gt_min, scale, sv_f32 (dat->scale)); + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + svfloat32_t d = svsub_x (pg, a, r); + svfloat32_t d2 = svmul_x (pg, d, d); + svfloat32_t y = svmla_x (pg, r, d, dat->third); + y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y)); + + /* Solves the |x| = inf case. */ + y = svsel (a_ge_max, sv_f32 (1.0f), y); + + /* Copy sign. */ + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); + svuint32_t sign = svand_x (pg, ix, SignMask); + return svreinterpret_f32 (svorr_x (pg, sign, iy)); +} + +PL_SIG (SV, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (SV_NAME_F1 (erf), 1.43) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_data.c b/contrib/arm-optimized-routines/pl/math/sv_erff_data.c new file mode 100644 index 000000000000..154d3c188874 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erff_data.c @@ -0,0 +1,1046 @@ +/* + * Data for approximation of vector erff. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in SVE erff. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 4.0 (513 values): + - __sv_erff_data.erf contains the values of erf(r), + - __sv_erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2). + Note that indices 0 and 1 are never hit by the algorithm, since lookup is + performed only for x >= 1/64-1/512. 
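+ Entry i corresponds to r = i/128; the erf routine clamps the index to + 512 (r = 4.0, where erf(r) rounds to 1.0f) for all |x| >= 4 - 8/128.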
*/ +const struct sv_erff_data __sv_erff_data = { + .erf = { 0x0.000000p+0, + 0x1.20dbf4p-7, + 0x1.20d770p-6, + 0x1.b137e0p-6, + 0x1.20c564p-5, + 0x1.68e5d4p-5, + 0x1.b0fafep-5, + 0x1.f902a8p-5, + 0x1.207d48p-4, + 0x1.44703ep-4, + 0x1.68591ap-4, + 0x1.8c36bep-4, + 0x1.b00812p-4, + 0x1.d3cbf8p-4, + 0x1.f7815ap-4, + 0x1.0d9390p-3, + 0x1.1f5e1ap-3, + 0x1.311fc2p-3, + 0x1.42d7fcp-3, + 0x1.548642p-3, + 0x1.662a0cp-3, + 0x1.77c2d2p-3, + 0x1.895010p-3, + 0x1.9ad142p-3, + 0x1.ac45e4p-3, + 0x1.bdad72p-3, + 0x1.cf076ep-3, + 0x1.e05354p-3, + 0x1.f190aap-3, + 0x1.015f78p-2, + 0x1.09eed6p-2, + 0x1.127632p-2, + 0x1.1af54ep-2, + 0x1.236bf0p-2, + 0x1.2bd9dcp-2, + 0x1.343ed6p-2, + 0x1.3c9aa8p-2, + 0x1.44ed18p-2, + 0x1.4d35f0p-2, + 0x1.5574f4p-2, + 0x1.5da9f4p-2, + 0x1.65d4b8p-2, + 0x1.6df50ap-2, + 0x1.760abap-2, + 0x1.7e1594p-2, + 0x1.861566p-2, + 0x1.8e0a02p-2, + 0x1.95f336p-2, + 0x1.9dd0d2p-2, + 0x1.a5a2acp-2, + 0x1.ad6896p-2, + 0x1.b52264p-2, + 0x1.bccfecp-2, + 0x1.c47104p-2, + 0x1.cc0584p-2, + 0x1.d38d44p-2, + 0x1.db081cp-2, + 0x1.e275eap-2, + 0x1.e9d68ap-2, + 0x1.f129d4p-2, + 0x1.f86faap-2, + 0x1.ffa7eap-2, + 0x1.03693ap-1, + 0x1.06f794p-1, + 0x1.0a7ef6p-1, + 0x1.0dff50p-1, + 0x1.117894p-1, + 0x1.14eab4p-1, + 0x1.1855a6p-1, + 0x1.1bb95cp-1, + 0x1.1f15ccp-1, + 0x1.226ae8p-1, + 0x1.25b8a8p-1, + 0x1.28ff02p-1, + 0x1.2c3decp-1, + 0x1.2f755cp-1, + 0x1.32a54cp-1, + 0x1.35cdb4p-1, + 0x1.38ee8ap-1, + 0x1.3c07cap-1, + 0x1.3f196ep-1, + 0x1.42236ep-1, + 0x1.4525c8p-1, + 0x1.482074p-1, + 0x1.4b1372p-1, + 0x1.4dfebap-1, + 0x1.50e24cp-1, + 0x1.53be26p-1, + 0x1.569244p-1, + 0x1.595ea6p-1, + 0x1.5c2348p-1, + 0x1.5ee02ep-1, + 0x1.619556p-1, + 0x1.6442c0p-1, + 0x1.66e86ep-1, + 0x1.69865ep-1, + 0x1.6c1c98p-1, + 0x1.6eab18p-1, + 0x1.7131e6p-1, + 0x1.73b102p-1, + 0x1.762870p-1, + 0x1.789836p-1, + 0x1.7b0058p-1, + 0x1.7d60d8p-1, + 0x1.7fb9c0p-1, + 0x1.820b12p-1, + 0x1.8454d6p-1, + 0x1.869712p-1, + 0x1.88d1cep-1, + 0x1.8b050ep-1, + 0x1.8d30dep-1, + 0x1.8f5544p-1, + 0x1.91724ap-1, + 0x1.9387f6p-1, + 0x1.959652p-1, + 0x1.979d68p-1, + 0x1.999d42p-1, + 0x1.9b95e8p-1, + 0x1.9d8768p-1, + 0x1.9f71cap-1, + 0x1.a1551ap-1, + 0x1.a33162p-1, + 0x1.a506b0p-1, + 0x1.a6d50cp-1, + 0x1.a89c86p-1, + 0x1.aa5d26p-1, + 0x1.ac16fcp-1, + 0x1.adca14p-1, + 0x1.af767ap-1, + 0x1.b11c3cp-1, + 0x1.b2bb68p-1, + 0x1.b4540ap-1, + 0x1.b5e630p-1, + 0x1.b771e8p-1, + 0x1.b8f742p-1, + 0x1.ba764ap-1, + 0x1.bbef10p-1, + 0x1.bd61a2p-1, + 0x1.bece0ep-1, + 0x1.c03464p-1, + 0x1.c194b2p-1, + 0x1.c2ef08p-1, + 0x1.c44376p-1, + 0x1.c5920ap-1, + 0x1.c6dad2p-1, + 0x1.c81de2p-1, + 0x1.c95b46p-1, + 0x1.ca930ep-1, + 0x1.cbc54cp-1, + 0x1.ccf20cp-1, + 0x1.ce1962p-1, + 0x1.cf3b5cp-1, + 0x1.d0580cp-1, + 0x1.d16f7ep-1, + 0x1.d281c4p-1, + 0x1.d38ef0p-1, + 0x1.d49710p-1, + 0x1.d59a34p-1, + 0x1.d6986cp-1, + 0x1.d791cap-1, + 0x1.d8865ep-1, + 0x1.d97636p-1, + 0x1.da6162p-1, + 0x1.db47f4p-1, + 0x1.dc29fcp-1, + 0x1.dd0788p-1, + 0x1.dde0aap-1, + 0x1.deb570p-1, + 0x1.df85eap-1, + 0x1.e0522ap-1, + 0x1.e11a3ep-1, + 0x1.e1de36p-1, + 0x1.e29e22p-1, + 0x1.e35a12p-1, + 0x1.e41214p-1, + 0x1.e4c638p-1, + 0x1.e5768cp-1, + 0x1.e62322p-1, + 0x1.e6cc08p-1, + 0x1.e7714ap-1, + 0x1.e812fcp-1, + 0x1.e8b12ap-1, + 0x1.e94be4p-1, + 0x1.e9e336p-1, + 0x1.ea7730p-1, + 0x1.eb07e2p-1, + 0x1.eb9558p-1, + 0x1.ec1fa2p-1, + 0x1.eca6ccp-1, + 0x1.ed2ae6p-1, + 0x1.edabfcp-1, + 0x1.ee2a1ep-1, + 0x1.eea556p-1, + 0x1.ef1db4p-1, + 0x1.ef9344p-1, + 0x1.f00614p-1, + 0x1.f07630p-1, + 0x1.f0e3a6p-1, + 0x1.f14e82p-1, + 0x1.f1b6d0p-1, + 0x1.f21ca0p-1, + 0x1.f27ff8p-1, + 0x1.f2e0eap-1, + 0x1.f33f7ep-1, + 0x1.f39bc2p-1, + 
0x1.f3f5c2p-1, + 0x1.f44d88p-1, + 0x1.f4a31ep-1, + 0x1.f4f694p-1, + 0x1.f547f2p-1, + 0x1.f59742p-1, + 0x1.f5e490p-1, + 0x1.f62fe8p-1, + 0x1.f67952p-1, + 0x1.f6c0dcp-1, + 0x1.f7068cp-1, + 0x1.f74a6ep-1, + 0x1.f78c8cp-1, + 0x1.f7cceep-1, + 0x1.f80ba2p-1, + 0x1.f848acp-1, + 0x1.f8841ap-1, + 0x1.f8bdf2p-1, + 0x1.f8f63ep-1, + 0x1.f92d08p-1, + 0x1.f96256p-1, + 0x1.f99634p-1, + 0x1.f9c8a8p-1, + 0x1.f9f9bap-1, + 0x1.fa2974p-1, + 0x1.fa57dep-1, + 0x1.fa84fep-1, + 0x1.fab0dep-1, + 0x1.fadb84p-1, + 0x1.fb04f6p-1, + 0x1.fb2d40p-1, + 0x1.fb5464p-1, + 0x1.fb7a6cp-1, + 0x1.fb9f60p-1, + 0x1.fbc344p-1, + 0x1.fbe61ep-1, + 0x1.fc07fap-1, + 0x1.fc28d8p-1, + 0x1.fc48c2p-1, + 0x1.fc67bcp-1, + 0x1.fc85d0p-1, + 0x1.fca2fep-1, + 0x1.fcbf52p-1, + 0x1.fcdaccp-1, + 0x1.fcf576p-1, + 0x1.fd0f54p-1, + 0x1.fd286ap-1, + 0x1.fd40bep-1, + 0x1.fd5856p-1, + 0x1.fd6f34p-1, + 0x1.fd8562p-1, + 0x1.fd9ae2p-1, + 0x1.fdafb8p-1, + 0x1.fdc3e8p-1, + 0x1.fdd77ap-1, + 0x1.fdea6ep-1, + 0x1.fdfcccp-1, + 0x1.fe0e96p-1, + 0x1.fe1fd0p-1, + 0x1.fe3080p-1, + 0x1.fe40a6p-1, + 0x1.fe504cp-1, + 0x1.fe5f70p-1, + 0x1.fe6e18p-1, + 0x1.fe7c46p-1, + 0x1.fe8a00p-1, + 0x1.fe9748p-1, + 0x1.fea422p-1, + 0x1.feb090p-1, + 0x1.febc96p-1, + 0x1.fec836p-1, + 0x1.fed374p-1, + 0x1.fede52p-1, + 0x1.fee8d4p-1, + 0x1.fef2fep-1, + 0x1.fefccep-1, + 0x1.ff064cp-1, + 0x1.ff0f76p-1, + 0x1.ff1852p-1, + 0x1.ff20e0p-1, + 0x1.ff2924p-1, + 0x1.ff3120p-1, + 0x1.ff38d6p-1, + 0x1.ff4048p-1, + 0x1.ff4778p-1, + 0x1.ff4e68p-1, + 0x1.ff551ap-1, + 0x1.ff5b90p-1, + 0x1.ff61ccp-1, + 0x1.ff67d0p-1, + 0x1.ff6d9ep-1, + 0x1.ff7338p-1, + 0x1.ff789ep-1, + 0x1.ff7dd4p-1, + 0x1.ff82dap-1, + 0x1.ff87b2p-1, + 0x1.ff8c5cp-1, + 0x1.ff90dcp-1, + 0x1.ff9532p-1, + 0x1.ff9960p-1, + 0x1.ff9d68p-1, + 0x1.ffa14ap-1, + 0x1.ffa506p-1, + 0x1.ffa8a0p-1, + 0x1.ffac18p-1, + 0x1.ffaf6ep-1, + 0x1.ffb2a6p-1, + 0x1.ffb5bep-1, + 0x1.ffb8b8p-1, + 0x1.ffbb98p-1, + 0x1.ffbe5ap-1, + 0x1.ffc102p-1, + 0x1.ffc390p-1, + 0x1.ffc606p-1, + 0x1.ffc862p-1, + 0x1.ffcaa8p-1, + 0x1.ffccd8p-1, + 0x1.ffcef4p-1, + 0x1.ffd0fap-1, + 0x1.ffd2eap-1, + 0x1.ffd4cap-1, + 0x1.ffd696p-1, + 0x1.ffd84ep-1, + 0x1.ffd9f8p-1, + 0x1.ffdb90p-1, + 0x1.ffdd18p-1, + 0x1.ffde90p-1, + 0x1.ffdffap-1, + 0x1.ffe154p-1, + 0x1.ffe2a2p-1, + 0x1.ffe3e2p-1, + 0x1.ffe514p-1, + 0x1.ffe63cp-1, + 0x1.ffe756p-1, + 0x1.ffe866p-1, + 0x1.ffe96ap-1, + 0x1.ffea64p-1, + 0x1.ffeb54p-1, + 0x1.ffec3ap-1, + 0x1.ffed16p-1, + 0x1.ffedeap-1, + 0x1.ffeeb4p-1, + 0x1.ffef76p-1, + 0x1.fff032p-1, + 0x1.fff0e4p-1, + 0x1.fff18ep-1, + 0x1.fff232p-1, + 0x1.fff2d0p-1, + 0x1.fff366p-1, + 0x1.fff3f6p-1, + 0x1.fff480p-1, + 0x1.fff504p-1, + 0x1.fff582p-1, + 0x1.fff5fcp-1, + 0x1.fff670p-1, + 0x1.fff6dep-1, + 0x1.fff74ap-1, + 0x1.fff7aep-1, + 0x1.fff810p-1, + 0x1.fff86cp-1, + 0x1.fff8c6p-1, + 0x1.fff91cp-1, + 0x1.fff96cp-1, + 0x1.fff9bap-1, + 0x1.fffa04p-1, + 0x1.fffa4cp-1, + 0x1.fffa90p-1, + 0x1.fffad0p-1, + 0x1.fffb0ep-1, + 0x1.fffb4ap-1, + 0x1.fffb82p-1, + 0x1.fffbb8p-1, + 0x1.fffbecp-1, + 0x1.fffc1ep-1, + 0x1.fffc4ep-1, + 0x1.fffc7ap-1, + 0x1.fffca6p-1, + 0x1.fffccep-1, + 0x1.fffcf6p-1, + 0x1.fffd1ap-1, + 0x1.fffd3ep-1, + 0x1.fffd60p-1, + 0x1.fffd80p-1, + 0x1.fffda0p-1, + 0x1.fffdbep-1, + 0x1.fffddap-1, + 0x1.fffdf4p-1, + 0x1.fffe0ep-1, + 0x1.fffe26p-1, + 0x1.fffe3ep-1, + 0x1.fffe54p-1, + 0x1.fffe68p-1, + 0x1.fffe7ep-1, + 0x1.fffe90p-1, + 0x1.fffea2p-1, + 0x1.fffeb4p-1, + 0x1.fffec4p-1, + 0x1.fffed4p-1, + 0x1.fffee4p-1, + 0x1.fffef2p-1, + 0x1.ffff00p-1, + 0x1.ffff0cp-1, + 0x1.ffff18p-1, + 0x1.ffff24p-1, + 0x1.ffff30p-1, + 0x1.ffff3ap-1, + 0x1.ffff44p-1, + 0x1.ffff4ep-1, + 0x1.ffff56p-1, + 
0x1.ffff60p-1, + 0x1.ffff68p-1, + 0x1.ffff70p-1, + 0x1.ffff78p-1, + 0x1.ffff7ep-1, + 0x1.ffff84p-1, + 0x1.ffff8cp-1, + 0x1.ffff92p-1, + 0x1.ffff98p-1, + 0x1.ffff9cp-1, + 0x1.ffffa2p-1, + 0x1.ffffa6p-1, + 0x1.ffffacp-1, + 0x1.ffffb0p-1, + 0x1.ffffb4p-1, + 0x1.ffffb8p-1, + 0x1.ffffbcp-1, + 0x1.ffffc0p-1, + 0x1.ffffc4p-1, + 0x1.ffffc6p-1, + 0x1.ffffcap-1, + 0x1.ffffccp-1, + 0x1.ffffd0p-1, + 0x1.ffffd2p-1, + 0x1.ffffd4p-1, + 0x1.ffffd6p-1, + 0x1.ffffd8p-1, + 0x1.ffffdcp-1, + 0x1.ffffdep-1, + 0x1.ffffdep-1, + 0x1.ffffe0p-1, + 0x1.ffffe2p-1, + 0x1.ffffe4p-1, + 0x1.ffffe6p-1, + 0x1.ffffe8p-1, + 0x1.ffffe8p-1, + 0x1.ffffeap-1, + 0x1.ffffeap-1, + 0x1.ffffecp-1, + 0x1.ffffeep-1, + 0x1.ffffeep-1, + 0x1.fffff0p-1, + 0x1.fffff0p-1, + 0x1.fffff2p-1, + 0x1.fffff2p-1, + 0x1.fffff2p-1, + 0x1.fffff4p-1, + 0x1.fffff4p-1, + 0x1.fffff4p-1, + 0x1.fffff6p-1, + 0x1.fffff6p-1, + 0x1.fffff6p-1, + 0x1.fffff8p-1, + 0x1.fffff8p-1, + 0x1.fffff8p-1, + 0x1.fffff8p-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffap-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffcp-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.fffffep-1, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + 0x1.000000p+0, + }, + .scale = { 0x1.20dd76p+0, + 0x1.20d8f2p+0, + 0x1.20cb68p+0, + 0x1.20b4d8p+0, + 0x1.209546p+0, + 0x1.206cb4p+0, + 0x1.203b26p+0, + 0x1.2000a0p+0, + 0x1.1fbd28p+0, + 0x1.1f70c4p+0, + 0x1.1f1b7ap+0, + 0x1.1ebd56p+0, + 0x1.1e565cp+0, + 0x1.1de698p+0, + 0x1.1d6e14p+0, + 0x1.1cecdcp+0, + 0x1.1c62fap+0, + 0x1.1bd07cp+0, + 0x1.1b3572p+0, + 0x1.1a91e6p+0, + 0x1.19e5eap+0, + 0x1.19318cp+0, + 0x1.1874dep+0, + 0x1.17aff0p+0, + 0x1.16e2d8p+0, + 0x1.160da4p+0, + 0x1.153068p+0, + 0x1.144b3cp+0, + 0x1.135e30p+0, + 0x1.12695ep+0, + 0x1.116cd8p+0, + 0x1.1068bap+0, + 0x1.0f5d16p+0, + 0x1.0e4a08p+0, + 0x1.0d2fa6p+0, + 0x1.0c0e0ap+0, + 0x1.0ae550p+0, + 0x1.09b590p+0, + 0x1.087ee4p+0, + 0x1.07416cp+0, + 0x1.05fd3ep+0, + 0x1.04b27cp+0, + 0x1.036140p+0, + 0x1.0209a6p+0, + 0x1.00abd0p+0, + 0x1.fe8fb0p-1, + 0x1.fbbbbep-1, + 0x1.f8dc0ap-1, + 0x1.f5f0cep-1, + 0x1.f2fa4cp-1, + 0x1.eff8c4p-1, + 0x1.ecec78p-1, + 0x1.e9d5a8p-1, + 0x1.e6b498p-1, + 0x1.e38988p-1, + 0x1.e054bep-1, + 0x1.dd167cp-1, + 0x1.d9cf06p-1, + 0x1.d67ea2p-1, + 0x1.d32592p-1, + 0x1.cfc41ep-1, + 0x1.cc5a8ap-1, + 0x1.c8e91cp-1, + 0x1.c5701ap-1, + 0x1.c1efcap-1, + 0x1.be6872p-1, + 0x1.bada5ap-1, + 0x1.b745c6p-1, + 0x1.b3aafcp-1, + 0x1.b00a46p-1, + 0x1.ac63e8p-1, + 0x1.a8b828p-1, + 0x1.a5074ep-1, + 0x1.a1519ep-1, + 0x1.9d9762p-1, + 0x1.99d8dap-1, + 0x1.961650p-1, + 0x1.925008p-1, + 0x1.8e8646p-1, + 0x1.8ab950p-1, + 0x1.86e96ap-1, + 0x1.8316d6p-1, + 0x1.7f41dcp-1, + 0x1.7b6abcp-1, + 0x1.7791b8p-1, + 0x1.73b714p-1, + 0x1.6fdb12p-1, + 0x1.6bfdf0p-1, + 0x1.681ff2p-1, + 0x1.644156p-1, + 0x1.60625cp-1, + 0x1.5c8342p-1, + 0x1.58a446p-1, + 0x1.54c5a6p-1, + 0x1.50e79ep-1, + 0x1.4d0a68p-1, + 0x1.492e42p-1, + 0x1.455366p-1, + 0x1.417a0cp-1, + 0x1.3da26ep-1, + 0x1.39ccc2p-1, + 0x1.35f940p-1, + 0x1.32281ep-1, + 0x1.2e5992p-1, + 0x1.2a8dcep-1, + 0x1.26c508p-1, + 0x1.22ff72p-1, + 0x1.1f3d3cp-1, + 0x1.1b7e98p-1, + 
0x1.17c3b6p-1, + 0x1.140cc4p-1, + 0x1.1059eep-1, + 0x1.0cab62p-1, + 0x1.09014cp-1, + 0x1.055bd6p-1, + 0x1.01bb2cp-1, + 0x1.fc3ee6p-2, + 0x1.f511aap-2, + 0x1.edeeeep-2, + 0x1.e6d700p-2, + 0x1.dfca26p-2, + 0x1.d8c8aap-2, + 0x1.d1d2d0p-2, + 0x1.cae8dap-2, + 0x1.c40b08p-2, + 0x1.bd3998p-2, + 0x1.b674c8p-2, + 0x1.afbcd4p-2, + 0x1.a911f0p-2, + 0x1.a27456p-2, + 0x1.9be438p-2, + 0x1.9561c8p-2, + 0x1.8eed36p-2, + 0x1.8886b2p-2, + 0x1.822e66p-2, + 0x1.7be47ap-2, + 0x1.75a91ap-2, + 0x1.6f7c6ap-2, + 0x1.695e8cp-2, + 0x1.634fa6p-2, + 0x1.5d4fd4p-2, + 0x1.575f34p-2, + 0x1.517de6p-2, + 0x1.4bac00p-2, + 0x1.45e99cp-2, + 0x1.4036d0p-2, + 0x1.3a93b2p-2, + 0x1.350052p-2, + 0x1.2f7cc4p-2, + 0x1.2a0916p-2, + 0x1.24a554p-2, + 0x1.1f518ap-2, + 0x1.1a0dc6p-2, + 0x1.14da0ap-2, + 0x1.0fb662p-2, + 0x1.0aa2d0p-2, + 0x1.059f5ap-2, + 0x1.00ac00p-2, + 0x1.f79184p-3, + 0x1.edeb40p-3, + 0x1.e46530p-3, + 0x1.daff4ap-3, + 0x1.d1b982p-3, + 0x1.c893cep-3, + 0x1.bf8e1cp-3, + 0x1.b6a856p-3, + 0x1.ade26cp-3, + 0x1.a53c42p-3, + 0x1.9cb5bep-3, + 0x1.944ec2p-3, + 0x1.8c0732p-3, + 0x1.83deeap-3, + 0x1.7bd5c8p-3, + 0x1.73eba4p-3, + 0x1.6c2056p-3, + 0x1.6473b6p-3, + 0x1.5ce596p-3, + 0x1.5575c8p-3, + 0x1.4e241ep-3, + 0x1.46f066p-3, + 0x1.3fda6cp-3, + 0x1.38e1fap-3, + 0x1.3206dcp-3, + 0x1.2b48dap-3, + 0x1.24a7b8p-3, + 0x1.1e233ep-3, + 0x1.17bb2cp-3, + 0x1.116f48p-3, + 0x1.0b3f52p-3, + 0x1.052b0cp-3, + 0x1.fe6460p-4, + 0x1.f2a902p-4, + 0x1.e72372p-4, + 0x1.dbd32ap-4, + 0x1.d0b7a0p-4, + 0x1.c5d04ap-4, + 0x1.bb1c98p-4, + 0x1.b09bfcp-4, + 0x1.a64de6p-4, + 0x1.9c31c6p-4, + 0x1.92470ap-4, + 0x1.888d1ep-4, + 0x1.7f036cp-4, + 0x1.75a960p-4, + 0x1.6c7e64p-4, + 0x1.6381e2p-4, + 0x1.5ab342p-4, + 0x1.5211ecp-4, + 0x1.499d48p-4, + 0x1.4154bcp-4, + 0x1.3937b2p-4, + 0x1.31458ep-4, + 0x1.297dbap-4, + 0x1.21df9ap-4, + 0x1.1a6a96p-4, + 0x1.131e14p-4, + 0x1.0bf97ep-4, + 0x1.04fc3ap-4, + 0x1.fc4b5ep-5, + 0x1.eeea8cp-5, + 0x1.e1d4d0p-5, + 0x1.d508fap-5, + 0x1.c885e0p-5, + 0x1.bc4a54p-5, + 0x1.b05530p-5, + 0x1.a4a54ap-5, + 0x1.99397ap-5, + 0x1.8e109cp-5, + 0x1.83298ep-5, + 0x1.78832cp-5, + 0x1.6e1c58p-5, + 0x1.63f3f6p-5, + 0x1.5a08e8p-5, + 0x1.505a18p-5, + 0x1.46e66cp-5, + 0x1.3dacd2p-5, + 0x1.34ac36p-5, + 0x1.2be38cp-5, + 0x1.2351c2p-5, + 0x1.1af5d2p-5, + 0x1.12ceb4p-5, + 0x1.0adb60p-5, + 0x1.031ad6p-5, + 0x1.f7182ap-6, + 0x1.e85c44p-6, + 0x1.da0006p-6, + 0x1.cc0180p-6, + 0x1.be5ecep-6, + 0x1.b1160ap-6, + 0x1.a4255ap-6, + 0x1.978ae8p-6, + 0x1.8b44e6p-6, + 0x1.7f5188p-6, + 0x1.73af0cp-6, + 0x1.685bb6p-6, + 0x1.5d55ccp-6, + 0x1.529b9ep-6, + 0x1.482b84p-6, + 0x1.3e03d8p-6, + 0x1.3422fep-6, + 0x1.2a875cp-6, + 0x1.212f62p-6, + 0x1.181984p-6, + 0x1.0f443ep-6, + 0x1.06ae14p-6, + 0x1.fcab14p-7, + 0x1.ec7262p-7, + 0x1.dcaf36p-7, + 0x1.cd5ecap-7, + 0x1.be7e5ap-7, + 0x1.b00b38p-7, + 0x1.a202bep-7, + 0x1.94624ep-7, + 0x1.87275ep-7, + 0x1.7a4f6ap-7, + 0x1.6dd7fep-7, + 0x1.61beaep-7, + 0x1.56011cp-7, + 0x1.4a9cf6p-7, + 0x1.3f8ff6p-7, + 0x1.34d7dcp-7, + 0x1.2a727ap-7, + 0x1.205dacp-7, + 0x1.169756p-7, + 0x1.0d1d6ap-7, + 0x1.03ede2p-7, + 0x1.f60d8ap-8, + 0x1.e4cc4ap-8, + 0x1.d4143ap-8, + 0x1.c3e1a6p-8, + 0x1.b430ecp-8, + 0x1.a4fe84p-8, + 0x1.9646f4p-8, + 0x1.8806d8p-8, + 0x1.7a3adep-8, + 0x1.6cdfccp-8, + 0x1.5ff276p-8, + 0x1.536fc2p-8, + 0x1.4754acp-8, + 0x1.3b9e40p-8, + 0x1.30499cp-8, + 0x1.2553eep-8, + 0x1.1aba78p-8, + 0x1.107a8cp-8, + 0x1.06918cp-8, + 0x1.f9f9d0p-9, + 0x1.e77448p-9, + 0x1.d58da6p-9, + 0x1.c4412cp-9, + 0x1.b38a3ap-9, + 0x1.a36454p-9, + 0x1.93cb12p-9, + 0x1.84ba30p-9, + 0x1.762d84p-9, + 0x1.682100p-9, + 0x1.5a90b0p-9, + 0x1.4d78bcp-9, + 0x1.40d564p-9, + 
0x1.34a306p-9, + 0x1.28de12p-9, + 0x1.1d8318p-9, + 0x1.128ebap-9, + 0x1.07fdb4p-9, + 0x1.fb99b8p-10, + 0x1.e7f232p-10, + 0x1.d4fed8p-10, + 0x1.c2b9d0p-10, + 0x1.b11d70p-10, + 0x1.a02436p-10, + 0x1.8fc8c8p-10, + 0x1.8005f0p-10, + 0x1.70d6a4p-10, + 0x1.6235fcp-10, + 0x1.541f34p-10, + 0x1.468daep-10, + 0x1.397ceep-10, + 0x1.2ce898p-10, + 0x1.20cc76p-10, + 0x1.15246ep-10, + 0x1.09ec86p-10, + 0x1.fe41cep-11, + 0x1.e97ba4p-11, + 0x1.d57f52p-11, + 0x1.c245d4p-11, + 0x1.afc85ep-11, + 0x1.9e0058p-11, + 0x1.8ce75ep-11, + 0x1.7c7744p-11, + 0x1.6caa0ep-11, + 0x1.5d79ecp-11, + 0x1.4ee142p-11, + 0x1.40daa4p-11, + 0x1.3360ccp-11, + 0x1.266ea8p-11, + 0x1.19ff46p-11, + 0x1.0e0de8p-11, + 0x1.0295f0p-11, + 0x1.ef25d4p-12, + 0x1.da0110p-12, + 0x1.c5b542p-12, + 0x1.b23a5ap-12, + 0x1.9f8894p-12, + 0x1.8d986ap-12, + 0x1.7c629ap-12, + 0x1.6be022p-12, + 0x1.5c0a38p-12, + 0x1.4cda54p-12, + 0x1.3e4a24p-12, + 0x1.305390p-12, + 0x1.22f0b4p-12, + 0x1.161be4p-12, + 0x1.09cfa4p-12, + 0x1.fc0d56p-13, + 0x1.e577bcp-13, + 0x1.cfd4a6p-13, + 0x1.bb1a96p-13, + 0x1.a74068p-13, + 0x1.943d4ap-13, + 0x1.8208bcp-13, + 0x1.709a8ep-13, + 0x1.5feadap-13, + 0x1.4ff208p-13, + 0x1.40a8c2p-13, + 0x1.3207fcp-13, + 0x1.2408eap-13, + 0x1.16a502p-13, + 0x1.09d5f8p-13, + 0x1.fb2b7ap-14, + 0x1.e3bcf4p-14, + 0x1.cd5528p-14, + 0x1.b7e946p-14, + 0x1.a36eecp-14, + 0x1.8fdc1cp-14, + 0x1.7d2738p-14, + 0x1.6b4702p-14, + 0x1.5a329cp-14, + 0x1.49e178p-14, + 0x1.3a4b60p-14, + 0x1.2b6876p-14, + 0x1.1d3120p-14, + 0x1.0f9e1cp-14, + 0x1.02a868p-14, + 0x1.ec929ap-15, + 0x1.d4f4b4p-15, + 0x1.be6abcp-15, + 0x1.a8e8ccp-15, + 0x1.94637ep-15, + 0x1.80cfdcp-15, + 0x1.6e2368p-15, + 0x1.5c540cp-15, + 0x1.4b581cp-15, + 0x1.3b2652p-15, + 0x1.2bb5ccp-15, + 0x1.1cfe02p-15, + 0x1.0ef6c4p-15, + 0x1.019842p-15, + 0x1.e9b5e8p-16, + 0x1.d16f58p-16, + 0x1.ba4f04p-16, + 0x1.a447b8p-16, + 0x1.8f4cccp-16, + 0x1.7b5224p-16, + 0x1.684c22p-16, + 0x1.562facp-16, + 0x1.44f21ep-16, + 0x1.34894ap-16, + 0x1.24eb72p-16, + 0x1.160f44p-16, + 0x1.07ebd2p-16, + 0x1.f4f12ep-17, + 0x1.db5ad0p-17, + 0x1.c304f0p-17, + 0x1.abe09ep-17, + 0x1.95df98p-17, + 0x1.80f43ap-17, + 0x1.6d1178p-17, + 0x1.5a2ae0p-17, + 0x1.483488p-17, + 0x1.372310p-17, + 0x1.26eb9ep-17, + 0x1.1783cep-17, + 0x1.08e1bap-17, + 0x1.f5f7d8p-18, + 0x1.db92b6p-18, + 0x1.c282cep-18, + 0x1.aab7acp-18, + 0x1.94219cp-18, + 0x1.7eb1a2p-18, + 0x1.6a5972p-18, + 0x1.570b6ap-18, + 0x1.44ba86p-18, + 0x1.335a62p-18, + 0x1.22df2ap-18, + 0x1.133d96p-18, + 0x1.046aeap-18, + 0x1.ecb9d0p-19, + 0x1.d21398p-19, + 0x1.b8d094p-19, + 0x1.a0df10p-19, + 0x1.8a2e26p-19, + 0x1.74adc8p-19, + 0x1.604ea8p-19, + 0x1.4d0232p-19, + 0x1.3aba86p-19, + 0x1.296a70p-19, + 0x1.190562p-19, + 0x1.097f62p-19, + 0x1.f59a20p-20, + 0x1.d9c736p-20, + 0x1.bf716cp-20, + 0x1.a6852cp-20, + 0x1.8eefd8p-20, + 0x1.789fb8p-20, + 0x1.6383f8p-20, + 0x1.4f8c96p-20, + 0x1.3caa62p-20, + 0x1.2acee2p-20, + 0x1.19ec60p-20, + 0x1.09f5d0p-20, + 0x1.f5bd96p-21, + 0x1.d9371ep-21, + 0x1.be41dep-21, + 0x1.a4c89ep-21, + 0x1.8cb738p-21, + 0x1.75fa8ep-21, + 0x1.608078p-21, + 0x1.4c37c0p-21, + 0x1.39100ep-21, + 0x1.26f9e0p-21, + 0x1.15e682p-21, + 0x1.05c804p-21, + 0x1.ed2254p-22, + 0x1.d06ad6p-22, + 0x1.b551c8p-22, + 0x1.9bc0a0p-22, + 0x1.83a200p-22, + 0x1.6ce1aap-22, + 0x1.576c72p-22, + 0x1.43302cp-22, + 0x1.301ba2p-22, + 0x1.1e1e86p-22, + 0x1.0d2966p-22, + 0x1.fa5b50p-23, + 0x1.dc3ae4p-23, + 0x1.bfd756p-23, + 0x1.a517dap-23, + 0x1.8be4f8p-23, + 0x1.74287ep-23, + 0x1.5dcd66p-23, + 0x1.48bfd4p-23, + 0x1.34ecf8p-23, + 0x1.224310p-23, + 0x1.10b148p-23, + }, +}; diff --git 
a/contrib/arm-optimized-routines/pl/math/sv_exp10_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_exp10_1u5.c
new file mode 100644
index 000000000000..519693afcab0
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_exp10_1u5.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+#define SpecialBound 307.0 /* floor (log10 (2^1023)). */
+
+static const struct data
+{
+  double poly[5];
+  double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
+} data = {
+  /* Coefficients generated using Remez algorithm.
+     rel error: 0x1.9fcb9b3p-60
+     abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
+     max ulp err 0.52 +0.5. */
+  .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
+	    0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+  /* 1.5*2^46+1023. This value is further explained below. */
+  .shift = 0x1.800000000ffc0p+46,
+  .log10_2 = 0x1.a934f0979a371p1,     /* 1/log10(2). */
+  .log2_10_hi = 0x1.34413509f79ffp-2, /* log10(2). */
+  .log2_10_lo = -0x1.9dc1da994fd21p-59,
+  .scale_thres = 1280.0,
+  .special_bound = SpecialBound,
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update both special and non-special cases, if any special case is
+   detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
+	      const struct data *d)
+{
+  /* s=2^n may overflow, break it up into s=s1*s2,
+     such that exp = s + s*y can be computed as s1*(s2+s2*y)
+     and s1*s1 overflows only if n>0. */
+
+  /* If n<=0 then set b to SpecialOffset, 0 otherwise. */
+  svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+  svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
+
+  /* Set s1 to generate overflow depending on sign of exponent n. */
+  svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+  /* Offset s to avoid overflow in final result if n is below threshold. */
+  svfloat64_t s2 = svreinterpret_f64 (
+      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+  /* |n| > 1280 => 2^(n) overflows. */
+  svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
+
+  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+  svfloat64_t r0 = svmul_x (pg, r2, s1);
+
+  return svsel (p_cmp, r1, r0);
+}
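[Editorial note, not part of the upstream patch: the special-case split above is easier to follow in scalar form. This is a minimal sketch under the same constants; the helper names (asu64, asf64, special_case_model) are hypothetical and exist only for illustration.]

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint64_t asu64 (double x) { uint64_t u; memcpy (&u, &x, 8); return u; }
static double asf64 (uint64_t u) { double x; memcpy (&x, &u, 8); return x; }

/* Evaluate s * (1 + y), where s = 2^n may overflow, via s = s1 * s2 so
   that each factor stays representable: s1 = 2^769 when n > 0, else
   2^-767, and s2 = s / s1.  */
static double
special_case_model (double s, double y, double n)
{
  uint64_t b = n <= 0 ? 0x6000000000000000 : 0;  /* SpecialOffset.  */
  double s1 = asf64 (0x7000000000000000 - b);    /* SpecialBias1 - b.  */
  /* Rescale s by the inverse of s1, working on the exponent bits.  */
  double s2 = asf64 (asu64 (s) - 0x3010000000000000 + b);
  if (fabs (n) > 1280.0)
    return s1 * s1; /* Deliberately overflow (n > 0) or underflow (n <= 0).  */
  return s1 * (s2 + s2 * y); /* s1 * s2 == s, with no overflowing factor.  */
}

[End of editorial note.]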
+
+/* Fast vector implementation of exp10 using FEXPA instruction.
+   Maximum measured error is 1.02 ulp.
+   SV_NAME_D1 (exp10)(-0x1.2862fec805e58p+2) got 0x1.885a89551d782p-16
+					     want 0x1.885a89551d781p-16. */
+svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  svbool_t no_big_scale = svacle (pg, x, d->special_bound);
+  svbool_t special = svnot_z (pg, no_big_scale);
+
+  /* n = round(x/(log10(2)/N)). */
+  svfloat64_t shift = sv_f64 (d->shift);
+  svfloat64_t z = svmla_x (pg, shift, x, d->log10_2);
+  svfloat64_t n = svsub_x (pg, z, shift);
+
+  /* r = x - n*log10(2)/N. */
+  svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi);
+  svfloat64_t r = x;
+  r = svmls_lane (r, n, log2_10, 0);
+  r = svmls_lane (r, n, log2_10, 1);
+
+  /* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so
+     for consistent NaN handling we have to manually propagate them. This
+     comes at significant performance cost. */
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t scale = svexpa (u);
+
+  /* Approximate exp10(r) using polynomial. */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
+			   sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+
+  /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
+     multiplication may overflow, so use special case routine. */
+  if (unlikely (svptest_any (pg, special)))
+    {
+      /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+	 special case function so needs to be copied.
+	 e = sign bit of u << 46. */
+      svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+      /* Copy sign to scale. */
+      scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+      return special_case (pg, scale, y, n, d);
+    }
+
+  /* No special case. */
+  return svmla_x (pg, scale, scale, y);
+}
+
+PL_SIG (SV, D, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp10), 0.52)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, 307, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 307, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp10f_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_exp10f_1u5.c
new file mode 100644
index 000000000000..9ecde8f1aa52
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_exp10f_1u5.c
@@ -0,0 +1,87 @@
+/*
+ * Single-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+   FEXPA. */
+#define SpecialBound 37.9
+
+static const struct data
+{
+  float poly[5];
+  float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+} data = {
+  /* Coefficients generated using Remez algorithm with minimisation of relative
+     error.
+     rel error: 0x1.89dafa3p-24
+     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+     maxerr: 0.52 +0.5 ulp. */
+  .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
+	    0x1.12b41ap-1f },
+  /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
+  .shift = 0x1.903f8p17f,
+  .log10_2 = 0x1.a934fp+1,
+  .log2_10_hi = 0x1.344136p-2,
+  .log2_10_lo = -0x1.ec10cp-27,
+  .special_bound = SpecialBound,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (exp10f, x, y, special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+   as AdvSIMD exp10f.
+   Worst case error is 1.02 ULPs.
+   _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+				  want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
+     with poly(r) in [1/sqrt(2), sqrt(2)] and
+     x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
+
+  /* Load some constants in quad-word chunks to minimise memory access (last
+     lane is wasted). */
+  svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+
+  /* n = round(x/(log10(2)/N)). */
+  svfloat32_t shift = sv_f32 (d->shift);
+  svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
+  svfloat32_t n = svsub_x (pg, z, shift);
+
+  /* r = x - n*log10(2)/N. */
+  svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
+  r = svmls_lane (r, n, log10_2_and_inv, 2);
+
+  svbool_t special = svacgt (pg, x, d->special_bound);
+  svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+  /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t poly
+      = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
+		 sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmla_x (pg, scale, scale, poly), special);
+
+  return svmla_x (pg, scale, scale, poly);
+}
+
+PL_SIG (SV, F, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_F1 (exp10), 0.52)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, SpecialBound, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), SpecialBound, inf, 50000)
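[Editorial note, not part of the upstream patch: the quad-word constant trick used above relies on the constants being adjacent in memory, so one 128-bit load feeds several _lane intrinsics. A minimal sketch (compile with an SVE-enabled target); the struct and function names are hypothetical, mirroring the layout of the data struct above:]

#include <arm_sve.h>

/* The three constants (plus one padding lane) must be contiguous: one
   128-bit load fills a lane group that _lane intrinsics index 0..3.  */
struct quad_consts { float mul, hi, lo, pad; };

static svfloat32_t
reduce_model (svfloat32_t x, svfloat32_t shift, svbool_t pg,
	      const struct quad_consts *c)
{
  /* Replicate the quad-word across the whole vector.  */
  svfloat32_t lanes = svld1rq (svptrue_b32 (), &c->mul);
  svfloat32_t z = svmla_lane (shift, x, lanes, 0); /* shift + x * mul.  */
  svfloat32_t n = svsub_x (pg, z, shift);
  svfloat32_t r = svmls_lane (x, n, lanes, 1);	   /* x - n * hi.  */
  return svmls_lane (r, n, lanes, 2);		   /* ... - n * lo.  */
}

[End of editorial note.]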
diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp2_2u.c b/contrib/arm-optimized-routines/pl/math/sv_exp2_2u.c
new file mode 100644
index 000000000000..dcbca8adddd1
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_exp2_2u.c
@@ -0,0 +1,107 @@
+/*
+ * Double-precision SVE 2^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+
+#define BigBound 1022
+#define UOFlowBound 1280
+
+static const struct data
+{
+  double poly[4];
+  double shift, big_bound, uoflow_bound;
+} data = {
+  /* Coefficients are computed using Remez algorithm with
+     minimisation of the absolute error. */
+  .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
+	    0x1.3b2abf5571ad8p-7 },
+  .shift = 0x1.8p52 / N,
+  .uoflow_bound = UOFlowBound,
+  .big_bound = BigBound,
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update both special and non-special cases, if any special case is
+   detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
+	      const struct data *d)
+{
+  /* s=2^n may overflow, break it up into s=s1*s2,
+     such that exp = s + s*y can be computed as s1*(s2+s2*y)
+     and s1*s1 overflows only if n>0. */
+
+  /* If n<=0 then set b to SpecialOffset, 0 otherwise. */
+  svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+  svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
+
+  /* Set s1 to generate overflow depending on sign of exponent n. */
+  svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+  /* Offset s to avoid overflow in final result if n is below threshold. */
+  svfloat64_t s2 = svreinterpret_f64 (
+      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+  /* |n| > 1280 => 2^(n) overflows. */
+  svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
+
+  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+  svfloat64_t r0 = svmul_x (pg, r2, s1);
+
+  return svsel (p_cmp, r1, r0);
+}
+
+/* Fast vector implementation of exp2.
+   Maximum measured error is 1.65 ulp.
+   _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
+				       want 0x1.f8db0d4df721dp-1. */
+svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  svbool_t no_big_scale = svacle (pg, x, d->big_bound);
+  svbool_t special = svnot_z (pg, no_big_scale);
+
+  /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */
+  svfloat64_t shift = sv_f64 (d->shift);
+  svfloat64_t kd = svadd_x (pg, x, shift);
+  svuint64_t ki = svreinterpret_u64 (kd);
+  /* kd = k/N. */
+  kd = svsub_x (pg, kd, shift);
+  svfloat64_t r = svsub_x (pg, x, kd);
+
+  /* scale ~= 2^(k/N). */
+  svuint64_t idx = svand_x (pg, ki, N - 1);
+  svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);
+  /* This is only a valid scale when -1023*N < k < 1024*N. */
+  svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
+  svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
+
+  /* Approximate exp2(r) using polynomial. */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
+  svfloat64_t y = svmul_x (pg, r, p);
+
+  /* Assemble exp2(x) = exp2(r) * scale. */
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (pg, scale, y, kd, d);
+  return svmla_x (pg, scale, scale, y);
+}
+
+PL_SIG (SV, D, 1, exp2, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp2), 1.15)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000)
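[Editorial note, not part of the upstream patch: a scalar model of the table-based scale computation above. The caller is expected to pass the repo's __v_exp_data table and V_EXP_TABLE_BITS; the function name is hypothetical and the sketch assumes round-to-nearest mode:]

#include <stdint.h>
#include <string.h>

/* Model of scale ~= 2^(round(x*n)/n): the fractional index selects a
   precomputed mantissa, and the integer part is added into the exponent
   field. Valid while -1023*n < k < 1024*n.  */
static double
exp2_scale_model (double x, const uint64_t *tab, int table_bits)
{
  int n = 1 << table_bits;
  double shift = 0x1.8p52 / n;	/* Rounds x to a multiple of 1/n.  */
  double kd = x + shift;
  uint64_t ki;
  memcpy (&ki, &kd, 8);		/* k = round(x*n) sits in the low bits.  */
  uint64_t sbits = tab[ki & (n - 1)];	 /* Mantissa of 2^(i/n).  */
  uint64_t top = ki << (52 - table_bits); /* Integer part -> exponent.  */
  uint64_t ubits = sbits + top;
  double scale;
  memcpy (&scale, &ubits, 8);
  return scale;
}

[End of editorial note.]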
diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp2f_1u6.c b/contrib/arm-optimized-routines/pl/math/sv_exp2f_1u6.c
new file mode 100644
index 000000000000..9698ff6f0682
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_exp2f_1u6.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision SVE 2^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float poly[5];
+  float shift, thres;
+} data = {
+  /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+     compatibility with polynomial helpers. */
+  .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
+	    0x1.59977ap-10f },
+  /* 1.5*2^17 + 127. */
+  .shift = 0x1.903f8p17f,
+  /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+     correctly by FEXPA. */
+  .thres = 0x1.5d5e2ap+6f,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (exp2f, x, y, special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+   as AdvSIMD exp2f.
+   Worst case error is 1.04 ULPs.
+   SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
+				  want 0x1.ba7ebp+0. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = n + r, with r in [-1/2, 1/2]. */
+  svfloat32_t shift = sv_f32 (d->shift);
+  svfloat32_t z = svadd_x (pg, x, shift);
+  svfloat32_t n = svsub_x (pg, z, shift);
+  svfloat32_t r = svsub_x (pg, x, n);
+
+  svbool_t special = svacgt (pg, x, d->thres);
+  svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+  /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
+     Evaluate polynomial using a hybrid scheme - offset ESTRIN by 1 for
+     coefficients 1 to 4, and apply most significant coefficient directly. */
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
+  svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmla_x (pg, scale, scale, poly), special);
+
+  return svmla_x (pg, scale, scale, poly);
+}
+
+PL_SIG (SV, F, 1, exp2, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, -0x1p-23, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p-23, -1, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, -0x1p23, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p23, -inf, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, ScaleThres, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -1, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, ScaleThres, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -inf, 50000)
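[Editorial note, not part of the upstream patch: a scalar rendering of the hybrid scheme above (pairwise/ESTRIN for coefficients 1..4, most significant coefficient applied directly), so poly(r) ~= exp2(r) - 1 = c0*r + c1*r^2 + c2*r^3 + c3*r^4 + c4*r^5. The function name is hypothetical:]

static float
exp2f_poly_model (float r, const float c[5])
{
  float r2 = r * r;
  /* Pairwise evaluation of coefficients 1..4 (what
     sv_pairwise_poly_3_f32_x computes for d->poly + 1).  */
  float p14 = (c[1] + c[2] * r) + r2 * (c[3] + c[4] * r);
  float p0 = c[0] * r; /* Most significant coefficient applied directly.  */
  return p0 + r2 * p14;
}

[End of editorial note.]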
diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_exp_1u5.c
new file mode 100644
index 000000000000..c187def9e625
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_exp_1u5.c
@@ -0,0 +1,137 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  double poly[4];
+  double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+} data = {
+  .poly = { /* ulp error: 0.53. */
+	    0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
+	    0x1.1111266d28935p-7 },
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  /* 1/ln2. */
+  .inv_ln2 = 0x1.71547652b82fep+0,
+  /* 1.5*2^46+1023. This value is further explained below. */
+  .shift = 0x1.800000000ffc0p+46,
+  .thres = 704.0,
+};
+
+#define C(i) sv_f64 (d->poly[i])
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update both special and non-special cases, if any special case is
+   detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
+{
+  /* s=2^n may overflow, break it up into s=s1*s2,
+     such that exp = s + s*y can be computed as s1*(s2+s2*y)
+     and s1*s1 overflows only if n>0. */
+
+  /* If n<=0 then set b to SpecialOffset, 0 otherwise. */
+  svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+  svuint64_t b
+      = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+  /* Set s1 to generate overflow depending on sign of exponent n. */
+  svfloat64_t s1 = svreinterpret_f64 (
+      svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
+  /* Offset s to avoid overflow in final result if n is below threshold. */
+  svfloat64_t s2 = svreinterpret_f64 (
+      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
+	       b)); /* as_u64 (s) - 0x3010...0 + b. */
+
+  /* |n| > 1280 => 2^(n) overflows. */
+  svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+  svfloat64_t r0 = svmul_x (pg, r2, s1);
+
+  return svsel (p_cmp, r1, r0);
+}
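[Editorial note, not part of the upstream patch: the rounding-shift trick explained in the routine below, in scalar form. Adding and subtracting 1.5*2^46 rounds a value to a multiple of 2^-6 = 1/64, because the ULP of a double near 2^46 is 2^(46-52); the extra 0xffc0 payload in the real shift folds in the exponent bias expected by FEXPA. The function name is hypothetical and the sketch assumes round-to-nearest mode:]

static double
round_to_64th (double t)
{
  double shift = 0x1.8p46; /* 1.5*2^46, without the FEXPA bias bits.  */
  return (t + shift) - shift;
}

[End of editorial note.]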
+
+/* SVE exp algorithm. Maximum measured error is 1.01 ulp:
+   SV_NAME_D1 (exp)(0x1.4619d7b04da41p+6) got 0x1.885d9acc41da7p+117
+					 want 0x1.885d9acc41da6p+117. */
+svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svbool_t special = svacgt (pg, x, d->thres);
+
+  /* Use a modified version of the shift used for flooring, such that x/ln2 is
+     rounded to a multiple of 2^-6=1/64, shift = 1.5 * 2^52 * 2^-6 = 1.5 *
+     2^46.
+
+     n is not an integer but can be written as n = m + i/64, with i and m
+     integer, 0 <= i < 64 and m <= n.
+
+     Bits 5:0 of z will be zero every time x/ln2 reaches a new integer value
+     (n=m, i=0), and is incremented every time z (or n) is incremented by 1/64.
+     FEXPA expects i in bits 5:0 of the input so it can be used as index into
+     FEXPA hardwired table T[i] = 2^(i/64) for i = 0:63, that will in turn
+     populate the mantissa of the output. Therefore, we use u=asuint(z) as
+     input to FEXPA.
+
+     We add 1023 to the modified shift value in order to set bits 16:6 of u to
+     1, such that once these bits are moved to the exponent of the output of
+     FEXPA, we get the exponent of 2^n right, i.e. we get 2^m. */
+  svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t n = svsub_x (pg, z, d->shift);
+
+  /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t r = svmls_lane (x, n, ln2, 0);
+  r = svmls_lane (r, n, ln2, 1);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
+  svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+  svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+  svfloat64_t y = svmla_x (pg, r, p04, r2);
+
+  /* s = 2^n, computed using FEXPA. FEXPA does not propagate NaNs, so for
+     consistent NaN handling we have to manually propagate them. This comes at
+     significant performance cost. */
+  svfloat64_t s = svexpa (u);
+
+  /* Assemble result as exp(x) = 2^n * exp(r). If |x| > Thres the
+     multiplication may overflow, so use special case routine. */
+
+  if (unlikely (svptest_any (pg, special)))
+    {
+      /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+	 special case function so needs to be copied.
+	 e = sign bit of u << 46. */
+      svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+      /* Copy sign to s. */
+      s = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (s)));
+      return special_case (pg, s, y, n);
+    }
+
+  /* No special case. */
+  return svmla_x (pg, s, s, y);
+}
+
+PL_SIG (SV, D, 1, exp, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp), 1.46)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h b/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h
deleted file mode 100644
index 9b739da9d82a..000000000000
--- a/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Double-precision SVE e^(x+tail) function.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef SV_EXP_TAIL_H -#define SV_EXP_TAIL_H - -#include "sv_math.h" -#if SV_SUPPORTED - -#include "v_exp_tail.h" - -#define C1 sv_f64 (C1_scal) -#define C2 sv_f64 (C2_scal) -#define C3 sv_f64 (C3_scal) -#define MinusLn2hi (-Ln2hi_scal) -#define MinusLn2lo (-Ln2lo_scal) - -#define N (1 << V_EXP_TAIL_TABLE_BITS) -#define Tab __v_exp_tail_data -#define IndexMask (N - 1) -#define Shift sv_f64 (0x1.8p+52) -#define Thres 704.0 - -static inline sv_f64_t -sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n) -{ - sv_f64_t absn = svabs_f64_x (pg, n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000), - sv_u64 (0)); - sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000)); - sv_f64_t s2 = sv_as_f64_u64 ( - svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000), - b)); - - svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N); - sv_f64_t r1 = svmul_f64_x (pg, s1, s1); - sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1); - return svsel_f64 (cmp, r1, r0); -} - -static inline sv_f64_t -sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail) -{ - /* Calculate exp(x + xtail). */ - sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift); - sv_f64_t n = svsub_f64_x (pg, z, Shift); - - sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x); - r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r); - - sv_u64_t u = sv_as_u64_f64 (z); - sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - sv_u64_t i = svand_n_u64_x (pg, u, IndexMask); - - sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2); - y = sv_fma_f64_x (pg, y, r, C1); - y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0)); - y = sv_fma_f64_x (pg, y, r, xtail); - - /* s = 2^(n/N). */ - u = sv_lookup_u64_x (pg, Tab, i); - sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e)); - - svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres); - if (unlikely (svptest_any (pg, cmp))) - { - return sv_exp_tail_special_case (pg, s, y, n); - } - return sv_fma_f64_x (pg, y, s, s); -} - -#endif -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c index 87fbe45df5fd..93d705ce420a 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c @@ -1,156 +1,86 @@ /* * Single-precision vector e^x function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define C(i) __sv_expf_poly[i] - -#define InvLn2 (0x1.715476p+0f) -#define Ln2hi (0x1.62e4p-1f) -#define Ln2lo (0x1.7f7d1cp-20f) - -#if SV_EXPF_USE_FEXPA - -#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127. */ -#define Thres \ - (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \ - and not handled correctly by FEXPA. */ - -static NOINLINE sv_f32_t -special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +static const struct data +{ + float poly[5]; + float inv_ln2, ln2_hi, ln2_lo, shift, thres; +} data = { + /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. 
*/ + .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, + 0x1.0e4020p-7f }, + .inv_ln2 = 0x1.715476p+0f, + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, + /* 1.5*2^17 + 127. */ + .shift = 0x1.903f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. */ + .thres = 0x1.5d5e2ap+6f, +}; + +#define C(i) sv_f32 (d->poly[i]) +#define ExponentBias 0x3f800000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) { - /* The special-case handler from the Neon routine does not handle subnormals - in a way that is compatible with FEXPA. For the FEXPA variant we just fall - back to scalar expf. */ return sv_call_f32 (expf, x, y, special); } -#else - -#define Shift (0x1.8p23f) /* 1.5 * 2^23. */ -#define Thres (126.0f) - -/* Special-case handler adapted from Neon variant. Uses s, y and n to produce - the final result (normal cases included). It performs an update of all lanes! - Therefore: - - all previous computation need to be done on all lanes indicated by input - pg - - we cannot simply apply the special case to the special-case-activated - lanes. Besides it is likely that this would not increase performance (no - scatter/gather). */ -static inline sv_f32_t -specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, - svbool_t p_cmp1, sv_f32_t scale) +/* Optimised single-precision SVE exp function. + Worst-case error is 1.04 ulp: + SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 + want 0x1.ba74bap+4. */ +svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) { - /* s=2^(n/N) may overflow, break it up into s=s1*s2, - such that exp = s + s*y can be computed as s1*(s2+s2*y) - and s1*s1 overflows only if n>0. */ - - /* If n<=0 then set b to 0x820...0, 0 otherwise. */ - svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */ - sv_u32_t b - = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */ - - /* Set s1 to generate overflow depending on sign of exponent n. */ - sv_f32_t s1 - = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */ - /* Offset s to avoid overflow in final result if n is below threshold. */ - sv_f32_t s2 = sv_as_f32_u32 ( - svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ - - /* |n| > 192 => 2^(n/N) overflows. */ - svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f); + const struct data *d = ptr_barrier (&data); - sv_f32_t r2 = svmul_f32_x (pg, s1, s1); - sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); - r1 = svmul_f32_x (pg, r1, s1); - sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale); - - /* Apply condition 1 then 2. - Returns r2 if cond2 is true, otherwise - if cond1 is true then return r1, otherwise return r0. */ - sv_f32_t r = svsel_f32 (p_cmp1, r1, r0); - - return svsel_f32 (p_cmp2, r2, r); -} - -#endif - -/* Optimised single-precision SVE exp function. By default this is an SVE port - of the Neon algorithm from math/. Alternatively, enable a modification of - that algorithm that looks up scale using SVE FEXPA instruction with - SV_EXPF_USE_FEXPA. - - Worst-case error of the default algorithm is 1.95 ulp: - __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 - want 0x1.6a023p-8. - - Worst-case error when using FEXPA is 1.04 ulp: - __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4 - want 0x1.ba74bap+4. */ -sv_f32_t -__sv_expf_x (sv_f32_t x, const svbool_t pg) -{ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
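   (Editorial worked example: for x = 1, n = round(1/ln2) = 1 and
   r = 1 - ln2 ~= 0.3069, so the result is assembled as
   2^1 * exp(0.3069) ~= 2.71828.)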
*/ + /* Load some constants in quad-word chunks to minimise memory access (last + lane is wasted). */ + svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2); + /* n = round(x/(ln2/N)). */ - sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift)); - sv_f32_t n = svsub_n_f32_x (pg, z, Shift); + svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0); + svfloat32_t n = svsub_x (pg, z, d->shift); /* r = x - n*ln2/N. */ - sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); - r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); + svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1); + r = svmls_lane (r, n, invln2_and_ln2, 2); -/* scale = 2^(n/N). */ -#if SV_EXPF_USE_FEXPA - /* NaNs also need special handling with FEXPA. */ - svbool_t is_special_case - = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x)); - sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z)); -#else - sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); - svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres); - sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); -#endif + /* scale = 2^(n/N). */ + svbool_t is_special_case = svacgt (pg, x, d->thres); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - sv_f32_t r2 = svmul_f32_x (pg, r, r); - sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1))); - sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3))); - q = sv_fma_f32_x (pg, p, r2, q); - p = svmul_n_f32_x (pg, r, C (4)); - sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p); + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); + svfloat32_t p34 = svmla_x (pg, C (3), C (4), r); + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_x (pg, r, C (0)); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); if (unlikely (svptest_any (pg, is_special_case))) -#if SV_EXPF_USE_FEXPA - return special_case (x, sv_fma_f32_x (pg, poly, scale, scale), - is_special_case); -#else - return specialcase (pg, poly, n, e, is_special_case, scale); -#endif + return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case); - return sv_fma_f32_x (pg, poly, scale, scale); + return svmla_x (pg, scale, scale, poly); } -PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) - PL_SIG (SV, F, 1, exp, -9.9, 9.9) -PL_TEST_ULP (__sv_expf, 1.46) -PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000) -PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000) -PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000) -PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000) -PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000) -PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000) -PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000) -PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000) -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_F1 (exp), 0.55) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p23, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_data.c b/contrib/arm-optimized-routines/pl/math/sv_expf_data.c deleted file mode 100644 index 6875adf857b6..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_expf_data.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Coefficients for single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Coefficients copied from the polynomial in math/v_expf.c. */ -const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, - 0x1.fffdb6p-2f, 0x1.ffffecp-1f}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_inline.h b/contrib/arm-optimized-routines/pl/math/sv_expf_inline.h new file mode 100644 index 000000000000..0ef4e0fda946 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_inline.h @@ -0,0 +1,66 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) and do + * not need special-case handling + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_EXPF_INLINE_H +#define PL_MATH_SV_EXPF_INLINE_H + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +struct sv_expf_data +{ + float poly[5]; + float inv_ln2, ln2_hi, ln2_lo, shift; +}; + +/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ +#define SV_EXPF_DATA \ + { \ + .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ + 0x1.0e4020p-7f }, \ + \ + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ + .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ + } + +#define C(i) sv_f32 (d->poly[i]) + +static inline svfloat32_t +expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + + /* Load some constants in quad-word chunks to minimise memory access. */ + svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ + svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); + r = svmls_lane (r, n, c4_invln2_and_ln2, 3); + + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); + svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); + svfloat32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +#endif // PL_MATH_SV_EXPF_INLINE_H \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_expm1_2u5.c new file mode 100644 index 000000000000..82a31f6d9c0e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expm1_2u5.c @@ -0,0 +1,95 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define SpecialBound 0x1.62b7d369a5aa9p+9 +#define ExponentBias 0x3ff0000000000000 + +static const struct data +{ + double poly[11]; + double shift, inv_ln2, special_bound; + /* To be loaded in one quad-word. */ + double ln2_hi, ln2_lo; +} data = { + /* Generated using fpminimax. 
*/
+  .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+	    0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
+	    0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+	    0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+  .special_bound = SpecialBound,
+  .inv_ln2 = 0x1.71547652b82fep0,
+  .ln2_hi = 0x1.62e42fefa39efp-1,
+  .ln2_lo = 0x1.abc9e3b39803fp-56,
+  .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+  return sv_call_f64 (expm1, x, y, pg);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+   The maximum observed error is 2.18 ULP:
+   _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+				       want 0x1.a8b9ea8d66e2p-2. */
+svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Large, NaN/Inf. */
+  svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer. */
+  svfloat64_t shift = sv_f64 (d->shift);
+  svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
+  svint64_t i = svcvt_s64_x (pg, n);
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t f = svmls_lane (x, n, ln2, 0);
+  f = svmls_lane (f, n, ln2, 1);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+  svfloat64_t f2 = svmul_x (pg, f, f);
+  svfloat64_t f4 = svmul_x (pg, f2, f2);
+  svfloat64_t f8 = svmul_x (pg, f4, f4);
+  svfloat64_t p
+      = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i. */
+  svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
+  svfloat64_t t = svreinterpret_f64 (u);
+
+  /* expm1(x) ~= p * t + (t - 1). */
+  svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, y, special);
+
+  return y;
+}
+
+PL_SIG (SV, D, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (expm1), 1.68)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/sv_expm1f_1u6.c
new file mode 100644
index 000000000000..0ec7c00f5300
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_expm1f_1u6.c
@@ -0,0 +1,93 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Largest value of x for which expm1(x) should round to -1. */
+#define SpecialBound 0x1.5ebc4p+6f
+
+static const struct data
+{
+  /* These 4 are grouped together so they can be loaded as one quadword, then
+     used with _lane forms of svmla/svmls. */
+  float c2, c4, ln2_hi, ln2_lo;
+  float c0, c1, c3, inv_ln2, special_bound, shift;
+} data = {
+  /* Generated using fpminimax.
*/ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, + .c4 = 0x1.6b55a2p-10, + + .special_bound = SpecialBound, .shift = 0x1.8p23f, + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, +}; + +#define C(i) sv_f32 (d->c##i) + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t pg) +{ + return sv_call_f32 (expm1f, x, x, pg); +} + +/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP: + _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2 + want 0x1.e859d4p-2. */ +svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Large, NaN/Inf. */ + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg); + + /* This vector is reliant on layout of data - it contains constants + that can be used with _lane forms of svmla/svmls. Values are: + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */ + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); + j = svsub_x (pg, j, d->shift); + svint32_t i = svcvt_s32_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); + p = svmla_x (pg, C (0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + svfloat32_t t = svreinterpret_f32 ( + svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000)); + return svmla_x (pg, svsub_x (pg, t, 1), p, t); +} + +PL_SIG (SV, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (SV_NAME_F1 (expm1), 1.02) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_expm1f_inline.h b/contrib/arm-optimized-routines/pl/math/sv_expm1f_inline.h new file mode 100644 index 000000000000..a6e2050ff4a6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expm1f_inline.h @@ -0,0 +1,73 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) - 1 and do + * not need special-case handling + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_EXPM1F_INLINE_H +#define PL_MATH_SV_EXPM1F_INLINE_H + +#include "sv_math.h" + +struct sv_expm1f_data +{ + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float32_t c2, c4, ln2_hi, ln2_lo; + float32_t c0, c1, c3, inv_ln2, shift; +}; + +/* Coefficients generated using fpminimax. 
*/ +#define SV_EXPM1F_DATA \ + { \ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \ + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + \ + .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ + .ln2_lo = 0x1.7f7d1cp-20f, \ + } + +#define C(i) sv_f32 (d->c##i) + +static inline svfloat32_t +expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) +{ + /* This vector is reliant on layout of data - it contains constants + that can be used with _lane forms of svmla/svmls. Values are: + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */ + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); + j = svsub_x (pg, j, d->shift); + svint32_t i = svcvt_s32_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); + p = svmla_x (pg, C (0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + svfloat32_t t = svscale_x (pg, sv_f32 (1), i); + return svmla_x (pg, svsub_x (pg, t, 1), p, t); +} + +#endif // PL_MATH_SV_EXPM1F_INLINE_H \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_hypot_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_hypot_1u5.c new file mode 100644 index 000000000000..cf1590e4b9ab --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_hypot_1u5.c @@ -0,0 +1,51 @@ +/* + * Double-precision SVE hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint64_t tiny_bound, thres; +} data = { + .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102). */ + .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound. */ +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330) + got 0x1.6a22d0412cfp+352 + want 0x1.6a22d0412cf01p+352. 
*/
+svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+  svbool_t special = svcmpge (
+      pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (sqsum, x, y, pg, special);
+  return svsqrt_x (pg, sqsum);
+}
+
+PL_SIG (SV, D, 2, hypot, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D2 (hypot), 0.71)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_hypotf_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_hypotf_1u5.c
new file mode 100644
index 000000000000..f428832b3dbc
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_hypotf_1u5.c
@@ -0,0 +1,45 @@
+/*
+ * Single-precision SVE hypot(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define TinyBound 0x0c800000 /* asuint (0x1p-102). */
+#define Thres 0x73000000     /* 0x7f800000 - TinyBound. */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg,
+	      svbool_t special)
+{
+  return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of single-precision hypot.
+   Maximum error observed is 1.21 ULP:
+   _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19
+						     want 0x1.6a2344p-19. */
+svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y,
+				const svbool_t pg)
+{
+  svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+  svbool_t special = svcmpge (
+      pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (sqsum, x, y, pg, special);
+
+  return svsqrt_x (pg, sqsum);
+}
+
+PL_SIG (SV, F, 2, hypot, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F2 (hypot), 0.71)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c
index 884e2011d2f8..f55e068fd442 100644
--- a/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c
+++ b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c
@@ -1,89 +1,75 @@
 /*
  * Double-precision SVE log10(x) function.
  *
  * Copyright (c) 2022-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "sv_math.h"
-#include "math_config.h"
 #include "pl_sig.h"
 #include "pl_test.h"
+#include "poly_sve_f64.h"
 
-#if SV_SUPPORTED
-
-#define OFF 0x3fe6900900000000
+#define Min 0x0010000000000000
+#define Max 0x7ff0000000000000
+#define Thres 0x7fe0000000000000 /* Max - Min.
*/ +#define Off 0x3fe6900900000000 #define N (1 << V_LOG10_TABLE_BITS) -#define A(i) __v_log10_data.poly[i] - -static inline sv_f64_t -specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) { return sv_call_f64 (log10, x, y, special); } -/* SVE log10 algorithm. Maximum measured error is 2.46 ulps. - __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 - want 0x1.fffbdf6eaa667p-6. */ -sv_f64_t -__sv_log10_x (sv_f64_t x, const svbool_t pg) +/* SVE log10 algorithm. + Maximum measured error is 2.46 ulps. + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); - - svbool_t is_special_case - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x07ff0 - 0x0010); + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); - sv_u64_t i - = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N); - sv_f64_t k - = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); - sv_f64_t z = sv_as_f64_u64 ( - svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); /* log(x) = k*log(2) + log(c) + log(z/c). */ - - sv_u64_t idx = svmul_n_u64_x (pg, i, 2); - sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx); - sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx); + svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i); + svfloat64_t logc + = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i); /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): r = z/c - 1 (we look up precomputed 1/c) log(z/c) ~= P(r). */ - sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); /* hi = log(c) + k*log(2). */ - sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc); - sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w); + svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); + svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
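   (Editorial note: sv_pw_horner_4_f64_x below computes exactly this
   pairwise-Horner nesting, i.e. y = (A0 + r*A1) + r2*((A2 + r*A3) + r2*A4).)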
*/ - sv_f64_t r2 = svmul_f64_x (pg, r, r); - sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); - sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); - y = sv_fma_n_f64_x (pg, A (4), r2, y); - y = sv_fma_f64_x (pg, y, r2, p); - y = sv_fma_f64_x (pg, y, r2, hi); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); - if (unlikely (svptest_any (pg, is_special_case))) - { - return specialcase (x, y, is_special_case); - } - return y; + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), + special); + return svmla_x (pg, hi, r2, y); } -PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10) - PL_SIG (SV, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (__sv_log10, 1.97) -PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000) -#endif +PL_TEST_ULP (SV_NAME_D1 (log10), 1.97) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c index e7b1e9801fa9..a685b23e5de5 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c @@ -1,88 +1,93 @@ /* * Single-precision SVE log10 function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +static const struct data +{ + float poly_0246[4]; + float poly_1357[4]; + float ln2, inv_ln10; +} data = { + .poly_1357 = { + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs + 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane + variant of MLA intrinsic. */ + 0x1.2879c8p-3f, 0x1.6408f8p-4f, 0x1.f0e514p-5f, 0x1.f5f76ap-5f + }, + .poly_0246 = { -0x1.bcb79cp-3f, -0x1.bcd472p-4f, -0x1.246f8p-4f, + -0x1.0fc92cp-4f }, + .ln2 = 0x1.62e43p-1f, + .inv_ln10 = 0x1.bcb7b2p-2f, +}; -#define SpecialCaseMin 0x00800000 -#define SpecialCaseMax 0x7f800000 +#define Min 0x00800000 +#define Max 0x7f800000 +#define Thres 0x7f000000 /* Max - Min. */ #define Offset 0x3f2aaaab /* 0.666667. */ -#define Mask 0x007fffff -#define Ln2 0x1.62e43p-1f /* 0x3f317218. */ -#define InvLn10 0x1.bcb7b2p-2f - -#define P(i) __v_log10f_poly[i] +#define MantissaMask 0x007fffff -static NOINLINE sv_f32_t -special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) { return sv_call_f32 (log10f, x, y, special); } /* Optimised implementation of SVE log10f using the same algorithm and - polynomial as v_log10f. Maximum error is 3.31ulps: - __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 - want 0x1.ffe2f4p-4. */ -sv_f32_t -__sv_log10f_x (sv_f32_t x, const svbool_t pg) + polynomial as AdvSIMD log10f. 
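The poly_0246/poly_1357 split in the data block above exists to feed a common SVE idiom: svld1rq broadcasts a 128-bit block of four floats to every quadword of the vector, after which svmla_lane can multiply by any one of the four lanes without further loads or dups. A minimal sketch of the pattern (poly3_lane is a hypothetical helper, not part of the library; c must point at four floats):

/* Sketch: c0 + c1*r + c2*r^2 + c3*r^3 with the odd coefficients taken
   from lanes of a single quad-word load.  */
static inline svfloat32_t
poly3_lane (svbool_t pg, svfloat32_t r, const float *c)
{
  svfloat32_t c_0123 = svld1rq (svptrue_b32 (), c);
  svfloat32_t r2 = svmul_x (pg, r, r);
  svfloat32_t q01 = svmla_lane (sv_f32 (c[0]), r, c_0123, 1); /* c0 + c1*r */
  svfloat32_t q23 = svmla_lane (sv_f32 (c[2]), r, c_0123, 3); /* c2 + c3*r */
  return svmla_x (pg, q01, r2, q23);
}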
+ Maximum error is 3.31ulps: + SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) { - sv_u32_t ix = sv_as_u32_f32 (x); - svbool_t special_cases - = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin), - SpecialCaseMax - SpecialCaseMin); + const struct data *d = ptr_barrier (&data); + svuint32_t ix = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - ix = svsub_n_u32_x (pg, ix, Offset); - sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix), - 23)); /* signextend. */ - ix = svand_n_u32_x (pg, ix, Mask); - ix = svadd_n_u32_x (pg, ix, Offset); - sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f); + ix = svsub_x (pg, ix, Offset); + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */ + ix = svand_x (pg, ix, MantissaMask); + ix = svadd_x (pg, ix, Offset); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); /* y = log10(1+r) + n*log10(2) log10(1+r) ~ r * InvLn(10) + P(r) where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for - log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3) - - P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67))) - and Qij = Pi + r * Pj. */ - sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); - sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); - sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); - sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r4 = svmul_x (pg, r2, r2); + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); + svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_0246[2]), r, p_1357, 2); + svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_0246[3]), r, p_1357, 3); + svfloat32_t q_47 = svmla_x (pg, q_45, r2, q_67); + svfloat32_t q_03 = svmla_x (pg, q_01, r2, q_23); + svfloat32_t y = svmla_x (pg, q_03, r4, q_47); - sv_f32_t r2 = svmul_f32_x (pg, r, r); - sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56); - y = sv_fma_f32_x (pg, y, r2, q34); - y = sv_fma_f32_x (pg, y, r2, q12); + /* Using hi = Log10(2)*n + r*InvLn(10) is faster but less accurate. */ + svfloat32_t hi = svmla_x (pg, r, n, d->ln2); + hi = svmul_x (pg, hi, d->inv_ln10); - /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less - accurate. 
*/ - sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r); - y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10)); - - if (unlikely (svptest_any (pg, special_cases))) - { - return special_case (x, y, special_cases); - } - return y; + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), + special); + return svmla_x (pg, hi, r2, y); } -PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f) - PL_SIG (SV, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (__sv_log10f, 2.82) -PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000) -#endif +PL_TEST_ULP (SV_NAME_F1 (log10), 2.82) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log1p_2u5.c new file mode 100644 index 000000000000..f178ab16238a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1p_2u5.c @@ -0,0 +1,116 @@ +/* + * Double-precision SVE log(1+x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + double poly[19]; + double ln2_hi, ln2_lo; + uint64_t hfrt2_top, onemhfrt2_top, inf, mone; +} data = { + /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 + polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hfrt2_top = 0x3fe6a09e00000000, + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .onemhfrt2_top = 0x00095f6200000000, + .inf = 0x7ff0000000000000, + .mone = 0xbff0000000000000, +}; + +#define AbsMask 0x7fffffffffffffff +#define BottomMask 0xffffffff + +static svfloat64_t NOINLINE +special_case (svbool_t special, svfloat64_t x, svfloat64_t y) +{ + return sv_call_f64 (log1p, x, y, special); +} + +/* Vector approximation for log1p using polynomial on reduced interval. Maximum + observed error is 2.46 ULP: + _ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 + want 0x1.fd5565fb590f6p+2. 
*/ +svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t ax = svand_x (pg, ix, AbsMask); + svbool_t special + = svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone)); + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + /* Obtain correctly scaled k by manipulation in the exponent. + The scalar algorithm casts down to 32-bit at this point to calculate k and + u_red. We stay in double-width to obtain f and k, using the same constants + as the scalar algorithm but shifted left by 32. */ + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); + svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top); + + svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), 0x3ff); + svfloat64_t k = svcvt_f64_x (pg, ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. */ + svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. */ + svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), + f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); + svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); + + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); + svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + + if (unlikely (svptest_any (pg, special))) + return special_case (special, x, y); + + return y; +} + +PL_SIG (SV, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (SV_NAME_D1 (log1p), 1.97) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000) +PL_TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1p_inline.h b/contrib/arm-optimized-routines/pl/math/sv_log1p_inline.h new file mode 100644 index 000000000000..983f8e1b0413 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1p_inline.h @@ -0,0 +1,96 @@ +/* + * Helper for SVE double-precision routines which calculate log(1 + x) and do + * not need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. 
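The correction term in the log1p routine above is the classic compensated 1 + x: m = x + 1 rounds away the low bits of x, and c recovers them, with c/m its first-order effect on the logarithm. In scalar form:

/* Sketch: scalar form of the correction term used above.
   c = (1 + x) - m, computed as x - (m - 1), captures what rounding
   discarded in m = 1 + x, and log(1+x) - log(m) ~= c/m for small c.  */
double m = x + 1.0;
double c = x - (m - 1.0);
double cm = c / m;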
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef PL_MATH_SV_LOG1P_INLINE_H +#define PL_MATH_SV_LOG1P_INLINE_H + +#include "sv_math.h" +#include "poly_sve_f64.h" + +static const struct sv_log1p_data +{ + double poly[19], ln2[2]; + uint64_t hf_rt2_top; + uint64_t one_m_hf_rt2_top; + uint32_t bottom_mask; + int64_t one_top; +} sv_log1p_data = { + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. + */ + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6 }, + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, + .hf_rt2_top = 0x3fe6a09e00000000, + .one_m_hf_rt2_top = 0x00095f6200000000, + .bottom_mask = 0xffffffff, + .one_top = 0x3ff +}; + +static inline svfloat64_t +sv_log1p_inline (svfloat64_t x, const svbool_t pg) +{ + /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which + differs from v_log1p_2u5.c by: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using svsel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. + See sv_log1p_2u5.c for details of the algorithm. */ + const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); + svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top); + + svint64_t ki + = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top); + svfloat64_t k = svcvt_f64_x (pg, ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top); + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask)); + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. */ + svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1)); + svfloat64_t cm; + +#ifndef WANT_SV_LOG1P_K0_SHORTCUT +#error \ + "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_SV_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + svbool_t knot0 = svcmpne (pg, k, 0); + cm = svdiv_z (knot0, c, m); + if (likely (!svptest_any (pg, knot0))) + { + f = svsel (knot0, f, x); + } +#else + /* No shortcut. */ + cm = svdiv_x (pg, c, m); +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
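In scalar terms, with ln2 split into hi and lo halves (d->ln2[0] and d->ln2[1] above), the assembly pairs the dominant parts together; note that sv_log1p_2u5.c above adds k*ln2_hi to f and k*ln2_lo to cm. A sketch, also showing the per-lane intent of the k0 shortcut (assuming f is exact when k == 0):

/* Sketch: scalar equivalent of the shortcut and the final assembly.  */
if (k == 0)
  {
    cm = 0.0; /* drop the correction term */
    f = x;    /* the polynomial alone approximates log1p */
  }
double yhi = f + k * ln2_hi;  /* large halves together */
double ylo = cm + k * ln2_lo; /* small halves together */
return (ylo + yhi) + f2 * p;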
*/ + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); + + return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); +} +#endif // PL_MATH_SV_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1pf_1u3.c b/contrib/arm-optimized-routines/pl/math/sv_log1pf_1u3.c new file mode 100644 index 000000000000..ea1a3dbf723a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1pf_1u3.c @@ -0,0 +1,97 @@ +/* + * Single-precision vector log(x + 1) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_sve_f32.h" + +static const struct data +{ + float poly[8]; + float ln2, exp_bias; + uint32_t four, three_quarters; +} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as + this can be fmov-ed directly instead of including it in + the main load-and-mla polynomial schedule. */ + 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, + 0x1.abcb6p-4f, -0x1.6f0d5ep-5f}, + .ln2 = 0x1.62e43p-1f, + .exp_bias = 0x1p-23f, + .four = 0x40800000, + .three_quarters = 0x3f400000}; + +#define SignExponentMask 0xff800000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (log1pf, x, y, special); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.27 ULP very close to 0.5. + _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2 + want 0x1.9f323ep-2. */ +svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + /* x < -1, Inf/Nan. */ + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); + special = svorn_z (pg, special, svcmpge (pg, x, -1)); + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + svfloat32_t m = svadd_x (pg, x, 1); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + svint32_t k + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), + sv_s32 (SignExponentMask)); + + /* Scale x by exponent manipulation. */ + svfloat32_t m_scale = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); + m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); + + /* Evaluate polynomial on reduced interval. */ + svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale), + ms4 = svmul_x (pg, ms2, ms2); + svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly); + p = svmad_x (pg, m_scale, p, -0.5); + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias); + + /* Apply the scaling back. 
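Since k was formed by masking sign-and-exponent bits in place, as an integer it is the exponent value shifted left by 23; converting it to float and multiplying by exp_bias = 0x1p-23f therefore yields the exponent itself, ready to be scaled by log(2). A scalar sketch:

/* Sketch: recovering the exponent from the in-place bit field.  */
int32_t k = 3 << 23;             /* exponent 3 in bits 23..30 */
float e = (float) k * 0x1p-23f;  /* == 3.0f */
float y = p + e * 0x1.62e43p-1f; /* add k*log(2) */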
*/ + svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + + return y; +} + +PL_SIG (SV, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (SV_NAME_F1 (log1p), 0.77) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000) +PL_TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000) +PL_TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log1pf_inline.h b/contrib/arm-optimized-routines/pl/math/sv_log1pf_inline.h new file mode 100644 index 000000000000..d13b094f6b5d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log1pf_inline.h @@ -0,0 +1,65 @@ +/* + * Helper for SVE routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_LOG1PF_INLINE_H +#define PL_MATH_SV_LOG1PF_INLINE_H + +#include "v_math.h" +#include "math_config.h" +#include "poly_sve_f32.h" + +static const struct sv_log1pf_data +{ + float32_t poly[9]; + float32_t ln2; + float32_t scale_back; +} sv_log1pf_data = { + /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */ + .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, + -0x1.6f0d5ep-5f }, + .scale_back = 0x1.0p-23f, + .ln2 = 0x1.62e43p-1f, +}; + +static inline svfloat32_t +eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg) +{ + svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); + svfloat32_t m2 = svmul_x (pg, m, m); + svfloat32_t q = svmla_x (pg, m, m2, p_12); + svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); + p = svmul_x (pg, m2, p); + + return svmla_x (pg, q, m2, p); +} + +static inline svfloat32_t +sv_log1pf_inline (svfloat32_t x, svbool_t pg) +{ + const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); + + svfloat32_t m = svadd_x (pg, x, 1.0f); + + svint32_t ks = svsub_x (pg, svreinterpret_s32 (m), + svreinterpret_s32 (svdup_f32 (0.75f))); + ks = svand_x (pg, ks, 0xff800000); + svuint32_t k = svreinterpret_u32 (ks); + svfloat32_t s = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k)); + + svfloat32_t m_scale + = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k)); + m_scale + = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s)); + svfloat32_t p = eval_poly (m_scale, d->poly, pg); + svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back); + return svmla_x (pg, p, scale_back, d->ln2); +} + +#endif // PL_MATH_SV_LOG1PF_INLINE_H \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c index a0815bb5646f..0775a39cc85d 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c @@ -1,85 +1,73 @@ /* * Double-precision SVE log2 function. * * Copyright (c) 2022-2023, Arm Limited. 
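For reference, eval_poly in sv_log1pf_inline.h above evaluates, up to the order of operations, the odd-leading-term series m + m^2 * Q(m), where Q is the degree-8 polynomial held in d->poly. A plain scalar rendering:

/* Sketch: the value computed by eval_poly, ignoring the m2/m4 splitting
   that exists only to shorten dependency chains.  */
float q = 0.0f;
for (int i = 8; i >= 0; i--)
  q = c[i] + m * q;            /* Horner on c[0..8], c[0] = -0.5f */
float log1p_m = m + m * m * q;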
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_sve_f64.h" -#if SV_SUPPORTED - -#define InvLn2 sv_f64 (0x1.71547652b82fep0) #define N (1 << V_LOG2_TABLE_BITS) -#define OFF 0x3fe6900900000000 -#define P(i) sv_f64 (__v_log2_data.poly[i]) +#define Off 0x3fe6900900000000 +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. */ -NOINLINE static sv_f64_t -specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) { return sv_call_f64 (log2, x, y, cmp); } -/* Double-precision SVE log2 routine. Implements the same algorithm as vector - log10, with coefficients and table entries scaled in extended precision. +/* Double-precision SVE log2 routine. + Implements the same algorithm as AdvSIMD log10, with coefficients and table + entries scaled in extended precision. The maximum observed error is 2.58 ULP: - __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 - want 0x1.fffb34198d9ddp-5. */ -sv_f64_t -__sv_log2_x (sv_f64_t x, const svbool_t pg) + SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); - - svbool_t special - = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010); + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); - sv_u64_t i - = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N); - sv_f64_t k - = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); - sv_f64_t z = sv_as_f64_u64 ( - svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); - sv_u64_t idx = svmul_n_u64_x (pg, i, 2); - sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx); - sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx); + svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i); + svfloat64_t log2c + = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i); /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
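In scalar terms the new code realises this formula as:

/* Sketch: scalar form of the log2 assembly below.  */
double r = z * invc - 1.0;               /* z/c - 1, exact by construction */
double w = log2c + r * __v_log2_data.invln2;
double y = (k + w) + r2 * poly (r);      /* poly: pairwise Horner, deg 4 */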
*/ - sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); - sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); + svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); - sv_f64_t r2 = svmul_f64_x (pg, r, r); - sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2)); - sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0)); - sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23); - y = sv_fma_f64_x (pg, y, r2, p_01); - y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w)); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); + w = svadd_x (pg, k, w); if (unlikely (svptest_any (pg, special))) - { - return specialcase (x, y, special); - } - return y; + return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), + special); + return svmla_x (pg, w, r2, y); } -PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2) - PL_SIG (SV, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (__sv_log2, 2.09) -PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2) -PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000) -PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000) - -#endif +PL_TEST_ULP (SV_NAME_D1 (log2), 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_D1 (log2)) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c index fe2ab16b90b7..9e96c62bbcc6 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c @@ -1,79 +1,86 @@ /* * Single-precision vector/SVE log2 function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define P(i) __v_log2f_data.poly[i] +static const struct data +{ + float poly_02468[5]; + float poly_1357[4]; +} data = { + .poly_1357 = { + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs + 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane + variant of MLA intrinsic. */ + -0x1.715458p-1f, -0x1.7171a4p-2f, -0x1.e5143ep-3f, -0x1.c675bp-3f + }, + .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f, + 0x1.9d8ecap-3f, 0x1.9e495p-3f }, +}; -#define Ln2 (0x1.62e43p-1f) /* 0x3f317218. */ #define Min (0x00800000) #define Max (0x7f800000) -#define Mask (0x007fffff) +#define Thres (0x7f000000) /* Max - Min. */ +#define MantissaMask (0x007fffff) #define Off (0x3f2aaaab) /* 0.666667. */ -static NOINLINE sv_f32_t -specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (log2f, x, y, cmp); } /* Optimised implementation of SVE log2f, using the same algorithm - and polynomial as Neon log2f. Maximum error is 2.48 ULPs: - __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2 - want 0x1.a9be8p-2. 
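A consistency note on the quoted bounds: in every file touched here the PL_TEST_ULP threshold sits about 0.5 below the worst case quoted in the comment (1.99 against 2.48 here, 2.82 against 3.31 for log10f, 1.68 against 2.17 for log), which suggests the test harness measures error net of the half-ulp inherent in correct rounding. The quoted worst case works through as:

/* Worked check (binary32, result exponent -2, so 1 ulp = 0x1p-25):
   got  = 0x1.a9be84p-2
   want = 0x1.a9be8p-2   (the correctly rounded result)
   |got - want| = 0x1p-24 = 2 ulp; the exact value lies roughly a further
   0.48 ulp beyond want, giving the quoted 2.48 ulp.  */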
*/ -sv_f32_t -__sv_log2f_x (sv_f32_t x, const svbool_t pg) + and polynomial as AdvSIMD log2f. + Maximum error is 2.48 ULPs: + SV_NAME_F1 (log2)(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) { - sv_u32_t u = sv_as_u32_f32 (x); - svbool_t special - = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + const struct data *d = ptr_barrier (&data); + + svuint32_t u = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_n_u32_x (pg, u, Off); - sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), - 23)); /* Sign-extend. */ - u = svand_n_u32_x (pg, u, Mask); - u = svadd_n_u32_x (pg, u, Off); - sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + u = svsub_x (pg, u, Off); + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ + u = svand_x (pg, u, MantissaMask); + u = svadd_x (pg, u, Off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log2(1+r) + n. */ - sv_f32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t r2 = svmul_x (pg, r, r); /* Evaluate polynomial using pairwise Horner scheme. */ - sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); - sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); - sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); - sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); - sv_f32_t y; - y = sv_fma_n_f32_x (pg, P (8), r2, p67); - y = sv_fma_f32_x (pg, y, r2, p45); - y = sv_fma_f32_x (pg, y, r2, p23); - y = sv_fma_f32_x (pg, y, r2, p01); - y = sv_fma_f32_x (pg, y, r, n); + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_02468[0]), r, p_1357, 0); + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_02468[1]), r, p_1357, 1); + svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_02468[2]), r, p_1357, 2); + svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_02468[3]), r, p_1357, 3); + svfloat32_t y = svmla_x (pg, q_67, r2, sv_f32 (d->poly_02468[4])); + y = svmla_x (pg, q_45, r2, y); + y = svmla_x (pg, q_23, r2, y); + y = svmla_x (pg, q_01, r2, y); if (unlikely (svptest_any (pg, special))) - return specialcase (x, y, special); - return y; + return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special); + return svmla_x (pg, n, r, y); } -PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f) - PL_SIG (SV, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (__sv_log2f, 1.99) -PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f) -PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000) -PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000) - -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_F1 (log2), 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_F1 (log2)) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c index 7f06fd31ebf1..2530c9e3f62c 100644 --- 
a/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c @@ -1,85 +1,76 @@ /* * Double-precision SVE log(x) function. * * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED +#define P(i) sv_f64 (__v_log_data.poly[i]) +#define N (1 << V_LOG_TABLE_BITS) +#define Off (0x3fe6900900000000) +#define MaxTop (0x7ff) +#define MinTop (0x001) +#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ -#define A(i) __sv_log_data.poly[i] -#define Ln2 (0x1.62e42fefa39efp-1) -#define N (1 << SV_LOG_TABLE_BITS) -#define OFF (0x3fe6900900000000) - -double -optr_aor_log_f64 (double); - -static NOINLINE sv_f64_t -__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) { - return sv_call_f64 (optr_aor_log_f64, x, y, cmp); + return sv_call_f64 (log, x, y, cmp); } -/* SVE port of Neon log algorithm from math/. +/* SVE port of AdvSIMD log algorithm. Maximum measured error is 2.17 ulp: - __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 - want 0x1.ffffff1cca045p-2. */ -sv_f64_t -__sv_log_x (sv_f64_t x, const svbool_t pg) + SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) { - sv_u64_t ix = sv_as_u64_f64 (x); - sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); - svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), - sv_u64 (0x7ff0 - 0x0010)); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t top = svlsr_x (pg, ix, 52); + svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); - /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power - of 2. */ - sv_u64_t i - = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)), - N - 1); - sv_s64_t k - = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift. */ - sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)); - sv_f64_t z = sv_as_f64_u64 (iz); + svuint64_t tmp = svsub_x (pg, ix, Off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svint64_t k + = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); /* Lookup in 2 global lists (length N). */ - sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i); - sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i); + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); - sv_f64_t kd = sv_to_f64_s64_x (pg, k); + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t kd = svcvt_f64_x (pg, k); /* hi = r + log(c) + k*Ln2. 
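(Scalar reading of the line above: hi = (logc + r) + kd * ln2.) The special-case test in this routine differs from the ones above by inspecting only the top 12 bits of the representation:

/* Sketch: scalar form of the top-bits test.  top holds sign plus biased
   exponent, so one unsigned compare flags zero/subnormal (top == 0x000),
   infinity/NaN (top == 0x7ff) and every negative input (top >= 0x800).  */
int
is_special_top (uint64_t ix)
{
  uint64_t top = ix >> 52;
  return top - 0x001 >= 0x7fe; /* MinTop, ThreshTop */
}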
*/ - sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r)); + svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - sv_f64_t r2 = svmul_f64_x (pg, r, r); - sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); - sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); - y = sv_fma_n_f64_x (pg, A (4), r2, y); - y = sv_fma_f64_x (pg, y, r2, p); - y = sv_fma_f64_x (pg, y, r2, hi); + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = svmla_x (pg, P (2), r, P (3)); + svfloat64_t p = svmla_x (pg, P (0), r, P (1)); + y = svmla_x (pg, y, r2, P (4)); + y = svmla_x (pg, p, r2, y); if (unlikely (svptest_any (pg, cmp))) - return __sv_log_specialcase (x, y, cmp); - return y; + return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); + return svmla_x (pg, hi, r2, y); } -PL_ALIAS (__sv_log_x, _ZGVsMxv_log) - PL_SIG (SV, D, 1, log, 0.01, 11.1) -PL_TEST_ULP (__sv_log, 1.68) -PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_log, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_log, 100, inf, 50000) -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_D1 (log), 1.68) +PL_TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_data.c b/contrib/arm-optimized-routines/pl/math/sv_log_data.c deleted file mode 100644 index 77f9989444f5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_log_data.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Coefficients for double-precision SVE log(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -const struct sv_log_data __sv_log_data = { - /* All coefficients and table entries are copied from the Neon routine in - math/. See math/v_log_data.c for an explanation of the algorithm. 
*/ - - .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0, - 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0, - 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0, - 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0, - 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0, - 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0, - 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0, - 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0, - 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0, - 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0, - 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0, - 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0, - 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0, - 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0, - 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0, - 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0, - 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0, - 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0, - 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0, - 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0, - 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0, - 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0, - 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0, - 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0, - 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0, - 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0, - 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0, - 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0, - 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0, - 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0, - 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0, - 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0, - 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0, - 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0, - 0x1.07321489b13eap+0, 0x1.062491aee9904p+0, - 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0, - 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0, - 0x1.010037d38bcc2p+0, 1.0, - 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1, - 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1, - 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1, - 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1, - 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1, - 0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1, - 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1, - 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1, - 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1, - 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1, - 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1, - 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1, - 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1, - 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1, - 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1, - 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1, - 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1, - 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1, - 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1, - 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1, - 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1, - 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1, - 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1, - 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1, - 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1, - 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1}, - - .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2, - -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2, - -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2, - -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2, - -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2, - -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2, - -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2, - -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2, - -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2, - -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3, - -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3, - -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3, - -0x1.c6fced287c3bdp-3, 
-0x1.bd05a7b317c29p-3, - -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3, - -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3, - -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3, - -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3, - -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3, - -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3, - -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3, - -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3, - -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3, - -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4, - -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4, - -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4, - -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4, - -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4, - -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4, - -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4, - -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4, - -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5, - -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5, - -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5, - -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5, - -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6, - -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6, - -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7, - -0x1.ff6fe1feb4e53p-9, 0.0, - 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7, - 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6, - 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5, - 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5, - 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4, - 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4, - 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4, - 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4, - 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3, - 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3, - 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3, - 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3, - 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3, - 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3, - 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3, - 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3, - 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3, - 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3, - 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2, - 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2, - 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2, - 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2, - 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2, - 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2, - 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2, - 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2}, - - .poly = {-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3}, -}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c index 11f0b8aa12c5..967355247036 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c +++ b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c @@ -1,77 +1,86 @@ /* * Single-precision vector log function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define P(i) __sv_logf_poly[i] +static const struct data +{ + float poly_0135[4]; + float poly_246[3]; + float ln2; +} data = { + .poly_0135 = { + /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so + that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used + with _lane variant of MLA intrinsic. 
*/ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f + }, + .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f }, + .ln2 = 0x1.62e43p-1f +}; -#define Ln2 (0x1.62e43p-1f) /* 0x3f317218 */ #define Min (0x00800000) #define Max (0x7f800000) +#define Thresh (0x7f000000) /* Max - Min. */ #define Mask (0x007fffff) -#define Off (0x3f2aaaab) /* 0.666667 */ +#define Off (0x3f2aaaab) /* 0.666667. */ -float -optr_aor_log_f32 (float); +float optr_aor_log_f32 (float); -static NOINLINE sv_f32_t -__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (optr_aor_log_f32, x, y, cmp); } -/* Optimised implementation of SVE logf, using the same algorithm and polynomial - as the Neon routine in math/. Maximum error is 3.34 ULPs: - __sv_logf(0x1.557298p+0) got 0x1.26edecp-2 - want 0x1.26ede6p-2. */ -sv_f32_t -__sv_logf_x (sv_f32_t x, const svbool_t pg) +/* Optimised implementation of SVE logf, using the same algorithm and + polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs: + SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2 + want 0x1.26ede6p-2. */ +svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) { - sv_u32_t u = sv_as_u32_f32 (x); - svbool_t cmp - = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + const struct data *d = ptr_barrier (&data); + + svuint32_t u = svreinterpret_u32 (x); + svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_n_u32_x (pg, u, Off); - sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), - 23)); /* Sign-extend. */ - u = svand_n_u32_x (pg, u, Mask); - u = svadd_n_u32_x (pg, u, Off); - sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + u = svsub_x (pg, u, Off); + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ + u = svand_x (pg, u, Mask); + u = svadd_x (pg, u, Off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log(1+r) + n*ln2. */ - sv_f32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t r2 = svmul_x (pg, r, r); /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
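The vector code below evaluates exactly this grouping, but with P1, P3 and P5 taken from lanes of one quad-word load and with three accumulators (p, q, y) kept independent until the end so the MLAs can issue in parallel. A scalar rendering:

/* Sketch: the three accumulator chains of the logf evaluation.  */
float p = P2 + r * P1;
float q = P4 + r * P3;
float y = P6 + r * P5;
p = p + r2 * P0;
q = q + r2 * p;
y = y + r2 * q;
float result = (n * Ln2 + r) + r2 * y; /* Ln2 = 0x1.62e43p-1f */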
*/ - sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2))); - sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4))); - sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6))); - p = sv_fma_n_f32_x (pg, P (0), r2, p); - q = sv_fma_f32_x (pg, p, r2, q); - y = sv_fma_f32_x (pg, q, r2, y); - p = sv_fma_n_f32_x (pg, Ln2, n, r); - y = sv_fma_f32_x (pg, y, r2, p); + svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); + svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); + svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2); + svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3); + p = svmla_lane (p, r2, p_0135, 0); + + q = svmla_x (pg, q, r2, p); + y = svmla_x (pg, y, r2, q); + p = svmla_x (pg, r, n, d->ln2); if (unlikely (svptest_any (pg, cmp))) - return __sv_logf_specialcase (x, y, cmp); - return y; + return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp); + return svmla_x (pg, p, r2, y); } -PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf) - PL_SIG (SV, F, 1, log, 0.01, 11.1) -PL_TEST_ULP (__sv_logf, 2.85) -PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000) -PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000) -#endif // SV_SUPPORTED +PL_TEST_ULP (SV_NAME_F1 (log), 2.85) +PL_TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_data.c b/contrib/arm-optimized-routines/pl/math/sv_logf_data.c deleted file mode 100644 index 51dd7a7eeb37..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_logf_data.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Coefficients for single-precision SVE log function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -const float __sv_logf_poly[] = { - /* Copied from coeffs for the Neon routine in math/. */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_math.h b/contrib/arm-optimized-routines/pl/math/sv_math.h index 5ef0ad3bd5e0..f67fe91803ba 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_math.h +++ b/contrib/arm-optimized-routines/pl/math/sv_math.h @@ -1,245 +1,133 @@ /* * Wrapper functions for SVE ACLE. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef SV_MATH_H #define SV_MATH_H #ifndef WANT_VMATH /* Enable the build of vector math code. 
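Beyond the logf changes above, the sv_math.h rewrite that follows drops the local sv_f64_t/sv_u64_t style typedefs and the fully-suffixed intrinsic spellings in favour of the standard ACLE type names and overloaded intrinsic forms. Both spellings below are valid ACLE; the overloaded ones simply leave type resolution to the arguments:

/* Sketch: fully-suffixed versus overloaded ACLE intrinsic forms.  */
svfloat64_t a = svdup_n_f64 (1.0);         /* old style */
svfloat64_t b = svdup_f64 (1.0);           /* overloaded */
svuint64_t u = svreinterpret_u64_f64 (a);  /* old style */
svuint64_t v = svreinterpret_u64 (b);      /* overloaded */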
*/ -#define WANT_VMATH 1 +# define WANT_VMATH 1 #endif -#if WANT_VMATH - -#if WANT_SVE_MATH -#define SV_SUPPORTED 1 - -#include -#include - -#include "math_config.h" -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; +#if WANT_VMATH -typedef svfloat64_t sv_f64_t; -typedef svuint64_t sv_u64_t; -typedef svint64_t sv_s64_t; +# include +# include -typedef svfloat32_t sv_f32_t; -typedef svuint32_t sv_u32_t; -typedef svint32_t sv_s32_t; +# include "math_config.h" /* Double precision. */ -static inline sv_s64_t -sv_s64 (s64_t x) -{ - return svdup_n_s64 (x); -} - -static inline sv_u64_t -sv_u64 (u64_t x) -{ - return svdup_n_u64 (x); -} - -static inline sv_f64_t -sv_f64 (f64_t x) -{ - return svdup_n_f64 (x); -} - -static inline sv_f64_t -sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z) +static inline svint64_t +sv_s64 (int64_t x) { - return svmla_f64_x (pg, z, x, y); + return svdup_s64 (x); } -/* res = z + x * y with x scalar. */ -static inline sv_f64_t -sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z) +static inline svuint64_t +sv_u64 (uint64_t x) { - return svmla_n_f64_x (pg, z, y, x); + return svdup_u64 (x); } -static inline sv_s64_t -sv_as_s64_u64 (sv_u64_t x) +static inline svfloat64_t +sv_f64 (double x) { - return svreinterpret_s64_u64 (x); + return svdup_f64 (x); } -static inline sv_u64_t -sv_as_u64_f64 (sv_f64_t x) -{ - return svreinterpret_u64_f64 (x); -} - -static inline sv_f64_t -sv_as_f64_u64 (sv_u64_t x) -{ - return svreinterpret_f64_u64 (x); -} - -static inline sv_f64_t -sv_to_f64_s64_x (svbool_t pg, sv_s64_t s) -{ - return svcvt_f64_x (pg, s); -} - -static inline sv_f64_t -sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) +static inline svfloat64_t +sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp) { svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { - f64_t elem = svclastb_n_f64 (p, 0, x); + double elem = svclastb (p, 0, x); elem = (*f) (elem); - sv_f64_t y2 = svdup_n_f64 (elem); - y = svsel_f64 (p, y2, y); + svfloat64_t y2 = sv_f64 (elem); + y = svsel (p, y2, y); p = svpnext_b64 (cmp, p); } return y; } -static inline sv_f64_t -sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, - svbool_t cmp) +static inline svfloat64_t +sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2, + svfloat64_t y, svbool_t cmp) { svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { - f64_t elem1 = svclastb_n_f64 (p, 0, x1); - f64_t elem2 = svclastb_n_f64 (p, 0, x2); - f64_t ret = (*f) (elem1, elem2); - sv_f64_t y2 = svdup_n_f64 (ret); - y = svsel_f64 (p, y2, y); + double elem1 = svclastb (p, 0, x1); + double elem2 = svclastb (p, 0, x2); + double ret = (*f) (elem1, elem2); + svfloat64_t y2 = sv_f64 (ret); + y = svsel (p, y2, y); p = svpnext_b64 (cmp, p); } return y; } -/* Load array of uint64_t into svuint64_t. */ -static inline sv_u64_t -sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx) -{ - return svld1_gather_u64index_u64 (pg, tab, idx); -} - -/* Load array of double into svfloat64_t. 
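The sv_call_f64/sv_call2_f64 helpers above serialise only the flagged lanes: svpfirst isolates the first active lane of cmp, svclastb extracts that element, and svpnext_b64 steps to the next, so each special lane costs one scalar libm call while fast-path lanes are left untouched. A scalar model of the loop:

/* Sketch: what the predicate walk computes, lane by lane.  */
void
call_model (double (*f) (double), const double *x, double *y,
	    const int *special, int vl)
{
  for (int i = 0; i < vl; i++)
    if (special[i])
      y[i] = f (x[i]);
}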
*/ -static inline sv_f64_t -sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx) +static inline svuint64_t +sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y) { - return svld1_gather_u64index_f64 (pg, tab, idx); -} - -static inline sv_u64_t -sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y) -{ - sv_u64_t q = svdiv_n_u64_x (pg, x, y); - return svmls_n_u64_x (pg, x, q, y); + svuint64_t q = svdiv_x (pg, x, y); + return svmls_x (pg, x, q, y); } /* Single precision. */ -static inline sv_s32_t -sv_s32 (s32_t x) -{ - return svdup_n_s32 (x); -} - -static inline sv_u32_t -sv_u32 (u32_t x) -{ - return svdup_n_u32 (x); -} - -static inline sv_f32_t -sv_f32 (f32_t x) -{ - return svdup_n_f32 (x); -} - -static inline sv_f32_t -sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z) -{ - return svmla_f32_x (pg, z, x, y); -} - -/* res = z + x * y with x scalar. */ -static inline sv_f32_t -sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z) -{ - return svmla_n_f32_x (pg, z, y, x); -} - -static inline sv_u32_t -sv_as_u32_f32 (sv_f32_t x) -{ - return svreinterpret_u32_f32 (x); -} - -static inline sv_f32_t -sv_as_f32_u32 (sv_u32_t x) +static inline svint32_t +sv_s32 (int32_t x) { - return svreinterpret_f32_u32 (x); + return svdup_s32 (x); } -static inline sv_s32_t -sv_as_s32_u32 (sv_u32_t x) +static inline svuint32_t +sv_u32 (uint32_t x) { - return svreinterpret_s32_u32 (x); + return svdup_u32 (x); } -static inline sv_f32_t -sv_to_f32_s32_x (svbool_t pg, sv_s32_t s) +static inline svfloat32_t +sv_f32 (float x) { - return svcvt_f32_x (pg, s); + return svdup_f32 (x); } -static inline sv_s32_t -sv_to_s32_f32_x (svbool_t pg, sv_f32_t x) -{ - return svcvt_s32_f32_x (pg, x); -} - -static inline sv_f32_t -sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) +static inline svfloat32_t +sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp) { svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { - f32_t elem = svclastb_n_f32 (p, 0, x); + float elem = svclastb (p, 0, x); elem = (*f) (elem); - sv_f32_t y2 = svdup_n_f32 (elem); - y = svsel_f32 (p, y2, y); + svfloat32_t y2 = sv_f32 (elem); + y = svsel (p, y2, y); p = svpnext_b32 (cmp, p); } return y; } -static inline sv_f32_t -sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y, - svbool_t cmp) +static inline svfloat32_t +sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2, + svfloat32_t y, svbool_t cmp) { svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { - f32_t elem1 = svclastb_n_f32 (p, 0, x1); - f32_t elem2 = svclastb_n_f32 (p, 0, x2); - f32_t ret = (*f) (elem1, elem2); - sv_f32_t y2 = svdup_n_f32 (ret); - y = svsel_f32 (p, y2, y); + float elem1 = svclastb (p, 0, x1); + float elem2 = svclastb (p, 0, x2); + float ret = (*f) (elem1, elem2); + svfloat32_t y2 = sv_f32 (ret); + y = svsel (p, y2, y); p = svpnext_b32 (cmp, p); } return y; } - -#endif #endif + #endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_pow_1u5.c b/contrib/arm-optimized-routines/pl/math/sv_pow_1u5.c new file mode 100644 index 000000000000..0838810206a1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_pow_1u5.c @@ -0,0 +1,444 @@ +/* + * Double-precision SVE pow(x, y) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* This version share a similar algorithm as AOR scalar pow. 
+ + The core computation consists of computing pow(x, y) as + + exp (y * log (x)). + + The algorithms for exp and log are very similar to scalar exp and log. + The log relies on table lookup for 3 variables and an order-8 polynomial. + It returns a high and a low contribution that are then passed to the exp, + to minimise the loss of accuracy in both routines. + The exp is based on an 8-bit table lookup for scale and an order-4 polynomial. + The SVE algorithm drops the tail in the exp computation at the price of + lower accuracy, slightly above 1 ULP. + The SVE algorithm also drops the special treatment of small (< 2^-65) and + large (> 2^63) finite values of |y|, as they only affect non-round-to-nearest + modes. + + Maximum measured error is 1.04 ULPs: + SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12) + got 0x1.f7116284221fcp-1 + want 0x1.f7116284221fdp-1. */ + +/* Data is defined in v_pow_log_data.c. */ +#define N_LOG (1 << V_POW_LOG_TABLE_BITS) +#define A __v_pow_log_data.poly +#define Off 0x3fe6955500000000 + +/* Data is defined in v_pow_exp_data.c. */ +#define N_EXP (1 << V_POW_EXP_TABLE_BITS) +#define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +#define C __v_pow_exp_data.poly +#define SmallExp 0x3c9 /* top12(0x1p-54). */ +#define BigExp 0x408 /* top12(512.). */ +#define ThresExp 0x03f /* BigExp - SmallExp. */ +#define HugeExp 0x409 /* top12(1024.). */ + +/* Constants associated with pow. */ +#define SmallPowX 0x001 /* top12(0x1p-126). */ +#define BigPowX 0x7ff /* top12(INFINITY). */ +#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */ +#define BigPowY 0x43e /* top12(0x1.749p62). */ +#define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + +/* Check if x is an integer. */ +static inline svbool_t +sv_isint (svbool_t pg, svfloat64_t x) +{ + return svcmpeq (pg, svrintz_z (pg, x), x); +} + +/* Check if x is real not integer valued. */ +static inline svbool_t +sv_isnotint (svbool_t pg, svfloat64_t x) +{ + return svcmpne (pg, svrintz_z (pg, x), x); +} + +/* Check if x is an odd integer. */ +static inline svbool_t +sv_isodd (svbool_t pg, svfloat64_t x) +{ + svfloat64_t y = svmul_x (pg, x, 0.5); + return sv_isnotint (pg, y); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Top 12 bits (sign and exponent of each double float lane). */ +static inline svuint64_t +sv_top12 (svfloat64_t x) +{ + return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline svbool_t +sv_zeroinfnan (svbool_t pg, svuint64_t i) +{ + return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), + 2 * asuint64 (INFINITY) - 1); +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflowed into the sign bit, so it needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale; positive k here means the result may overflow and + negative k means the result may underflow. */
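The checkint classification above is what drives the sign handling for negative x: pow(-|x|, y) is only real when y is an integer, and only negative when y is an odd integer. A quick scalar sanity check of the classification (a sketch assuming the checkint above is in scope; the asu64 helper is illustrative):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint64_t
asu64 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof (u));	/* Bit representation of x.  */
  return u;
}

int
main (void)
{
  assert (checkint (asu64 (2.5)) == 0);	   /* Not an integer.  */
  assert (checkint (asu64 (3.0)) == 1);	   /* Odd integer.  */
  assert (checkint (asu64 (8.0)) == 2);	   /* Even integer.  */
  assert (checkint (asu64 (0x1p53)) == 2); /* Values >= 2^53 are even.  */
  return 0;
}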
+static inline double +specialcase (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale; + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + return 0x1p1009 * (scale + scale * tmp); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + double y = scale + scale * tmp; + return 0x1p-1022 * y; +} + +/* Scalar fallback for special cases of SVE pow's exp. */ +static inline svfloat64_t +sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, + svfloat64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + double sx1 = svclastb (p, 0, x1); + uint64_t su1 = svclastb (p, 0, u1); + uint64_t su2 = svclastb (p, 0, u2); + double elem = specialcase (sx1, su1, su2); + svfloat64_t y2 = sv_f64 (elem); + y = svsel (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + 15 bits of additional precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline svfloat64_t +sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) +{ + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), + sv_u64 (N_LOG - 1)); + svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); + svfloat64_t z = svreinterpret_f64 (iz); + svfloat64_t kd = svcvt_f64_x (pg, k); + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + /* SVE lookup requires 3 separate lookup tables, as opposed to the scalar + version, which uses an array of structures. We also do the lookup earlier + in the code to make sure it finishes as early as possible. */ + svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i); + svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i); + svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i); + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */ + svfloat64_t r = svmad_x (pg, z, invc, -1.0); + /* k*Ln2 + log(c) + r. */ + svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); + svfloat64_t t2 = svadd_x (pg, t1, r); + svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); + svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ + svfloat64_t ar2 = svmul_x (pg, r, ar); + svfloat64_t ar3 = svmul_x (pg, r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + svfloat64_t hi = svadd_x (pg, t2, ar2); + svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); + svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ + /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * + A[6])))).
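The t1/t2/lo* bookkeeping in sv_log_inline is standard compensated arithmetic. A scalar sketch of the building block (Dekker's fast two-sum, valid when |a| >= |b|) shows exactly what lo2 = (t1 - t2) + r captures:

/* Sketch, not part of the patch: for |a| >= |b|, s + err == a + b
   exactly, so the rounding error of the sum is retained in err and can
   be folded back into the low part later.  */
static void
fast_two_sum (double a, double b, double *s, double *err)
{
  *s = a + b;
  *err = (a - *s) + b;
}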
*/ + svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); + svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); + svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); + svfloat64_t p = svmla_x (pg, a34, ar2, a56); + p = svmla_x (pg, a12, ar2, p); + p = svmul_x (pg, ar3, p); + svfloat64_t lo = svadd_x ( + pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + svfloat64_t y = svadd_x (pg, hi, lo); + *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline svfloat64_t +sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias) +{ + /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) + and other cases of large values of x (scale * (1 + TMP) oflow). */ + svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff); + /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ + svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); + + /* Conditions special, uflow and oflow are all expressed as uoflow && + something, hence do not bother computing anything if no lane in uoflow is + true. */ + svbool_t special = svpfalse_b (); + svbool_t uflow = svpfalse_b (); + svbool_t oflow = svpfalse_b (); + if (unlikely (svptest_any (pg, uoflow))) + { + /* |x| is tiny (|x| <= 0x1p-54). */ + uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + uflow = svand_z (pg, uoflow, uflow); + /* |x| is huge (|x| >= 1024). */ + oflow = svcmpge (pg, abstop, HugeExp); + oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow + or underflow. */ + special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); + svfloat64_t kd = svadd_x (pg, z, shift); + svuint64_t ki = svreinterpret_u64 (kd); + kd = svsub_x (pg, kd, shift); + svfloat64_t r = x; + r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); + r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. */ + svuint64_t idx = svand_x (pg, ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + sbits = svadd_x (pg, sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); + tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); + tmp = svmla_x (pg, r, r2, tmp); + svfloat64_t scale = svreinterpret_f64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, tmp); + + /* Update result with special and large cases. */ + if (unlikely (svptest_any (pg, special))) + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. 
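The kd/ki manipulation in sv_exp_inline above is the usual shift-based round-to-nearest trick. A scalar sketch (assuming round-to-nearest mode and |z| well below 2^51):

/* Adding 0x1.8p52 forces the integer part of z into the low mantissa
   bits, so reinterpreting the double yields round(z) directly in the
   low bits; subtracting the shift back gives round(z) as a double with
   no int conversion.  Callers mask the low bits they need (e.g. the
   table index ki & (N - 1)).  */
#include <stdint.h>
#include <string.h>

static uint64_t
round_to_int_shift (double z, double *kd)
{
  double shift = 0x1.8p52;
  double t = z + shift;
  uint64_t ki;
  memcpy (&ki, &t, sizeof (ki)); /* Low bits hold round(z).  */
  *kd = t - shift;		 /* round(z) as a double.  */
  return ki;
}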
*/ + svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); + svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); + svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + z = svsel (oflow, res_uoflow, z); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + z = svsel (uflow, res_spurious_uflow, z); + + return z; +} + +static inline double +pow_sc (double x, double y) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + /* Special cases: |x| or |y| is 0, inf or nan. */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double_t x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + x2 = -x2; + /* Without the barrier some versions of clang hoist the 1/x2 and + thus division by zero exception can be signaled spuriously. */ + return (iy >> 63) ? opt_barrier_double (1 / x2) : x2; + } + return x; +} + +svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) +{ + /* This preamble handles special case conditions used in the final scalar + fallbacks. It also updates ix and sign_bias, that are used in the core + computation too, i.e., exp( y * log (x) ). */ + svuint64_t vix0 = svreinterpret_u64 (x); + svuint64_t viy0 = svreinterpret_u64 (y); + svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); + + /* Negative x cases. */ + svuint64_t sign_bit = svlsr_m (pg, vix0, 63); + svbool_t xisneg = svcmpeq (pg, sign_bit, 1); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yisnotint_xisneg = svpfalse_b (); + svuint64_t sign_bias = sv_u64 (0); + svuint64_t vix = vix0; + svuint64_t vtopx1 = vtopx0; + if (unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yisnotint_xisneg = sv_isnotint (xisneg, y); + svbool_t yisint_xisneg = sv_isint (xisneg, y); + svbool_t yisodd_xisneg = sv_isodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); + vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t special = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); + svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); + if (unlikely (svptest_any (pg, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. 
*/ + svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); + + svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); + vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); + vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52); + vix = svsel (topx_is_null, vix_norm, vix); + } + + /* y_hi = log(ix, &y_lo). */ + svfloat64_t vlo; + svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); + + /* z = exp(y_hi, y_lo, sign_bias). */ + svfloat64_t vehi = svmul_x (pg, y, vhi); + svfloat64_t velo = svmul_x (pg, y, vlo); + svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); + velo = svsub_x (pg, velo, vemi); + svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); + + /* Cases of finite y and finite negative x. */ + vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); + + /* Cases of zero/inf/nan x or y. */ + if (unlikely (svptest_any (pg, special))) + vz = sv_call2_f64 (pow_sc, x, y, vz, special); + + return vz; +} + +PL_SIG (SV, D, 2, pow) +PL_TEST_ULP (SV_NAME_D2 (pow), 0.55) +/* Wide intervals spanning the whole domain but shared between x and y. */ +#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +SV_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +/* around estimated argmaxs of ULP error. */ +SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* |x| is inf, y is odd or even integer, or y is real not integer. */ +SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1) +SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1) +SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1) +SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1) +/* 0.0^y. */ +SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000) +/* 1.0^y. */ +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c b/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c new file mode 100644 index 000000000000..2db0636aea62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c @@ -0,0 +1,360 @@ +/* + * Single-precision SVE powf function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* The following data is used in the SVE pow core computation + and special case detection. */ +#define Tinvc __v_powf_data.invc +#define Tlogc __v_powf_data.logc +#define Texp __v_powf_data.scale +#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) +#define Shift 0x1.8p52 +#define Norm 0x1p23f /* 0x4b000000. */ + +/* Overall ULP error bound for pow is 2.6 ulp + ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */ +static const struct data +{ + double log_poly[4]; + double exp_poly[3]; + float uflow_bound, oflow_bound, small_bound; + uint32_t sign_bias, sign_mask, subnormal_bias, off; +} data = { + /* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of + V_POWF_EXP2_N. */ + .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3, + -0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 }, + /* rel err: 1.69 * 2^-34. */ + .exp_poly = { + 0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3. */ + 0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2. */ + 0x1.62e42ff0c52d6p-6, /* A3 / V_POWF_EXP2_N. */ + }, + .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N. */ + .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N. */ + .small_bound = 0x1p-126f, + .off = 0x3f35d000, + .sign_bias = SignBias, + .sign_mask = 0x80000000, + .subnormal_bias = 0x0b800000, /* 23 << 23. */ +}; + +#define A(i) sv_f64 (d->log_poly[i]) +#define C(i) sv_f64 (d->exp_poly[i]) + +/* Check if x is an integer. */ +static inline svbool_t +svisint (svbool_t pg, svfloat32_t x) +{ + return svcmpeq (pg, svrintz_z (pg, x), x); +} + +/* Check if x is real not integer valued. */ +static inline svbool_t +svisnotint (svbool_t pg, svfloat32_t x) +{ + return svcmpne (pg, svrintz_z (pg, x), x); +} + +/* Check if x is an odd integer. */ +static inline svbool_t +svisodd (svbool_t pg, svfloat32_t x) +{ + svfloat32_t y = svmul_x (pg, x, 0.5f); + return svisnotint (pg, y); +} + +/* Check if zero, inf or nan. */ +static inline svbool_t +sv_zeroinfnan (svbool_t pg, svuint32_t i) +{ + return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), + 2u * 0x7f800000 - 1); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint32_t iy) +{ + int e = iy >> 23 & 0xff; + if (e < 0x7f) + return 0; + if (e > 0x7f + 23) + return 2; + if (iy & ((1 << (0x7f + 23 - e)) - 1)) + return 0; + if (iy & (1 << (0x7f + 23 - e))) + return 1; + return 2; +} + +/* Check if zero, inf or nan. */ +static inline int +zeroinfnan (uint32_t ix) +{ + return 2 * ix - 1 >= 2u * 0x7f800000 - 1; +} + +/* A scalar subroutine used to fix main power special cases. Similar to the + preamble of finite_powf except that we do not update ix and sign_bias. This + is done in the preamble of the SVE powf. */ +static inline float +powf_specialcase (float x, float y, float z) +{ + uint32_t ix = asuint (x); + uint32_t iy = asuint (y); + /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignalingf_inline (x) ? x + y : 1.0f; + if (ix == 0x3f800000) + return issignalingf_inline (y) ? x + y : 1.0f; + if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000) + return x + y; + if (2 * ix == 2 * 0x3f800000) + return 1.0f; + if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000)) + return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.
*/ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + float_t x2 = x * x; + if (ix & 0x80000000 && checkint (iy) == 1) + x2 = -x2; + return iy & 0x80000000 ? 1 / x2 : x2; + } + /* We need a return here in case x<0 and y is integer, but all other tests + need to be run. */ + return z; +} + +/* Scalar fallback for special case routines with custom signature. */ +static inline svfloat32_t +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + float sx1 = svclastb (p, 0, x1); + float sx2 = svclastb (p, 0, x2); + float elem = svclastb (p, 0, y); + elem = powf_specialcase (sx1, sx2, elem); + svfloat32_t y2 = sv_f32 (elem); + y = svsel (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +/* Compute core for half of the lanes in double precision. */ +static inline svfloat64_t +sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, + svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx, + const struct data *d) +{ + svfloat64_t invc = svld1_gather_index (pg, Tinvc, i); + svfloat64_t logc = svld1_gather_index (pg, Tlogc, i); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc); + svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k)); + + /* Polynomial to approximate log1p(r)/ln2. */ + svfloat64_t logx = A (0); + logx = svmla_x (pg, A (1), r, logx); + logx = svmla_x (pg, A (2), r, logx); + logx = svmla_x (pg, A (3), r, logx); + logx = svmla_x (pg, y0, r, logx); + *pylogx = svmul_x (pg, y, logx); + + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svadd_x (pg, *pylogx, Shift); + svuint64_t ki = svreinterpret_u64 (kd); + kd = svsub_x (pg, kd, Shift); + + r = svsub_x (pg, *pylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + svuint64_t t + = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (pg, ki, sign_bias); + t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svfloat64_t s = svreinterpret_f64 (t); + + svfloat64_t p = C (0); + p = svmla_x (pg, C (1), p, r); + p = svmla_x (pg, C (2), p, r); + p = svmla_x (pg, s, p, svmul_x (pg, s, r)); + + return p; +} + +/* Widen vector to double precision and compute core on both halves of the + vector. Lower cost of promotion by considering all lanes active. */ +static inline svfloat32_t +sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, + svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx, + const struct data *d) +{ + const svbool_t ptrue = svptrue_b64 (); + + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in + order to perform core computation in double precision. 
*/ + const svbool_t pg_lo = svunpklo (pg); + const svbool_t pg_hi = svunpkhi (pg); + svfloat64_t y_lo = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat32_t z = svreinterpret_f32 (iz); + svfloat64_t z_lo = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); + svfloat64_t z_hi = svcvt_f64_x ( + ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); + svuint64_t i_lo = svunpklo (i); + svuint64_t i_hi = svunpkhi (i); + svint64_t k_lo = svunpklo (k); + svint64_t k_hi = svunpkhi (k); + svuint64_t sign_bias_lo = svunpklo (sign_bias); + svuint64_t sign_bias_hi = svunpkhi (sign_bias); + + /* Compute each part in double precision. */ + svfloat64_t ylogx_lo, ylogx_hi; + svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo, + sign_bias_lo, &ylogx_lo, d); + svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi, + sign_bias_hi, &ylogx_hi, d); + + /* Convert back to single-precision and interleave. */ + svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo); + svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi); + *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32); + svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo); + svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi); + return svuzp1 (lo_32, hi_32); +} + +/* Implementation of SVE powf. + Provides the same accuracy as AdvSIMD powf, since it relies on the same + algorithm. The theoretical maximum error is under 2.60 ULPs. + Maximum measured error is 2.56 ULPs: + SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 + want 0x1.fd4b06p+127. */ +svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t vix0 = svreinterpret_u32 (x); + svuint32_t viy0 = svreinterpret_u32 (y); + + /* Negative x cases. */ + svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); + svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yisnotint_xisneg = svpfalse_b (); + svuint32_t sign_bias = sv_u32 (0); + svuint32_t vix = vix0; + if (unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yisnotint_xisneg = svisnotint (xisneg, y); + svbool_t yisint_xisneg = svisint (xisneg, y); + svbool_t yisodd_xisneg = svisodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t cmp = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svbool_t xsmall = svaclt (pg, x, d->small_bound); + if (unlikely (svptest_any (pg, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. */ + svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); + vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff); + vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias); + vix = svsel (xsmall, vix_norm, vix); + } + /* Part of core computation carried in working precision. 
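Before the exp special cases, the single-precision core splits the input bits into a table index and an exponent, as performed in the lines that follow. A scalar model of that split (the table sizes here are assumed values for illustration; the patch also keeps k pre-scaled by 2^V_POWF_EXP2_TABLE_BITS rather than shifting all the way down to the unbiased exponent):

#include <stdint.h>

#define LOG2_TABLE_BITS 4	/* Assumed stand-in for V_POWF_LOG2_TABLE_BITS.  */
#define OFF 0x3f35d000u		/* The patch's off constant.  */

static void
log2_split (uint32_t ix, uint32_t *i, uint32_t *iz, int32_t *k)
{
  uint32_t tmp = ix - OFF;
  /* Top mantissa bits of tmp select the lookup-table entry.  */
  *i = (tmp >> (23 - LOG2_TABLE_BITS)) & ((1u << LOG2_TABLE_BITS) - 1);
  uint32_t top = tmp & 0xff800000;  /* Sign and exponent bits of tmp.  */
  *iz = ix - top;		    /* z, scaled back near 1.0.  */
  *k = (int32_t) top >> 23;	    /* Exponent k (arithmetic shift).  */
}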
*/ + svuint32_t tmp = svsub_x (pg, vix, d->off); + svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + V_POWF_LOG2_N - 1); + svuint32_t top = svand_x (pg, tmp, 0xff800000); + svuint32_t iz = svsub_x (pg, vix, top); + svint32_t k + = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); + + /* Compute core in extended precision and return intermediate ylogx results to + handle cases of underflow and overflow in exp. */ + svfloat32_t ylogx; + svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); + + /* Handle exp special cases of underflow and overflow. */ + svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); + svfloat32_t ret_oflow + = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); + svfloat32_t ret_uflow = svreinterpret_f32 (sign); + ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); + ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); + + /* Cases of finite y and finite negative x. */ + ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); + + if (unlikely (svptest_any (pg, cmp))) + return sv_call_powf_sc (x, y, ret, cmp); + + return ret; +} + +PL_SIG (SV, F, 2, pow) +PL_TEST_ULP (SV_NAME_F2 (pow), 2.06) +/* Wide intervals spanning the whole domain but shared between x and y. */ +#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n) +SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000) +SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000) +SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000) +/* x~1 or y~1. */ +SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +/* around estimated argmaxs of ULP error. */ +SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* |x| is inf, y is odd or even integer, or y is real not integer. */ +SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1) +SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1) +SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1) +SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1) +/* 0.0^y. */ +SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000) +/* 1.0^y. */ +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_powi.c b/contrib/arm-optimized-routines/pl/math/sv_powi.c index 1bb0eb3d3498..e53bf2195533 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_powi.c +++ b/contrib/arm-optimized-routines/pl/math/sv_powi.c @@ -1,53 +1,48 @@ /* * Double-precision SVE powi(x, n) function. * * Copyright (c) 2020-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#if SV_SUPPORTED /* Optimized double-precision vector powi (double base, long integer power). powi is developed for environments in which accuracy is of much less importance than performance, hence we provide no estimate for worst-case error. */ svfloat64_t -__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p) +_ZGVsMxvv_powk (svfloat64_t as, svint64_t ns, svbool_t p) { /* Compute powi by successive squaring, right to left. */ - svfloat64_t acc = svdup_n_f64 (1.0); - svbool_t want_recip = svcmplt_n_s64 (p, ns, 0); - svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns)); + svfloat64_t acc = sv_f64 (1.0); + svbool_t want_recip = svcmplt (p, ns, 0); + svuint64_t ns_abs = svreinterpret_u64 (svabs_x (p, ns)); /* We use a max to avoid needing to check whether any lane != 0 on each iteration. */ - uint64_t max_n = svmaxv_u64 (p, ns_abs); + uint64_t max_n = svmaxv (p, ns_abs); svfloat64_t c = as; /* Successively square c, and use merging predication (_m) to determine whether or not to perform the multiplication or keep the previous iteration. */ while (true) { - svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull); - acc = svmul_f64_m (px, acc, c); + svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1ull), 1ull); + acc = svmul_m (px, acc, c); max_n >>= 1; if (max_n == 0) break; - ns_abs = svlsr_n_u64_x (p, ns_abs, 1); - c = svmul_f64_x (p, c, c); + ns_abs = svlsr_x (p, ns_abs, 1); + c = svmul_x (p, c, c); } /* Negative powers are handled by computing the abs(n) version and then taking the reciprocal. */ if (svptest_any (want_recip, want_recip)) - acc = svdivr_n_f64_m (want_recip, acc, 1.0); + acc = svdivr_m (want_recip, acc, 1.0); return acc; } - -strong_alias (__sv_powi_x, _ZGVsMxvv_powk) - -#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_powif.c b/contrib/arm-optimized-routines/pl/math/sv_powif.c index d0567e393927..7e032fd86a20 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_powif.c +++ b/contrib/arm-optimized-routines/pl/math/sv_powif.c @@ -1,54 +1,48 @@ /* * Single-precision SVE powi(x, n) function. * * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#if SV_SUPPORTED /* Optimized single-precision vector powi (float base, integer power). powi is developed for environments in which accuracy is of much less importance than performance, hence we provide no estimate for worst-case error. */ svfloat32_t -__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p) +_ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p) { /* Compute powi by successive squaring, right to left. */ - svfloat32_t acc = svdup_n_f32 (1.f); - svbool_t want_recip = svcmplt_n_s32 (p, ns, 0); - svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns)); + svfloat32_t acc = sv_f32 (1.f); + svbool_t want_recip = svcmplt (p, ns, 0); + svuint32_t ns_abs = svreinterpret_u32 (svabs_x (p, ns)); /* We use a max to avoid needing to check whether any lane != 0 on each iteration. */ - uint32_t max_n = svmaxv_u32 (p, ns_abs); + uint32_t max_n = svmaxv (p, ns_abs); svfloat32_t c = as; /* Successively square c, and use merging predication (_m) to determine whether or not to perform the multiplication or keep the previous iteration. 
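The loop below is plain binary exponentiation, vectorised with merging predication. In scalar form the same scheme (a sketch, with the same reciprocal handling for negative powers) is:

/* Compute x^n by successive squaring, right to left: multiply in c for
   every set bit of |n|, squaring c at each step; take the reciprocal at
   the end for negative n.  */
static double
powi_model (double x, long n)
{
  unsigned long un = n < 0 ? -(unsigned long) n : (unsigned long) n;
  double acc = 1.0, c = x;
  while (un)
    {
      if (un & 1)
	acc *= c;
      un >>= 1;
      c *= c;
    }
  return n < 0 ? 1.0 / acc : acc;
}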
*/ while (true) { - svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1); - acc = svmul_f32_m (px, acc, c); + svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1), 1); + acc = svmul_m (px, acc, c); max_n >>= 1; if (max_n == 0) break; - ns_abs = svlsr_n_u32_x (p, ns_abs, 1); - c = svmul_f32_x (p, c, c); + ns_abs = svlsr_x (p, ns_abs, 1); + c = svmul_x (p, c, c); } /* Negative powers are handled by computing the abs(n) version and then taking the reciprocal. */ if (svptest_any (want_recip, want_recip)) - acc = svdivr_n_f32_m (want_recip, acc, 1.0f); + acc = svdivr_m (want_recip, acc, 1.0f); return acc; } - -/* Note no trailing f for ZGV... name - 64-bit integer version is powk. */ -strong_alias (__sv_powif_x, _ZGVsMxvv_powi) - -#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c b/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c deleted file mode 100644 index 3fee08061918..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Double-precision SVE sin(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if SV_SUPPORTED - -#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) -#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) -#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) -#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) -#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) -#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) -#define Shift (sv_f64 (0x1.8p52)) -#define RangeVal (sv_f64 (0x1p23)) -#define AbsMask (0x7fffffffffffffff) - -static NOINLINE sv_f64_t -__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) -{ - return sv_call_f64 (sin, x, y, cmp); -} - -/* A fast SVE implementation of sin based on trigonometric - instructions (FTMAD, FTSSEL, FTSMUL). - Maximum observed error in 2.52 ULP: - __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 - want 0x1.10ace8f3e7868p-40. */ -sv_f64_t -__sv_sin_x (sv_f64_t x, const svbool_t pg) -{ - sv_f64_t n, r, r2, y; - sv_u64_t sign; - svbool_t cmp; - - r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); - sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); - cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); - - /* n = rint(|x|/(pi/2)). */ - sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); - n = svsub_f64_x (pg, q, Shift); - - /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ - r = sv_fma_f64_x (pg, NegPio2_1, n, r); - r = sv_fma_f64_x (pg, NegPio2_2, n, r); - r = sv_fma_f64_x (pg, NegPio2_3, n, r); - - /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); - - /* sin(r) poly approx. */ - r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); - y = sv_f64 (0.0); - y = svtmad_f64 (y, r2, 7); - y = svtmad_f64 (y, r2, 6); - y = svtmad_f64 (y, r2, 5); - y = svtmad_f64 (y, r2, 4); - y = svtmad_f64 (y, r2, 3); - y = svtmad_f64 (y, r2, 2); - y = svtmad_f64 (y, r2, 1); - y = svtmad_f64 (y, r2, 0); - - /* Apply factor. */ - y = svmul_f64_x (pg, f, y); - - /* sign = y^sign. */ - y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); - - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. 
*/ - if (unlikely (svptest_any (pg, cmp))) - return __sv_sin_specialcase (x, y, cmp); - return y; -} - -PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin) - -PL_SIG (SV, D, 1, sin, -3.1, 3.1) -PL_TEST_ULP (__sv_sin, 2.03) -PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_sin_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_sin_3u5.c new file mode 100644 index 000000000000..a81f3fc80f3d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sin_3u5.c @@ -0,0 +1,96 @@ +/* + * Double-precision SVE sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + double inv_pi, pi_1, pi_2, pi_3, shift, range_val; + double poly[7]; +} data = { + .poly = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13, + 0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26, + 0x1.60e277ae07cecp-33, -0x1.9e9540300a1p-41, }, + + .inv_pi = 0x1.45f306dc9c883p-2, + .pi_1 = 0x1.921fb54442d18p+1, + .pi_2 = 0x1.1a62633145c06p-53, + .pi_3 = 0x1.c1cd129024e09p-106, + .shift = 0x1.8p52, + .range_val = 0x1p23, +}; + +#define C(i) sv_f64 (d->poly[i]) + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) +{ + return sv_call_f64 (sin, x, y, cmp); +} + +/* A fast SVE implementation of sin. + Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVsMxv_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVsMxv_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Load some values in quad-word chunks to minimise memory access. */ + const svbool_t ptrue = svptrue_b64 (); + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t inv_pi_and_pi1 = svld1rq (ptrue, &d->inv_pi); + svfloat64_t pi2_and_pi3 = svld1rq (ptrue, &d->pi_2); + + /* n = rint(|x|/pi). */ + svfloat64_t n = svmla_lane (shift, x, inv_pi_and_pi1, 0); + svuint64_t odd = svlsl_x (pg, svreinterpret_u64 (n), 63); + n = svsub_x (pg, n, shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/2 .. pi/2). */ + svfloat64_t r = x; + r = svmls_lane (r, n, inv_pi_and_pi1, 1); + r = svmls_lane (r, n, pi2_and_pi3, 0); + r = svmls_lane (r, n, pi2_and_pi3, 1); + + /* sin(r) poly approx. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r3 = svmul_x (pg, r2, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + + svfloat64_t t1 = svmla_x (pg, C (4), C (5), r2); + svfloat64_t t2 = svmla_x (pg, C (2), C (3), r2); + svfloat64_t t3 = svmla_x (pg, C (0), C (1), r2); + + svfloat64_t y = svmla_x (pg, t1, C (6), r4); + y = svmla_x (pg, t2, y, r4); + y = svmla_x (pg, t3, y, r4); + y = svmla_x (pg, r, y, r3); + + svbool_t cmp = svacle (pg, x, d->range_val); + cmp = svnot_z (pg, cmp); + if (unlikely (svptest_any (pg, cmp))) + return special_case (x, + svreinterpret_f64 (sveor_z ( + svnot_z (pg, cmp), svreinterpret_u64 (y), odd)), + cmp); + + /* Copy sign. 
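The polynomial section above evaluates the degree-6 polynomial in r2 pairwise rather than by plain Horner, which shortens the dependency chain on superscalar cores. In scalar form the scheme used is:

/* Sketch: sin(r) ~ r + r^3 * P(r^2) with P evaluated pairwise, using
   the patch's poly[] coefficients.  Terms are paired in r2 and the
   pairs combined with r4, mirroring the t1/t2/t3 code above.  */
static double
sin_poly_model (double r, const double poly[7])
{
  double r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
  double t1 = poly[4] + poly[5] * r2;
  double t2 = poly[2] + poly[3] * r2;
  double t3 = poly[0] + poly[1] * r2;
  double y = t1 + poly[6] * r4;
  y = t2 + y * r4;
  y = t3 + y * r4;
  return r + y * r3;
}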
*/ + return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd)); +} + +PL_SIG (SV, D, 1, sin, -3.1, 3.1) +PL_TEST_ULP (SV_NAME_D1 (sin), 2.73) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincos_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_sincos_3u5.c new file mode 100644 index 000000000000..f73550082d5b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sincos_3u5.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector sincos function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincos declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include <math.h> +#undef _GNU_SOURCE + +#include "sv_sincos_common.h" +#include "sv_math.h" +#include "pl_test.h" + +static void NOINLINE +special_case (svfloat64_t x, svbool_t special, double *out_sin, + double *out_cos) +{ + svbool_t p = svptrue_pat_b64 (SV_VL1); + for (int i = 0; i < svcntd (); i++) + { + if (svptest_any (special, p)) + sincos (svlastb (p, x), out_sin + i, out_cos + i); + p = svpnext_b64 (svptrue_b64 (), p); + } +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + sv_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +void +_ZGVsMxvl8l8_sincos (svfloat64_t x, double *out_sin, double *out_cos, + svbool_t pg) +{ + const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat64x2_t sc = sv_sincos_inline (pg, x, d); + + svst1 (pg, out_sin, svget2 (sc, 0)); + svst1 (pg, out_cos, svget2 (sc, 1)); + + if (unlikely (svptest_any (pg, special))) + special_case (x, special, out_sin, out_cos); +} + +PL_TEST_ULP (_ZGVsMxv_sincos_sin, 2.73) +PL_TEST_ULP (_ZGVsMxv_sincos_cos, 2.73) +#define SV_SINCOS_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n) +SV_SINCOS_INTERVAL (0, 0x1p23, 500000) +SV_SINCOS_INTERVAL (-0, -0x1p23, 500000) +SV_SINCOS_INTERVAL (0x1p23, inf, 10000) +SV_SINCOS_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincos_common.h b/contrib/arm-optimized-routines/pl/math/sv_sincos_common.h new file mode 100644 index 000000000000..f7b58deb90bd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sincos_common.h @@ -0,0 +1,85 @@ +/* + * Core approximation for double-precision vector sincos + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" + +static const struct sv_sincos_data +{ + double sin_poly[7], cos_poly[6], pio2[3]; + double inv_pio2, shift, range_val; +} sv_sincos_data = { + .inv_pio2 = 0x1.45f306dc9c882p-1, + .pio2 = { 0x1.921fb50000000p+0, 0x1.110b460000000p-26, + 0x1.1a62633145c07p-54 }, + .shift = 0x1.8p52, + .sin_poly = { /* Computed using Remez in [-pi/2, pi/2].
*/ + -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, + -0x1.a01a019936f27p-13, 0x1.71de37a97d93ep-19, + -0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33, + -0x1.9e9540300a1p-41 }, + .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */ + 0x1.555555555554cp-5, -0x1.6c16c16c1521fp-10, + 0x1.a01a019cbf62ap-16, -0x1.27e4f812b681ep-22, + 0x1.1ee9f152a57cdp-29, -0x1.8fb131098404bp-37 }, + .range_val = 0x1p23, }; + +static inline svbool_t +check_ge_rangeval (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d) +{ + svbool_t in_bounds = svaclt (pg, x, d->range_val); + return svnot_z (pg, in_bounds); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +static inline svfloat64x2_t +sv_sincos_inline (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d) +{ + /* q = nearest integer to 2 * x / pi. */ + svfloat64_t q = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_pio2), + d->shift); + svint64_t n = svcvt_s64_x (pg, q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + svfloat64_t r = x; + r = svmls_x (pg, r, q, d->pio2[0]); + r = svmls_x (pg, r, q, d->pio2[1]); + r = svmls_x (pg, r, q, d->pio2[2]); + + svfloat64_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r2, r), + r4 = svmul_x (pg, r2, r2); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + svfloat64_t s = sv_pw_horner_6_f64_x (pg, r2, r4, d->sin_poly); + s = svmla_x (pg, r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + svfloat64_t c = sv_pw_horner_5_f64_x (pg, r2, r4, d->cos_poly); + c = svmad_x (pg, c, r2, -0.5); + c = svmad_x (pg, c, r2, 1); + + svuint64_t un = svreinterpret_u64 (n); + /* If odd quadrant, swap cos and sin. */ + svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 63), 0); + svfloat64_t ss = svsel (swap, s, c); + svfloat64_t cc = svsel (swap, c, s); + + /* Fix signs according to quadrant. + ss = asdouble(asuint64(ss) ^ ((n & 2) << 62)) + cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)). */ + svuint64_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 62); + svuint64_t cos_sign = svlsl_x ( + pg, svand_x (pg, svreinterpret_u64 (svadd_x (pg, n, 1)), 2), 62); + ss = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ss), sin_sign)); + cc = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (cc), cos_sign)); + + return svcreate2 (ss, cc); +} diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincosf_1u8.c b/contrib/arm-optimized-routines/pl/math/sv_sincosf_1u8.c new file mode 100644 index 000000000000..c335de8d3dbb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sincosf_1u8.c @@ -0,0 +1,62 @@ +/* + * Single-precision vector sincos function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincosf declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/.
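The swap/sign selection in sv_sincos_inline above reduces to a two-bit quadrant decision on n. A scalar model (illustrative only, not part of the patch):

/* After reducing x = n*(pi/2) + r, sin(x) and cos(x) are each one of
   {+/-sin(r), +/-cos(r)}, chosen purely from the low two bits of n.
   The vector code applies the negations by XORing the sign bit.  */
static void
sincos_quadrant_model (double s, double c, long n, double *sinx,
		       double *cosx)
{
  double ss = (n & 1) ? c : s;	/* Odd quadrant: swap sin and cos.  */
  double cc = (n & 1) ? s : c;
  if (n & 2)
    ss = -ss;			/* Quadrants 2 and 3 negate sin.  */
  if ((n + 1) & 2)
    cc = -cc;			/* Quadrants 1 and 2 negate cos.  */
  *sinx = ss;
  *cosx = cc;
}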
*/ +#define _GNU_SOURCE +#include <math.h> +#undef _GNU_SOURCE + +#include "sv_sincosf_common.h" +#include "sv_math.h" +#include "pl_test.h" + +static void NOINLINE +special_case (svfloat32_t x, svbool_t special, float *out_sin, float *out_cos) +{ + svbool_t p = svptrue_pat_b32 (SV_VL1); + for (int i = 0; i < svcntw (); i++) + { + if (svptest_any (special, p)) + sincosf (svlastb (p, x), out_sin + i, out_cos + i); + p = svpnext_b32 (svptrue_b32 (), p); + } +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +void +_ZGVsMxvl4l4_sincosf (svfloat32_t x, float *out_sin, float *out_cos, + svbool_t pg) +{ + const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat32x2_t sc = sv_sincosf_inline (pg, x, d); + + svst1_f32 (pg, out_sin, svget2 (sc, 0)); + svst1_f32 (pg, out_cos, svget2 (sc, 1)); + + if (unlikely (svptest_any (pg, special))) + special_case (x, special, out_sin, out_cos); +} + +PL_TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17) +PL_TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31) +#define SV_SINCOSF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n) +SV_SINCOSF_INTERVAL (0, 0x1p20, 500000) +SV_SINCOSF_INTERVAL (-0, -0x1p20, 500000) +SV_SINCOSF_INTERVAL (0x1p20, inf, 10000) +SV_SINCOSF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sincosf_common.h b/contrib/arm-optimized-routines/pl/math/sv_sincosf_common.h new file mode 100644 index 000000000000..714e996443b3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sincosf_common.h @@ -0,0 +1,81 @@ +/* + * Core approximation for single-precision vector sincos + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +const static struct sv_sincosf_data +{ + float poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; +} sv_sincosf_data = { + .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */ + -0x1.555546p-3, 0x1.11076p-7, -0x1.994eb4p-13 }, + .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */ + 0x1.55554ap-5, -0x1.6c0c1ap-10, 0x1.99e0eep-16 }, + .pio2 = { 0x1.921fb6p+0f, -0x1.777a5cp-25f, -0x1.ee59dap-50f }, + .inv_pio2 = 0x1.45f306p-1f, + .shift = 0x1.8p23, + .range_val = 0x1p20 +}; + +static inline svbool_t +check_ge_rangeval (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d) +{ + svbool_t in_bounds = svaclt (pg, x, d->range_val); + return svnot_z (pg, in_bounds); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +static inline svfloat32x2_t +sv_sincosf_inline (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d) +{ + /* n = rint ( x / (pi/2) ).
*/ + svfloat32_t q = svmla_x (pg, sv_f32 (d->shift), x, d->inv_pio2); + q = svsub_x (pg, q, d->shift); + svint32_t n = svcvt_s32_x (pg, q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + svfloat32_t r = x; + r = svmls_x (pg, r, q, d->pio2[0]); + r = svmls_x (pg, r, q, d->pio2[1]); + r = svmls_x (pg, r, q, d->pio2[2]); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + svfloat32_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r, r2); + svfloat32_t s = svmla_x (pg, sv_f32 (d->poly_sin[1]), r2, d->poly_sin[2]); + s = svmad_x (pg, r2, s, d->poly_sin[0]); + s = svmla_x (pg, r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + svfloat32_t r4 = svmul_x (pg, r2, r2); + svfloat32_t p = svmla_x (pg, sv_f32 (d->poly_cos[1]), r2, d->poly_cos[2]); + svfloat32_t c = svmad_x (pg, sv_f32 (d->poly_cos[0]), r2, -0.5); + c = svmla_x (pg, c, r4, p); + c = svmad_x (pg, r2, c, 1); + + svuint32_t un = svreinterpret_u32 (n); + /* If odd quadrant, swap cos and sin. */ + svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 31), 0); + svfloat32_t ss = svsel (swap, s, c); + svfloat32_t cc = svsel (swap, c, s); + + /* Fix signs according to quadrant. + ss = asfloat(asuint(ss) ^ ((n & 2) << 30)) + cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)). */ + svuint32_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 30); + svuint32_t cos_sign = svlsl_x ( + pg, svand_x (pg, svreinterpret_u32 (svadd_x (pg, n, 1)), 2), 30); + ss = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ss), sin_sign)); + cc = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (cc), cos_sign)); + + return svcreate2 (ss, cc); +} diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c index 9184ccd3cf0c..675d7b2480f7 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c +++ b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c @@ -1,84 +1,93 @@ /* * Single-precision SVE sin(x) function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -#define A3 (sv_f32 (__sv_sinf_data.coeffs[3])) -#define A5 (sv_f32 (__sv_sinf_data.coeffs[2])) -#define A7 (sv_f32 (__sv_sinf_data.coeffs[1])) -#define A9 (sv_f32 (__sv_sinf_data.coeffs[0])) +static const struct data +{ + float poly[4]; + /* Pi-related values to be loaded as one quad-word and used with + svmla_lane. */ + float negpi1, negpi2, negpi3, invpi; + float shift; +} data = { + .poly = { + /* Non-zero coefficients from the degree 9 Taylor series expansion of + sin. */ + -0x1.555548p-3f, 0x1.110df4p-7f, -0x1.9f42eap-13f, 0x1.5b2e76p-19f + }, + .negpi1 = -0x1.921fb6p+1f, + .negpi2 = 0x1.777a5cp-24f, + .negpi3 = 0x1.ee59dap-49f, + .invpi = 0x1.45f306p-2f, + .shift = 0x1.8p+23f +}; -#define NegPi1 (sv_f32 (-0x1.921fb6p+1f)) -#define NegPi2 (sv_f32 (0x1.777a5cp-24f)) -#define NegPi3 (sv_f32 (0x1.ee59dap-49f)) -#define RangeVal (sv_f32 (0x1p20f)) -#define InvPi (sv_f32 (0x1.45f306p-2f)) -#define Shift (sv_f32 (0x1.8p+23f)) -#define AbsMask (0x7fffffff) +#define RangeVal 0x49800000 /* asuint32 (0x1p20f). */ +#define C(i) sv_f32 (d->poly[i]) -static NOINLINE sv_f32_t -__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (sinf, x, y, cmp); } /* A fast SVE implementation of sinf. Maximum error: 1.89 ULPs.
This maximum error is achieved at multiple values in [-2^18, 2^18] but one example is: - __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */ -sv_f32_t -__sv_sinf_x (sv_f32_t x, const svbool_t pg) + SV_NAME_F1 (sin)(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */ +svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg) { - sv_f32_t n, r, r2, y; - sv_u32_t sign, odd; - svbool_t cmp; + const struct data *d = ptr_barrier (&data); - r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); - sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask); - cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + svfloat32_t ax = svabs_x (pg, x); + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax)); + svbool_t cmp = svcmpge (pg, svreinterpret_u32 (ax), RangeVal); + + /* pi_vals are a quad-word of helper values - the first 3 elements contain + -pi in extended precision, the last contains 1 / pi. */ + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1); /* n = rint(|x|/pi). */ - n = sv_fma_f32_x (pg, InvPi, r, Shift); - odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31); - n = svsub_f32_x (pg, n, Shift); + svfloat32_t n = svmla_lane (sv_f32 (d->shift), ax, pi_vals, 3); + svuint32_t odd = svlsl_x (pg, svreinterpret_u32 (n), 31); + n = svsub_x (pg, n, d->shift); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = sv_fma_f32_x (pg, NegPi1, n, r); - r = sv_fma_f32_x (pg, NegPi2, n, r); - r = sv_fma_f32_x (pg, NegPi3, n, r); + svfloat32_t r; + r = svmla_lane (ax, n, pi_vals, 0); + r = svmla_lane (r, n, pi_vals, 1); + r = svmla_lane (r, n, pi_vals, 2); /* sin(r) approx using a degree 9 polynomial from the Taylor series expansion. Note that only the odd terms of this are non-zero. */ - r2 = svmul_f32_x (pg, r, r); - y = sv_fma_f32_x (pg, A9, r2, A7); - y = sv_fma_f32_x (pg, y, r2, A5); - y = sv_fma_f32_x (pg, y, r2, A3); - y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r); + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t y; + y = svmla_x (pg, C (2), r2, C (3)); + y = svmla_x (pg, C (1), r2, y); + y = svmla_x (pg, C (0), r2, y); + y = svmla_x (pg, r, r, svmul_x (pg, y, r2)); /* sign = y^sign^odd. */ - y = sv_as_f32_u32 ( - sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd))); + sign = sveor_x (pg, sign, odd); - /* No need to pass pg to specialcase here since cmp is a strict subset, - guaranteed by the cmpge above. */ if (unlikely (svptest_any (pg, cmp))) - return __sv_sinf_specialcase (x, y, cmp); - return y; + return special_case (x, + svreinterpret_f32 (sveor_x ( + svnot_z (pg, cmp), svreinterpret_u32 (y), sign)), + cmp); + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf) - PL_SIG (SV, F, 1, sin, -3.1, 3.1) -PL_TEST_ULP (__sv_sinf, 1.40) -PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (SV_NAME_F1 (sin), 1.40) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000) +PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c deleted file mode 100644 index 1e1ab5e48df1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Data used in single-precision sin(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Polynomial coefficients for approximating sin(x) in single - precision. These are the non-zero coefficients from the - degree 9 Taylor series expansion of sin. */ - -const struct sv_sinf_data __sv_sinf_data = {.coeffs = { - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, - }}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/sv_sinh_3u.c new file mode 100644 index 000000000000..a01e19caecda --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sinh_3u.c @@ -0,0 +1,103 @@ +/* + * Double-precision SVE sinh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[11]; + float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; + uint64_t halff; + int64_t onef; + uint64_t large_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, + + .inv_ln2 = 0x1.71547652b82fep0, + .m_ln2_hi = -0x1.62e42fefa39efp-1, + .m_ln2_lo = -0x1.abc9e3b39803fp-56, + .shift = 0x1.8p52, + + .halff = 0x3fe0000000000000, + .onef = 0x3ff0000000000000, + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = 0x4080000000000000, +}; + +static inline svfloat64_t +expm1_inline (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Reduce argument: + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where i = round(x / ln2) + and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ + svfloat64_t j + = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); + svint64_t i = svcvt_s64_x (pg, j); + svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); + f = svmla_x (pg, f, j, d->m_ln2_lo); + /* Approximate expm1(f) using polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t f4 = svmul_x (pg, f2, f2); + svfloat64_t f8 = svmul_x (pg, f4, f4); + svfloat64_t p + = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); + /* t = 2^i. */ + svfloat64_t t = svscale_x (pg, sv_f64 (1), i); + /* expm1(x) ~= p * t + (t - 1). */ + return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); +} + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svbool_t pg) +{ + return sv_call_f64 (sinh, x, x, pg); +} + +/* Approximation for SVE double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 + want 0x1.ab929fc64bd63p-2. */ +svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); + + svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); + + /* Fall back to scalar variant for all lanes if any are special. 
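+     sv_call_f64 re-evaluates every active lane with scalar sinh, so this
+     effectively trades some speed on the rare |x| >= 2^9 path for
+     simplicity on the common path.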
*/
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, pg);
+
+  /* Up to the point that expm1 overflows, we can use it to calculate sinh
+     using a slight rearrangement of the definition of sinh: with
+     t = expm1(|x|), exp(|x|) - exp(-|x|) = (t + 1) - 1/(t + 1)
+     = t + t / (t + 1), so sinh(x) = (t + t / (t + 1)) / 2, with the sign
+     reapplied via halfsign.  This allows us to retain acceptable accuracy
+     for very small inputs.  */
+  svfloat64_t t = expm1_inline (ax, pg);
+  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+  return svmul_x (pg, t, halfsign);
+}
+
+PL_SIG (SV, D, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (sinh), 2.08)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/sv_sinhf_2u3.c
new file mode 100644
index 000000000000..e34ecf378ad3
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinhf_2u3.c
@@ -0,0 +1,64 @@
+/*
+ * Single-precision SVE sinh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+  struct sv_expm1f_data expm1f_consts;
+  uint32_t halff, large_bound;
+} data = {
+  .expm1f_consts = SV_EXPM1F_DATA,
+  .halff = 0x3f000000,
+  /* 0x1.61814ep+6, above which expm1f helper overflows.  */
+  .large_bound = 0x42b0c0a7,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+  return sv_call_f32 (sinhf, x, y, pg);
+}
+
+/* Approximation for SVE single-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The maximum error is 2.26 ULP:
+   _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+				 want 0x1.e469e4p-4.  */
+svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t sign
+      = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+
+  svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+
+  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+     using a slight rearrangement of the definition of sinh.  This allows us
+     to retain acceptable accuracy for very small inputs.  */
+  svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
+  t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+
+  /* Fall back to the scalar variant for any lanes which would cause
+     expm1f to overflow.  */
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, t, halfsign), special);
+
+  return svmul_x (pg, t, halfsign);
+}
+
+PL_SIG (SV, F, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (sinh), 1.76)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinpi_3u1.c b/contrib/arm-optimized-routines/pl/math/sv_sinpi_3u1.c
new file mode 100644
index 000000000000..c9f23da1b19b
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinpi_3u1.c
@@ -0,0 +1,57 @@
+/*
+ * Double-precision SVE sinpi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+  double poly[10];
+} data = {
+  /* Polynomial coefficients generated using Remez algorithm,
+     see sinpi.sollya for details.  */
+  .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1,
+	    -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+	    0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
+	    0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+};
+
+/* A fast SVE implementation of sinpi.
+   Maximum error 3.10 ULP:
+   _ZGVsMxv_sinpi(0x1.df1a14f1b235p-2) got 0x1.fd64f541606cp-1
+				      want 0x1.fd64f541606c3p-1.  */
+svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Range reduction into [-1/2, 1/2]
+     with n = rint(x) and r = x - n.  */
+  svfloat64_t n = svrinta_x (pg, x);
+  svfloat64_t r = svsub_x (pg, x, n);
+
+  /* Result should be negated based on if n is odd or not.  */
+  svuint64_t intn = svreinterpret_u64 (svcvt_s64_x (pg, n));
+  svuint64_t sign = svlsl_z (pg, intn, 63);
+
+  /* y = sin(r).  */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+  svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
+
+PL_SIG (SV, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_D1 (sinpi), 2.61)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinpif_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_sinpif_2u5.c
new file mode 100644
index 000000000000..ac3f924bed68
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_sinpif_2u5.c
@@ -0,0 +1,53 @@
+/*
+ * Single-precision SVE sinpi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+  float poly[6];
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+	    0x1.50783p-4f, -0x1.e30750p-8f },
+};
+
+/* A fast SVE implementation of sinpif.
+   Maximum error 2.48 ULP:
+   _ZGVsMxv_sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1
+				 want 0x1.fa8c02p-1.  */
+svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Range reduction into [-1/2, 1/2]
+     with n = rint(x) and r = x - n.  */
+  svfloat32_t n = svrinta_x (pg, x);
+  svfloat32_t r = svsub_x (pg, x, n);
+
+  /* Result should be negated based on if n is odd or not.  */
+  svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+  svuint32_t sign = svlsl_z (pg, intn, 31);
+
+  /* y = sin(r).
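+     For x = n + r, sin(pi * x) = (-1)^n * sin(pi * r), so it is enough to
+     approximate sin(pi * r) on [-1/2, 1/2].  The coefficients above are the
+     odd Taylor terms of sin(pi * r) = r * (pi - (pi^3/3!) r^2
+     + (pi^5/5!) r^4 - ...), evaluated in r2 and scaled back by r; the
+     (-1)^n factor is applied afterwards via the sign XOR.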
*/
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_F1 (sinpi), 1.99)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/sv_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_tan_3u5.c
new file mode 100644
index 000000000000..746396e98a10
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/sv_tan_3u5.c
@@ -0,0 +1,99 @@
+/*
+ * Double-precision SVE tan(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  double poly[9];
+  double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
+} data = {
+  /* Polynomial generated with FPMinimax.  */
+  .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
+	    0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
+	    0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
+	    0x1.4e4fd14147622p-12, },
+  .half_pi_hi = 0x1.921fb54442d18p0,
+  .half_pi_lo = 0x1.1a62633145c07p-54,
+  .inv_half_pi = 0x1.45f306dc9c883p-1,
+  .range_val = 0x1p23,
+  .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (tan, x, y, special);
+}
+
+/* Vector approximation for double-precision tan.
+   Maximum measured error is 3.48 ULP:
+   _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+				      want -0x1.f6ccd8ecf7deap+37.  */
+svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  /* Invert condition to catch NaNs and Infs as well as large values.  */
+  svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
+
+  /* q = nearest integer to 2 * x / pi.  */
+  svfloat64_t shift = sv_f64 (dat->shift);
+  svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
+  q = svsub_x (pg, q, shift);
+  svint64_t qi = svcvt_s64_x (pg, q);
+
+  /* Use q to reduce x to r in [-pi/4, pi/4], by:
+     r = x - q * pi/2, in extended precision.  */
+  svfloat64_t r = x;
+  svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi);
+  r = svmls_lane (r, q, half_pi, 0);
+  r = svmls_lane (r, q, half_pi, 1);
+  /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+     formula.  */
+  r = svmul_x (pg, r, 0.5);
+
+  /* Approximate tan(r) using order 8 polynomial.
+     tan(x) is odd, so polynomial has the form:
+     tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+     Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+     Then compute the approximation by:
+     tan(r) ~= r + r^3 * (C0 + r^2 * P(r)).  */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+  svfloat64_t r8 = svmul_x (pg, r4, r4);
+  /* Use the coefficient array offset by 1 to evaluate from C1 onwards.
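+     Estrin on dat->poly + 1 gives P(r2) = C1 + C2 r2 + ... + C8 r2^7, the
+     svmad then forms C0 + r2 * P(r2), and the final svmla assembles
+     r + r^3 * (C0 + r2 * P(r2)) as in the expansion above.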
*/ + svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); + p = svmad_x (pg, p, r2, dat->poly[0]); + p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ + svbool_t use_recip + = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (pg, p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + if (unlikely (svptest_any (pg, special))) + return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); + return svdiv_x (pg, n, d); +} + +PL_SIG (SV, D, 1, tan, -3.1, 3.1) +PL_TEST_ULP (SV_NAME_D1 (tan), 2.99) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c index cca43bd886fd..6b8cd1e64b44 100644 --- a/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c @@ -1,112 +1,119 @@ /* * Single-precision vector tan(x) function. * * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" #include "pl_sig.h" #include "pl_test.h" -#if SV_SUPPORTED - -/* Constants. */ -#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) -#define InvPio2 (sv_f32 (0x1.45f306p-1f)) -#define RangeVal (sv_f32 (0x1p15f)) -#define Shift (sv_f32 (0x1.8p+23f)) - -#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i]) - -/* Use full Estrin's scheme to evaluate polynomial. */ -static inline sv_f32_t -eval_poly (svbool_t pg, sv_f32_t z) +static const struct data { - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t z4 = svmul_f32_x (pg, z2, z2); - sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0)); - sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2)); - sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4)); - sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10); - sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10); - return y; -} - -static NOINLINE sv_f32_t -__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) + float pio2_1, pio2_2, pio2_3, invpio2; + float c1, c3, c5; + float c0, c2, c4, range_val, shift; +} data = { + /* Coefficients generated using: + poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), + deg, + [|single ...|], + [a*a;b*b]); + optimize relative error + final prec : 23 bits + deg : 5 + a : 0x1p-126 ^ 2 + b : ((pi) / 0x1p2) ^ 2 + dirty rel error: 0x1.f7c2e4p-25 + dirty abs error: 0x1.f7c2ecp-25. */ + .c0 = 0x1.55555p-2, .c1 = 0x1.11166p-3, + .c2 = 0x1.b88a78p-5, .c3 = 0x1.7b5756p-6, + .c4 = 0x1.4ef4cep-8, .c5 = 0x1.0e1e74p-7, + + .pio2_1 = 0x1.921fb6p+0f, .pio2_2 = -0x1.777a5cp-25f, + .pio2_3 = -0x1.ee59dap-50f, .invpio2 = 0x1.45f306p-1f, + .range_val = 0x1p15f, .shift = 0x1.8p+23f +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) { return sv_call_f32 (tanf, x, y, cmp); } /* Fast implementation of SVE tanf. Maximum error is 3.45 ULP: - __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 - want 0x1.ff9850p-1. 
*/ -sv_f32_t -__sv_tanf_x (sv_f32_t x, const svbool_t pg) + SV_NAME_F1 (tan)(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* Determine whether input is too large to perform fast regression. */ - svbool_t cmp = svacge_f32 (pg, x, RangeVal); - svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0)); + svbool_t cmp = svacge (pg, x, d->range_val); + + svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); /* n = rint(x/(pi/2)). */ - sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift); - sv_f32_t n = svsub_f32_x (pg, q, Shift); + svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); + svfloat32_t n = svsub_x (pg, q, d->shift); /* n is already a signed integer, simply convert it. */ - sv_s32_t in = sv_to_s32_f32_x (pg, n); + svint32_t in = svcvt_s32_x (pg, n); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ - sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1)); - svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0)); + svint32_t alt = svand_x (pg, in, 1); + svbool_t pred_alt = svcmpne (pg, alt, 0); /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ - sv_f32_t r; - r = sv_fma_f32_x (pg, NegPio2_1, n, x); - r = sv_fma_f32_x (pg, NegPio2_2, n, r); - r = sv_fma_f32_x (pg, NegPio2_3, n, r); + svfloat32_t r; + r = svmls_lane (x, n, pi_vals, 0); + r = svmls_lane (r, n, pi_vals, 1); + r = svmls_lane (r, n, pi_vals, 2); /* If x lives in an interval, where |tan(x)| - is finite, then use a polynomial approximation of the form tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). - grows to infinity then use symmetries of tangent and the identity tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use the same polynomial approximation of tan as above. */ /* Perform additional reduction if required. */ - sv_f32_t z = svneg_f32_m (r, pred_alt, r); + svfloat32_t z = svneg_m (r, pred_alt, r); - /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ - sv_f32_t z2 = svmul_f32_x (pg, z, z); - sv_f32_t p = eval_poly (pg, z2); - sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], + using Estrin on z^2. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); - /* Transform result back, if necessary. */ - sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); - y = svsel_f32 (pred_alt, inv_y, y); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t p = svmla_x (pg, p01, z4, p23); + + svfloat32_t z8 = svmul_x (pg, z4, z4); + p = svmla_x (pg, p, z8, p45); + + svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); - /* Fast reduction does not handle the x = -0.0 case well, - therefore it is fixed here. */ - y = svsel_f32 (pred_minuszero, x, y); + /* Transform result back, if necessary. */ + svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); /* No need to pass pg to specialcase here since cmp is a strict subset, guaranteed by the cmpge above. 
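    (svacge is governed by pg, so lanes that are inactive in pg are always
    inactive in cmp; sv_call_f32 therefore only replaces the lanes that are
    both active and out of range.)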
*/ if (unlikely (svptest_any (pg, cmp))) - return __sv_tanf_specialcase (x, y, cmp); - return y; -} + return special_case (x, svsel (pred_alt, inv_y, y), cmp); -PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) + return svsel (pred_alt, inv_y, y); +} PL_SIG (SV, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (__sv_tanf, 2.96) -PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) -PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) -PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) -PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) -#endif +PL_TEST_ULP (SV_NAME_F1 (tan), 2.96) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000) +PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/sv_tanh_3u.c new file mode 100644 index 000000000000..f54139f1ddbc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_tanh_3u.c @@ -0,0 +1,96 @@ +/* + * Double-precision SVE tanh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "poly_sve_f64.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64_t poly[11]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64_t thresh, tiny_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, + + .inv_ln2 = 0x1.71547652b82fep0, + .ln2_hi = -0x1.62e42fefa39efp-1, + .ln2_lo = -0x1.abc9e3b39803fp-56, + .shift = 0x1.8p52, + + .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ + .thresh = 0x01f241bf835f9d5f, +}; + +static inline svfloat64_t +expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +{ + /* Helper routine for calculating exp(x) - 1. Vector port of the helper from + the scalar variant of tanh. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + svfloat64_t j + = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); + svint64_t i = svcvt_s64_x (pg, j); + svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi); + f = svmla_x (pg, f, j, d->ln2_lo); + + /* Approximate expm1(f) using polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t f4 = svmul_x (pg, f2, f2); + svfloat64_t p = svmla_x ( + pg, f, f2, + sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly)); + + /* t = 2 ^ i. */ + svfloat64_t t = svscale_x (pg, sv_f64 (1), i); + /* expm1(x) = p * t + (t - 1). 
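+     This follows from exp(x) = 2^i * exp(f):
+     exp(x) - 1 = 2^i * (1 + expm1(f)) - 1 = t * p + (t - 1),
+     with t = 2^i and p ~= expm1(f).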
*/ + return svmla_x (pg, svsub_x (pg, t, 1), p, t); +} + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (tanh, x, y, special); +} + +/* SVE approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.77 ULP: + _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ +svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x)); + + /* Trigger special-cases for tiny, boring and infinity/NaN. */ + svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); + + svfloat64_t u = svadd_x (pg, x, x); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + svfloat64_t q = expm1_inline (u, pg, d); + svfloat64_t qp2 = svadd_x (pg, q, 2); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svdiv_x (pg, q, qp2), special); + return svdiv_x (pg, q, qp2); +} + +PL_SIG (SV, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (SV_NAME_D1 (tanh), 2.27) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/sv_tanhf_2u6.c new file mode 100644 index 000000000000..988a56de0b2e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_tanhf_2u6.c @@ -0,0 +1,59 @@ +/* + * Single-precision SVE tanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#include "sv_expm1f_inline.h" + +static const struct data +{ + struct sv_expm1f_data expm1f_consts; + uint32_t boring_bound, onef; +} data = { + .expm1f_consts = SV_EXPM1F_DATA, + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ + .boring_bound = 0x41102cb3, + .onef = 0x3f800000, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision SVE tanh(x), using a simplified + version of expm1f. The maximum error is 2.57 ULP: + _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5 + want 0x1.fb71aap-5. */ +svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound); + svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); + + svbool_t special = svcmpgt (pg, iax, 0x7f800000); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
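+     With q = expm1(2x) we have e^2x = q + 1, so this simplifies to
+     q / (q + 2), which is how y is assembled below.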
*/
+  svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
+  svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svsel_f32 (is_boring, boring, y), special);
+  return svsel_f32 (is_boring, boring, y);
+}
+
+PL_SIG (SV, F, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (tanh), 2.07)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/contrib/arm-optimized-routines/pl/math/tanf_3u3.c b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c
index ec006dc04c4c..30c86fa89730 100644
--- a/contrib/arm-optimized-routines/pl/math/tanf_3u3.c
+++ b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c
@@ -1,202 +1,193 @@
 /*
  * Single-precision scalar tan(x) function.
  *
  * Copyright (c) 2021-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
 #include "pl_sig.h"
 #include "pl_test.h"
-#include "pairwise_hornerf.h"
+#include "poly_scalar_f32.h"
 
 /* Useful constants.  */
 #define NegPio2_1 (-0x1.921fb6p+0f)
 #define NegPio2_2 (0x1.777a5cp-25f)
 #define NegPio2_3 (0x1.ee59dap-50f)
 /* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps.  */
 #define RangeVal (0x1p17f)
 #define InvPio2 ((0x1.45f306p-1f))
 #define Shift (0x1.8p+23f)
 #define AbsMask (0x7fffffff)
 #define Pio4 (0x1.921fb6p-1)
 /* 2PI * 2^-64.  */
 #define Pio2p63 (0x1.921FB54442D18p-62)
 
-#define P(i) __tanf_poly_data.poly_tan[i]
-#define Q(i) __tanf_poly_data.poly_cotan[i]
-
 static inline float
 eval_P (float z)
 {
-  return PAIRWISE_HORNER_5 (z, z * z, P);
+  return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan);
 }
 
 static inline float
 eval_Q (float z)
 {
-  return PAIRWISE_HORNER_3 (z, z * z, Q);
+  return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan);
 }
 
 /* Reduction of the input argument x using Cody-Waite approach, such that
    x = r + n * pi/2, where r lies in [-pi/4, pi/4] and n is a signed
   integer.  */
 static inline float
 reduce (float x, int32_t *in)
 {
   /* n = rint(x/(pi/2)).  */
   float r = x;
   float q = fmaf (InvPio2, r, Shift);
   float n = q - Shift;
   /* There is no rounding here, n is representable by a signed integer.  */
   *in = (int32_t) n;
   /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4).  */
   r = fmaf (NegPio2_1, n, r);
   r = fmaf (NegPio2_2, n, r);
   r = fmaf (NegPio2_3, n, r);
   return r;
 }
 
 /* Table with 4/PI to 192 bit precision.  To avoid unaligned accesses
   only 8 new bits are added per entry, making the table 4 times larger.  */
 static const uint32_t __inv_pio4[24]
   = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44,
     0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1,
     0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62,
     0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041};
 
 /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
   XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
   Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
   Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
   multiply computes the exact 2.62-bit fixed-point modulo.  Since the result
   can have at most 29 leading zeros after the binary point, the double
   precision result is accurate to 33 bits.
*/ static inline double reduce_large (uint32_t xi, int *np) { const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; int shift = (xi >> 23) & 7; uint64_t n, res0, res1, res2; xi = (xi & 0xffffff) | 0x800000; xi <<= shift; res0 = xi * arr[0]; res1 = (uint64_t) xi * arr[4]; res2 = (uint64_t) xi * arr[8]; res0 = (res2 >> 32) | (res0 << 32); res0 += res1; n = (res0 + (1ULL << 61)) >> 62; res0 -= n << 62; double x = (int64_t) res0; *np = n; return x * Pio2p63; } /* Top 12 bits of the float representation with the sign bit cleared. */ static inline uint32_t top12 (float x) { return (asuint (x) >> 20); } /* Fast single-precision tan implementation. Maximum ULP error: 3.293ulps. tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ float tanf (float x) { /* Get top words. */ uint32_t ix = asuint (x); uint32_t ia = ix & AbsMask; uint32_t ia12 = ia >> 20; /* Dispatch between no reduction (small numbers), fast reduction and slow large numbers reduction. The reduction step determines r float (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ int32_t n; float r; if (ia12 < top12 (Pio4)) { /* Optimize small values. */ if (unlikely (ia12 < top12 (0x1p-12f))) { if (unlikely (ia12 < top12 (0x1p-126f))) /* Force underflow for tiny x. */ force_eval_float (x * x); return x; } /* tan (x) ~= x + x^3 * P(x^2). */ float x2 = x * x; float y = eval_P (x2); return fmaf (x2, x * y, x); } /* Similar to other trigonometric routines, fast inaccurate reduction is performed for values of x from pi/4 up to RangeVal. In order to keep errors below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for other trigonometric routines. Above this value more advanced but slower reduction techniques need to be implemented to reach a similar accuracy. */ else if (ia12 < top12 (RangeVal)) { /* Fast inaccurate reduction. */ r = reduce (x, &n); } else if (ia12 < 0x7f8) { /* Slow accurate reduction. */ uint32_t sign = ix & ~AbsMask; double dar = reduce_large (ia, &n); float ar = (float) dar; r = asfloat (asuint (ar) ^ sign); } else { /* tan(Inf or NaN) is NaN. */ return __math_invalidf (x); } /* If x lives in an interval where |tan(x)| - is finite then use an approximation of tangent in the form tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). - grows to infinity then use an approximation of cotangent in the form cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), we only need to change the sign of r to obtain tan(x) from cotan(r). This 2-interval approach requires 2 different sets of coefficients P and Q, where Q is a lower order polynomial than P. */ /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ uint32_t alt = (uint32_t) n & 1; /* Perform additional reduction if required. */ float z = alt ? -r : r; /* Prepare backward transformation. */ float z2 = r * r; float offset = alt ? 1.0f / z : z; float scale = alt ? z : z * z2; /* Evaluate polynomial approximation of tan or cotan. */ float p = alt ? eval_Q (z2) : eval_P (z2); /* A unified way of assembling the result on both interval types. 
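   For the tan interval offset = z and scale = z * z^2, giving
   z + z^3 * P(z^2); for the cotan interval offset = 1/z and scale = z,
   giving 1/z + z * Q(z^2).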
*/ return fmaf (scale, p, offset); } PL_SIG (S, F, 1, tan, -3.1, 3.1) PL_TEST_ULP (tanf, 2.80) PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) -PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000) -PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000) -PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000) -PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000) -PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000) -PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000) -PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000) -PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) -PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000) -PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000) -PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000) +PL_TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +PL_TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/tanh_3u.c b/contrib/arm-optimized-routines/pl/math/tanh_3u.c index 46d9fb3fd7e1..86f2904afc32 100644 --- a/contrib/arm-optimized-routines/pl/math/tanh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/tanh_3u.c @@ -1,82 +1,78 @@ /* * Double-precision tanh(x) function. * * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "estrin.h" +#include "poly_scalar_f64.h" #include "pl_sig.h" #include "pl_test.h" #define AbsMask 0x7fffffffffffffff #define InvLn2 0x1.71547652b82fep0 #define Ln2hi 0x1.62e42fefa39efp-1 #define Ln2lo 0x1.abc9e3b39803fp-56 #define Shift 0x1.8p52 -#define C(i) __expm1_poly[i] #define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ #define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ #define One 0x3ff0000000000000 static inline double expm1_inline (double x) { /* Helper routine for calculating exp(x) - 1. Copied from expm1_2u5.c, with several simplifications: - No special-case handling for tiny or special values. - Simpler combination of p and t in final stage of the algorithm. - Use shift-and-add instead of ldexp to calculate t. */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ double j = fma (InvLn2, x, Shift) - Shift; int64_t i = j; double f = fma (j, -Ln2hi, x); f = fma (j, -Ln2lo, f); /* Approximate expm1(f) using polynomial. */ double f2 = f * f; double f4 = f2 * f2; - double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f); /* t = 2 ^ i. */ double t = asdouble ((uint64_t) (i + 1023) << 52); /* expm1(x) = p * t + (t - 1). */ return fma (p, t, t - 1); } /* Approximation for double-precision tanh(x), using a simplified version of - expm1. The greatest observed error is 2.75 ULP: - tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 - want -0x1.ba31ba4691ab4p-3. */ + expm1. The greatest observed error is 2.77 ULP: + tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ double tanh (double x) { uint64_t ix = asuint64 (x); uint64_t ia = ix & AbsMask; uint64_t sign = ix & ~AbsMask; if (unlikely (ia > BoringBound)) { if (ia > 0x7ff0000000000000) return __math_invalid (x); return asdouble (One | sign); } if (unlikely (ia < TinyBound)) return x; /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ double q = expm1_inline (2 * x); return q / (q + 2); } PL_SIG (S, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (tanh, 2.26) -PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000) -PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000) -PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000) -PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000) -PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000) -PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000) +PL_TEST_ULP (tanh, 2.27) +PL_TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000) +PL_TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000) +PL_TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c index 76e54a438e57..93ea3cf5d865 100644 --- a/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c +++ b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c @@ -1,91 +1,88 @@ /* * Single-precision tanh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "pl_sig.h" #include "pl_test.h" #define BoringBound \ 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ negative). */ #define AbsMask 0x7fffffff #define One 0x3f800000 #define Shift (0x1.8p23f) #define InvLn2 (0x1.715476p+0f) #define Ln2hi (0x1.62e4p-1f) #define Ln2lo (0x1.7f7d1cp-20f) #define C(i) __expm1f_poly[i] static inline float expm1f_inline (float x) { /* Helper routine for calculating exp(x) - 1. Copied from expm1f_1u6.c, with several simplifications: - No special-case handling for tiny or special values, instead return early from the main routine. - No special handling for large values: - No early return for infinity. - Simpler combination of p and t in final stage of algorithm. - |i| < 27, so can calculate t by simpler shift-and-add, instead of ldexpf (same as vector algorithm). */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ float j = fmaf (InvLn2, x, Shift) - Shift; int32_t i = j; float f = fmaf (j, -Ln2hi, x); f = fmaf (j, -Ln2lo, f); /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). Uses Estrin scheme, where the main expm1f routine uses Horner. */ float f2 = f * f; float p_01 = fmaf (f, C (1), C (0)); float p_23 = fmaf (f, C (3), C (2)); float p = fmaf (f2, p_23, p_01); p = fmaf (f2 * f2, C (4), p); p = fmaf (f2, p, f); /* t = 2^i. */ float t = asfloat ((uint32_t) (i + 127) << 23); /* expm1(x) ~= p * t + (t - 1). */ return fmaf (p, t, t - 1); } /* Approximation for single-precision tanh(x), using a simplified version of expm1f. The maximum error is 2.58 ULP: tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 want 0x1.f9ba08p-5. */ float tanhf (float x) { uint32_t ix = asuint (x); uint32_t iax = ix & AbsMask; uint32_t sign = ix & ~AbsMask; if (unlikely (iax > BoringBound)) { if (iax > 0x7f800000) return __math_invalidf (x); return asfloat (One | sign); } if (unlikely (iax < 0x34000000)) return x; /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/
   float q = expm1f_inline (2 * x);
   return q / (q + 2);
 }
 
 PL_SIG (S, F, 1, tanh, -10.0, 10.0)
 PL_TEST_ULP (tanhf, 2.09)
-PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000)
-PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000)
-PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
-PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100)
+PL_TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h
index e0f6ac70912c..f2710a979d40 100644
--- a/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h
+++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h
@@ -1,86 +1,87 @@
 // clang-format off
 /*
  * Function entries for mathbench.
  *
  * Copyright (c) 2022-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define _ZSF1(fun, a, b) F(fun##f, a, b)
 #define _ZSD1(f, a, b) D(f, a, b)
 
-#ifdef __vpcs
+#if defined(__vpcs) && __aarch64__
 
-#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
-#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b)
-
-#elif __aarch64__
-
-#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b)
-#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b)
-
-#elif WANT_VMATH
-
-#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b)
-#define _ZVD1(f, a, b) D(__s_##f, a, b)
+#define _ZVF1(fun, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
+#define _ZVD1(f, a, b) VND(_ZGVnN2v_##f, a, b)
 
 #else
 
 #define _ZVF1(f, a, b)
 #define _ZVD1(f, a, b)
 
 #endif
 
 #if WANT_SVE_MATH
 
-#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
-#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b)
+#define _ZSVF1(fun, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
+#define _ZSVD1(f, a, b) SVD(_ZGVsMxv_##f, a, b)
 
 #else
 
 #define _ZSVF1(f, a, b)
 #define _ZSVD1(f, a, b)
 
 #endif
 
 /* No auto-generated wrappers for binary functions - they have to be
   manually defined in mathbench_wrappers.h.  We have to define silent
  macros for them anyway as they will be emitted by PL_SIG.  */
 #define _ZSF2(...)
 #define _ZSD2(...)
 #define _ZVF2(...)
 #define _ZVD2(...)
 #define _ZSVF2(...)
 #define _ZSVD2(...)
 
 #include "mathbench_funcs_gen.h"
 
 /* PL_SIG only emits entries for unary functions, since if a function
   needs to be wrapped in mathbench there is no way for it to know the
  name of the wrapper.  Add entries for binary functions, or any other
  exotic signatures that need wrapping, below.
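  Each entry gives the routine name, the precision ('f' or 'd'), the
  variant ('n' for AdvSIMD, 's' for SVE, 0 for scalar), the interval to
  sample and the wrapper to call; e.g. the atan2f entry below benchmarks
  atan2f (5.0f, x) for x in [-10, 10] via atan2f_wrap.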
*/ {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, {"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, {"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, -{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, -{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, -{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, -{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, -{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, {"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, -{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, {"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, +{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}}, +{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}}, +{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}}, +{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}}, +{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}}, +{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}}, +{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}}, +{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}}, +{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}}, #if WANT_SVE_MATH -{"__sv_atan2f_x", 'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, {"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, -{"__sv_atan2_x", 'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, -{"_ZGVsM2vv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, -{"__sv_powif_x", 'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}}, +{"_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, +{"_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_hypotf_wrap}}, +{"_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_hypot_wrap}}, {"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, -{"__sv_powi_x", 'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}}, {"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, +{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = xy_Z_sv_powf}}, +{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}}, +{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}}, +{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}}, +{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}}, +{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}}, +{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}}, +{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}}, +{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}}, +{"_ZGVsMxv_cexpi", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}}, #endif - // clang-format on + // clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h index eba960eb96ac..fe7f8963cdee 100644 --- a/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h @@ -1,133 +1,206 @@ /* * Function wrappers for mathbench. * * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ static double atan2_wrap (double x) { return atan2 (5.0, x); } static float atan2f_wrap (float x) { return atan2f (5.0f, x); } static double powi_wrap (double x) { return __builtin_powi (x, (int) round (x)); } -#if WANT_VMATH -#if __aarch64__ +#if __aarch64__ && defined(__vpcs) -static double -__s_atan2_wrap (double x) +__vpcs static v_double +_Z_atan2_wrap (v_double x) { - return __s_atan2 (5.0, x); + return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); } -static float -__s_atan2f_wrap (float x) +__vpcs static v_float +_Z_atan2f_wrap (v_float x) { - return __s_atan2f (5.0f, x); + return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); } -static v_double -__v_atan2_wrap (v_double x) +__vpcs static v_float +_Z_hypotf_wrap (v_float x) { - return __v_atan2 (v_double_dup (5.0), x); + return _ZGVnN4vv_hypotf (v_float_dup (5.0f), x); } -static v_float -__v_atan2f_wrap (v_float x) +__vpcs static v_double +_Z_hypot_wrap (v_double x) { - return __v_atan2f (v_float_dup (5.0f), x); + return _ZGVnN2vv_hypot (v_double_dup (5.0), x); } -#ifdef __vpcs - __vpcs static v_double -__vn_atan2_wrap (v_double x) +xy_Z_pow (v_double x) { - return __vn_atan2 (v_double_dup (5.0), x); + return _ZGVnN2vv_pow (x, x); } -__vpcs static v_float -__vn_atan2f_wrap (v_float x) +__vpcs static v_double +x_Z_pow (v_double x) { - return __vn_atan2f (v_float_dup (5.0f), x); + return _ZGVnN2vv_pow (x, v_double_dup (23.4)); } __vpcs static v_double -_Z_atan2_wrap (v_double x) +y_Z_pow (v_double x) { - return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); + return _ZGVnN2vv_pow (v_double_dup (2.34), x); } __vpcs static v_float -_Z_atan2f_wrap (v_float x) +_Z_sincosf_wrap (v_float x) { - return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); + v_float s, c; + _ZGVnN4vl4l4_sincosf (x, &s, &c); + return s + c; } -#endif // __vpcs -#endif // __arch64__ -#endif // WANT_VMATH +__vpcs static v_float +_Z_cexpif_wrap (v_float x) +{ + __f32x4x2_t sc = _ZGVnN4v_cexpif (x); + return sc.val[0] + sc.val[1]; +} -#if WANT_SVE_MATH +__vpcs static v_double +_Z_sincos_wrap (v_double x) +{ + v_double s, c; + _ZGVnN2vl8l8_sincos (x, &s, &c); + return s + c; +} -static sv_float -__sv_atan2f_wrap (sv_float x, sv_bool pg) +__vpcs static v_double +_Z_cexpi_wrap (v_double x) { - return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg); + __f64x2x2_t sc = _ZGVnN2v_cexpi (x); + return sc.val[0] + sc.val[1]; } +#endif // __arch64__ && __vpcs + +#if WANT_SVE_MATH + static sv_float _Z_sv_atan2f_wrap (sv_float x, sv_bool pg) { - return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg); + return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg); } static sv_double -__sv_atan2_wrap (sv_double x, sv_bool pg) +_Z_sv_atan2_wrap (sv_double x, sv_bool pg) { - return __sv_atan2_x (x, svdup_n_f64 (5.0), pg); + return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg); +} + +static sv_float +_Z_sv_hypotf_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0), pg); } static sv_double -_Z_sv_atan2_wrap (sv_double x, sv_bool pg) +_Z_sv_hypot_wrap (sv_double x, sv_bool pg) { - return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg); + return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg); } static sv_float _Z_sv_powi_wrap (sv_float x, sv_bool pg) { return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); } +static sv_double +_Z_sv_powk_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); +} + +static sv_float +xy_Z_sv_powf (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_powf (x, x, pg); +} + static sv_float 
-__sv_powif_wrap (sv_float x, sv_bool pg)
+x_Z_sv_powf (sv_float x, sv_bool pg)
 {
-  return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg);
+  return _ZGVsMxvv_powf (x, svdup_f32 (23.4f), pg);
+}
+
+static sv_float
+y_Z_sv_powf (sv_float x, sv_bool pg)
+{
+  return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg);
 }
 
 static sv_double
-_Z_sv_powk_wrap (sv_double x, sv_bool pg)
+xy_Z_sv_pow (sv_double x, sv_bool pg)
 {
-  return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg);
+  return _ZGVsMxvv_pow (x, x, pg);
+}
+
+static sv_double
+x_Z_sv_pow (sv_double x, sv_bool pg)
+{
+  return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg);
+}
+
+static sv_double
+y_Z_sv_pow (sv_double x, sv_bool pg)
+{
+  return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg);
+}
+
+static sv_float
+_Z_sv_sincosf_wrap (sv_float x, sv_bool pg)
+{
+  float s[svcntw ()], c[svcntw ()];
+  _ZGVsMxvl4l4_sincosf (x, s, c, pg);
+  /* Sum sine and cosine so both outputs feed the benchmarked result.  */
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static sv_float
+_Z_sv_cexpif_wrap (sv_float x, sv_bool pg)
+{
+  svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg);
+  return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
+}
+
+static sv_double
+_Z_sv_sincos_wrap (sv_double x, sv_bool pg)
+{
+  double s[svcntd ()], c[svcntd ()];
+  _ZGVsMxvl8l8_sincos (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
 }
 
 static sv_double
-__sv_powi_wrap (sv_double x, sv_bool pg)
+_Z_sv_cexpi_wrap (sv_double x, sv_bool pg)
 {
-  return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg);
+  svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg);
+  return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
 }
 
 #endif // WANT_SVE_MATH
diff --git a/contrib/arm-optimized-routines/pl/math/test/pl_test.h b/contrib/arm-optimized-routines/pl/math/test/pl_test.h
index 467d1cac0c36..e7ed4eed634e 100644
--- a/contrib/arm-optimized-routines/pl/math/test/pl_test.h
+++ b/contrib/arm-optimized-routines/pl/math/test/pl_test.h
@@ -1,33 +1,39 @@
 /*
  * PL macros for emitting various details about routines for consumption by
  * runulp.sh.
  *
  * Copyright (c) 2022-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
  */
 
 /* Emit the max ULP threshold, l, for routine f.  Piggy-back PL_TEST_EXPECT_FENV
   on PL_TEST_ULP to add EXPECT_FENV to all scalar routines.  */
-#if !(V_SUPPORTED || SV_SUPPORTED)
-#define PL_TEST_ULP(f, l) \
-  PL_TEST_EXPECT_FENV_ALWAYS (f) \
-  PL_TEST_ULP f l
+#if WANT_VMATH || defined(IGNORE_SCALAR_FENV)
+# define PL_TEST_ULP(f, l) PL_TEST_ULP f l
 #else
-#define PL_TEST_ULP(f, l) PL_TEST_ULP f l
+# define PL_TEST_ULP(f, l) \
+    PL_TEST_EXPECT_FENV_ALWAYS (f) \
+    PL_TEST_ULP f l
 #endif
 
-/* Emit aliases to allow test params to be mapped from aliases back to their
-   aliasees.  */
-#define PL_ALIAS(a, b) PL_TEST_ALIAS a b
-
 /* Emit routine name if e == 1 and f is expected to correctly trigger fenv
   exceptions.  e allows declaration to be emitted conditionally upon certain
  build flags - defer expansion by one pass to allow those flags to be
  expanded properly.
*/ #define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) #define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) #define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f #define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1) #define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n +#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) \ + PL_TEST_INTERVAL (f, lo, hi, n) \ + PL_TEST_INTERVAL (f, -lo, -hi, n) #define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c +#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) \ + PL_TEST_INTERVAL_C (f, lo, hi, n, c) \ + PL_TEST_INTERVAL_C (f, -lo, -hi, n, c) +// clang-format off +#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL f xlo,ylo xhi,yhi n +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/test/runulp.sh b/contrib/arm-optimized-routines/pl/math/test/runulp.sh index 4d02530d44b1..0f5a41f76b25 100755 --- a/contrib/arm-optimized-routines/pl/math/test/runulp.sh +++ b/contrib/arm-optimized-routines/pl/math/test/runulp.sh @@ -1,78 +1,78 @@ #!/bin/bash # ULP error check script. # # Copyright (c) 2019-2023, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu # cd to bin directory. cd "${0%/*}" flags="${ULPFLAGS:--q}" emu="$@" # Enable SVE testing WANT_SVE_MATH=${WANT_SVE_MATH:-0} FAIL=0 PASS=0 t() { - key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}') - L=$(cat $LIMITS | grep "^$key " | awk '{print $2}') + routine=$1 + L=$(cat $LIMITS | grep "^$routine " | awk '{print $2}') [[ $L =~ ^[0-9]+\.[0-9]+$ ]] - extra_flags="" + extra_flags= [[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5" - grep -q "^$key$" $FENV || extra_flags="$extra_flags -f" - $emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) + grep -q "^$routine$" $FENV || extra_flags="$extra_flags -f" + IFS=',' read -ra LO <<< "$2" + IFS=',' read -ra HI <<< "$3" + ITV="${LO[0]} ${HI[0]}" + for i in "${!LO[@]}"; do + [[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}" + done + # Add -z flag to ignore zero sign for vector routines + { echo $routine | grep -q "ZGV"; } && extra_flags="$extra_flags -z" + $emu ./ulp -e $L $flags ${extra_flags} $routine $ITV $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { $emu ./ulp -f -q "$@" #>/dev/null } -# Regression-test for correct NaN handling in atan2 -check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 -check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan -check atan2 nan nan x -nan -nan +if [ "$FUNC" == "atan2" ] || [ -z "$FUNC" ]; then + # Regression-test for correct NaN handling in atan2 + check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 + check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan + check atan2 nan nan x -nan -nan +fi # vector functions flags="${ULPFLAGS:--q}" -runs= -check __s_log10f 1 && runs=1 -runv= -check __v_log10f 1 && runv=1 -runvn= -check __vn_log10f 1 && runvn=1 runsv= if [ $WANT_SVE_MATH -eq 1 ]; then -check __sv_cosf 0 && runsv=1 -check __sv_cos 0 && runsv=1 -check __sv_sinf 0 && runsv=1 -check __sv_sin 0 && runsv=1 # No guarantees about powi accuracy, so regression-test for exactness # w.r.t. 
the custom reference impl in ulp_wrappers.h -check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi 0 inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi -0 -inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000 && runsv=1 fi while read F LO HI N C do t $F $LO $HI $N $C done << EOF -$(cat $INTERVALS) +$(cat $INTERVALS | grep "\b$FUNC\b") EOF [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" exit 1 } diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acos.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acos.tst new file mode 100644 index 000000000000..a73dcd25965b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acos.tst @@ -0,0 +1,17 @@ +; acos.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acos op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=acos op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=acos op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acos op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acos op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acos op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acos op1=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=acos op1=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=acos op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=acos op1=bff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=acos op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=acos op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosf.tst new file mode 100644 index 000000000000..9e453e3bff5e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosf.tst @@ -0,0 +1,21 @@ +; acosf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acosf op1=7fc00001 result=7fc00001 errno=0 +func=acosf op1=ffc00001 result=7fc00001 errno=0 +func=acosf op1=7f800001 result=7fc00001 errno=0 status=i +func=acosf op1=ff800001 result=7fc00001 errno=0 status=i +func=acosf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=acosf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=acosf op1=00000000 result=3fc90fda.a22 errno=0 +func=acosf op1=80000000 result=3fc90fda.a22 errno=0 +func=acosf op1=3f800000 result=00000000 errno=0 +func=acosf op1=bf800000 result=40490fda.a22 errno=0 +func=acosf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=acosf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=acosf op1=33000000 result=3fc90fda.622 error=0 +func=acosf op1=30000000 result=3fc90fda.a12 error=0 +func=acosf op1=2d000000 result=3fc90fda.a21 error=0 +func=acosf op1=2a000000 result=3fc90fda.a22 error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asin.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asin.tst new file mode 100644 index 000000000000..6180d7849d90 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asin.tst @@ -0,0 +1,24 @@ +; asin.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asin op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=asin op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=asin op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asin op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asin op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=asin op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=asin op1=00000000.00000000 result=00000000.00000000 errno=0 +func=asin op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=asin op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=asin op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux + +func=asin op1=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=asin op1=bff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=asin op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=asin op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinf.tst new file mode 100644 index 000000000000..a85b2593768d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinf.tst @@ -0,0 +1,24 @@ +; asinf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
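; Annotation, not part of the upstream file: near zero, asin(x) = x +
; x^3/6 + ..., so for |x| <= 0x1p-12 the correction term is below
; x * 2^-26, well under half an ulp of single precision (about x * 2^-24);
; returning x is then correctly rounded, and only the underflow/inexact
; flags can vary between implementations, which is why the denormal
; inputs below use maybestatus=ux rather than status.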
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinf op1=7fc00001 result=7fc00001 errno=0 +func=asinf op1=ffc00001 result=7fc00001 errno=0 +func=asinf op1=7f800001 result=7fc00001 errno=0 status=i +func=asinf op1=ff800001 result=7fc00001 errno=0 status=i +func=asinf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=asinf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=asinf op1=00000000 result=00000000 errno=0 +func=asinf op1=80000000 result=80000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=asinf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=asinf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=asinf op1=3f800000 result=3fc90fda.a22 errno=0 +func=asinf op1=bf800000 result=bfc90fda.a22 errno=0 +func=asinf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=asinf op1=bf800001 result=7fc00001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h index 5e3133e1db4c..4929b481ffe1 100644 --- a/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h @@ -1,66 +1,70 @@ /* * Function entries for ulp. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifdef __vpcs +#if defined(__vpcs) && __aarch64__ -#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f) -#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f) -#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f) -#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f) - -#elif __aarch64 - -#define _ZVF1(f) SF1 (f) VF1 (f) -#define _ZVD1(f) SD1 (f) VD1 (f) -#define _ZVF2(f) SF2 (f) VF2 (f) -#define _ZVD2(f) SD2 (f) VD2 (f) - -#elif WANT_VMATH - -#define _ZVF1(f) SF1 (f) -#define _ZVD1(f) SD1 (f) -#define _ZVF2(f) SF2 (f) -#define _ZVD2(f) SD2 (f) +#define _ZVF1(f) ZVF1 (f) +#define _ZVD1(f) ZVD1 (f) +#define _ZVF2(f) ZVF2 (f) +#define _ZVD2(f) ZVD2 (f) #else #define _ZVF1(f) #define _ZVD1(f) #define _ZVF2(f) #define _ZVD2(f) #endif #if WANT_SVE_MATH -#define _ZSVF1(f) SVF1 (f) ZSVF1 (f) -#define _ZSVF2(f) SVF2 (f) ZSVF2 (f) -#define _ZSVD1(f) SVD1 (f) ZSVD1 (f) -#define _ZSVD2(f) SVD2 (f) ZSVD2 (f) +#define _ZSVF1(f) ZSVF1 (f) +#define _ZSVF2(f) ZSVF2 (f) +#define _ZSVD1(f) ZSVD1 (f) +#define _ZSVD2(f) ZSVD2 (f) #else #define _ZSVF1(f) #define _ZSVF2(f) #define _ZSVD1(f) #define _ZSVD2(f) #endif #define _ZSF1(f) F1 (f) #define _ZSF2(f) F2 (f) #define _ZSD1(f) D1 (f) #define _ZSD2(f) D2 (f) #include "ulp_funcs_gen.h" +F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) +F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) + +F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) +F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) + #if WANT_SVE_MATH -F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) -F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) + +F 
(_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) +F (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) +F (_ZGVsMxv_cexpif_cos, sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) + +F (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) +F (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) +F (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) #endif diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h index b682e939054a..0f7b68949c7b 100644 --- a/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h @@ -1,148 +1,140 @@ // clang-format off /* * Function wrappers for ulp. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include <stdbool.h> +#include <arm_neon.h> #if USE_MPFR static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y, x, r); return mpfr_sin(y, x, r); } static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y, x, r); return mpfr_cos(y, x, r); } static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) { mpfr_t y2; mpfr_init(y2); mpfr_trunc(y2, y); return mpfr_pow(ret, x, y2, rnd); } #endif /* Our implementations of powi/powk are too imprecise to verify against any established pow implementation. Instead we have the following simple implementation, against which it is enough to maintain bitwise reproducibility. Note the test framework expects the reference impl to be of higher precision than the function under test. For instance this means that the reference for double-precision powi will be passed a long double, so to check bitwise reproducibility we have to cast it back down to double. This is fine since a round-trip to higher precision and back down is correctly rounded. */ #define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ - static DBL_T NAME (DBL_T in_val, DBL_T y) \ + static DBL_T __attribute__((unused)) NAME (DBL_T in_val, DBL_T y) \ { \ INT_T n = (INT_T) round (y); \ FLT_T acc = 1.0; \ bool want_recip = n < 0; \ n = n < 0 ?
-n : n; \ \ for (FLT_T c = in_val; n; c *= c, n >>= 1) \ { \ if (n & 0x1) \ { \ acc *= c; \ } \ } \ if (want_recip) \ { \ acc = 1.0 / acc; \ } \ return acc; \ } DECL_POW_INT_REF(ref_powif, double, float, int) DECL_POW_INT_REF(ref_powi, long double, double, int) -#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } -#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } -#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; } -#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; } - -#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; } -#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; } -#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; } -#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; } - #define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; } #define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; } #define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } #define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } -#ifdef __vpcs - -#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) -#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) -#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func) -#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) - -#elif __aarch64__ - -#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) -#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) -#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) -#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) - -#elif WANT_VMATH +#if defined(__vpcs) && __aarch64__ -#define ZVNF1_WRAP(func) VF1_WRAP(func) -#define ZVNF2_WRAP(func) VF2_WRAP(func) -#define ZVND1_WRAP(func) VD1_WRAP(func) -#define ZVND2_WRAP(func) VD2_WRAP(func) +#define ZVNF1_WRAP(func) ZVF1_WRAP(func) +#define ZVNF2_WRAP(func) ZVF2_WRAP(func) +#define ZVND1_WRAP(func) ZVD1_WRAP(func) +#define ZVND2_WRAP(func) ZVD2_WRAP(func) #else #define ZVNF1_WRAP(func) #define ZVNF2_WRAP(func) #define ZVND1_WRAP(func) #define ZVND2_WRAP(func) #endif -#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } -#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); } -#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); } -#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); } - #define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); } #define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); } #define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } #define ZSVD2_WRAP(func) 
static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); } #if WANT_SVE_MATH -#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func) -#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func) -#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func) -#define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func) +#define ZSVNF1_WRAP(func) ZSVF1_WRAP(func) +#define ZSVNF2_WRAP(func) ZSVF2_WRAP(func) +#define ZSVND1_WRAP(func) ZSVD1_WRAP(func) +#define ZSVND2_WRAP(func) ZSVD2_WRAP(func) #else #define ZSVNF1_WRAP(func) #define ZSVNF2_WRAP(func) #define ZSVND1_WRAP(func) #define ZSVND2_WRAP(func) #endif /* No wrappers for scalar routines, but PL_SIG will emit them. */ #define ZSNF1_WRAP(func) #define ZSNF2_WRAP(func) #define ZSND1_WRAP(func) #define ZSND2_WRAP(func) #include "ulp_wrappers_gen.h" +float v_sincosf_sin(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return s[0]; } +float v_sincosf_cos(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return c[0]; } +float v_cexpif_sin(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[0][0]; } +float v_cexpif_cos(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[1][0]; } + +double v_sincos_sin(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return s[0]; } +double v_sincos_cos(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return c[0]; } +double v_cexpi_sin(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[0][0]; } +double v_cexpi_cos(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[1][0]; } + #if WANT_SVE_MATH -static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } -static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } -static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } -static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_s32((int)round(y)), svptrue_b32())); } +static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_s64((long)round(y)), svptrue_b64())); } + +float sv_sincosf_sin(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return s[0]; } +float sv_sincosf_cos(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return c[0]; } +float sv_cexpif_sin(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 0)); } +float sv_cexpif_cos(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 1)); } + +double sv_sincos_sin(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return s[0]; } +double sv_sincos_cos(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return c[0]; } +double sv_cexpi_sin(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 0)); } +double sv_cexpi_cos(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 1)); } + #endif // clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/tools/asin.sollya 
b/contrib/arm-optimized-routines/pl/math/tools/asin.sollya new file mode 100644 index 000000000000..8ef861d0898b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asin.sollya @@ -0,0 +1,29 @@ +// polynomial for approximating asin(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +f = asin(x); +dtype = double; + +prec=256; + +a = 0x1p-106; +b = 0.25; + +deg = 11; + +backward = proc(poly, d) { + return d + d ^ 3 * poly(d * d); +}; + +forward = proc(f, d) { + return (f(sqrt(d))-sqrt(d))/(d*sqrt(d)); +}; + +poly = fpminimax(forward(f, x), [|0,...,deg|], [|dtype ...|], [a;b], relative, floating); + +display = hexadecimal!; +print("rel error:", dirtyinfnorm(1-backward(poly, x)/f(x), [a;b])); +print("in [", a, b, "]"); +for i from 0 to deg do print(coeff(poly, i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/asinf.sollya b/contrib/arm-optimized-routines/pl/math/tools/asinf.sollya new file mode 100644 index 000000000000..5b627e546c73 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asinf.sollya @@ -0,0 +1,36 @@ +// polynomial for approximating asinf(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +f = asin(x); +dtype = single; + +a = 0x1p-24; +b = 0.25; + +deg = 4; + +backward = proc(poly, d) { + return d + d ^ 3 * poly(d * d); +}; + +forward = proc(f, d) { + return (f(sqrt(d))-sqrt(d))/(d*sqrt(d)); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x) / forward(f, x), deg - d, [a;b], x^d/forward(f, x), 1e-16); +}; + +poly = 0; +for i from 0 to deg do { + i; + p = roundcoefficients(approx(poly,i), [|dtype ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal!; +print("rel error:", accurateinfnorm(1-backward(poly, x)/f(x), [a;b], 30)); +print("in [", a, b, "]"); +for i from 0 to deg do print(coeff(poly, i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erf.sollya b/contrib/arm-optimized-routines/pl/math/tools/erf.sollya new file mode 100644 index 000000000000..b2fc559b511e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erf.sollya @@ -0,0 +1,25 @@ +// tables and constants for approximating erf(x). +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +display = hexadecimal; +prec=128; + +// Tables +print("{ i, r, erf(r), 2/sqrt(pi) * exp(-r^2)}"); +for i from 0 to 768 do { + r = i / 128; + t0 = double(erf(r)); + t1 = double(2/sqrt(pi) * exp(-r * r)); + print("{ " @ i @ ",\t" @ r @ ",\t" @ t0 @ ",\t" @ t1 @ " },"); +}; + +// Constants +double(1/3); +double(1/10); +double(2/15); +double(2/9); +double(2/45); +double(2/sqrt(pi)); + diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya index 8c40b4b5db6b..1e2791291ebb 100644 --- a/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya @@ -1,23 +1,51 @@ -// polynomial for approximating erfc(x)*exp(x*x) +// tables and constants for approximating erfc(x). // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2023, Arm Limited. 
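// Annotation, not part of the upstream script: each table row pairs
// erfc(r) with 2/sqrt(pi) * exp(-r^2) = |erfc'(r)| at r = i/128. With
// d = x - r, the implementation can then rebuild
// erfc(r + d) ~ erfc(r) - 2/sqrt(pi) * exp(-r^2) * (d - r*d^2 + ...),
// which is presumably what the P5/P6, Q and R series constants below
// support. Rows are pre-scaled by 2^128 so that tail entries stay
// nonzero well past the point where unscaled erfc(r) would underflow.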
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 12; // poly degree - -// interval bounds -a = 0x1.60dfc14636e2ap0; -b = 0x1.d413cccfe779ap0; +display = hexadecimal; +prec=128; -f = proc(y) { - t = y + a; - return erfc(t) * exp(t*t); +// Tables +print("{ i, r, erfc(r), 2/sqrt(pi) * exp(-r^2) }"); +for i from 0 to 3787 do { + r = 0.0 + i / 128; + t0 = double(erfc(r) * 2^128); + t1 = double(2/sqrt(pi) * exp(-r * r) * 2^128); + print("{ " @ t0 @ ",\t" @ t1 @ " },"); }; -poly = remez(f(x), deg, [0;b-a], 1, 1e-16); +// Constants +print("> 2/sqrt(pi)"); +double(2/sqrt(pi)); -display = hexadecimal; -print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); -print("in [",a,b,"]"); -print("coeffs:"); -for i from 0 to deg do round(coeff(poly,i), 52, RN); +print("> 1/3"); +double(1/3); + +print("> P5"); +double(2/15); +double(1/10); +double(2/9); +double(2/45); + +print("> P6"); +double(1/42); +double(1/7); +double(2/21); +double(4/315); + +print("> Q"); +double( 5.0 / 4.0); +double( 6.0 / 5.0); +double( 7.0 / 6.0); +double( 8.0 / 7.0); +double( 9.0 / 8.0); +double(10.0 / 9.0); + +print("> R"); +double(-2.0 * 4.0 / (5.0 * 6.0)); +double(-2.0 * 5.0 / (6.0 * 7.0)); +double(-2.0 * 6.0 / (7.0 * 8.0)); +double(-2.0 * 7.0 / (8.0 * 9.0)); +double(-2.0 * 8.0 / (9.0 * 10.0)); +double(-2.0 * 9.0 / (10.0 * 11.0)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya index 69c683647af7..1d7fc264d99d 100644 --- a/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya @@ -1,31 +1,22 @@ -// polynomial for approximating erfc(x)*exp(x*x) +// tables and constants for approximating erfcf(x). // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2023, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 15; // poly degree - -// interval bounds -a = 0x1.0p-26; -b = 2; - -f = proc(y) { - return erfc(y) * exp(y*y); -}; - -approx = proc(poly, d) { - return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); -}; +display = hexadecimal; +prec=128; -poly = 0; -for i from 0 to deg do { - p = roundcoefficients(approx(poly,i), [|D ...|]); - poly = poly + x^i*coeff(p,0); - print(i); +// Tables +print("{ i, r, erfc(r), 2/sqrt(pi) * exp(-r^2) }"); +for i from 0 to 644 do { + r = 0.0 + i / 64; + t0 = single(erfc(r) * 2^47); + t1 = single(2/sqrt(pi) * exp(-r * r) * 2^47); + print("{ " @ t0 @ ",\t" @ t1 @ " },"); }; -display = hexadecimal; -print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); -print("in [",a,b,"]"); -print("coeffs:"); -for i from 0 to deg do coeff(poly,i); +// Constants +single(1/3); +single(2/15); +single(1/10); +single(2/sqrt(pi)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erff.sollya b/contrib/arm-optimized-routines/pl/math/tools/erff.sollya new file mode 100644 index 000000000000..59b23ef021f0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erff.sollya @@ -0,0 +1,20 @@ +// tables and constants for approximating erff(x). +// +// Copyright (c) 2023, Arm Limited. 
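// Annotation, not part of the upstream script: same scheme as the
// erf/erfc scripts above, in single precision. The pair stored at
// r = i/128 is erf(r) and its derivative erf'(r) = 2/sqrt(pi) * exp(-r^2),
// covering [0, 4] (erf(4) already rounds to 1 in single precision). With
// d = x - r, erf(r + d) ~ erf(r) + 2/sqrt(pi) * exp(-r^2) * (d - r*d^2 + ...),
// and the 1/3 constant appears in the d^3 term of that expansion.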
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +display = hexadecimal; +prec=128; + +// Tables +print("{ i, r, erf(r), 2/sqrt(pi) * exp(-r^2)}"); +for i from 0 to 512 do { + r = i / 128; + t0 = single(erf(r)); + t1 = single(2/sqrt(pi) * exp(-r * r)); + print("{ " @ i @ ",\t" @ r @ ",\t" @ t0 @ ",\t" @ t1 @ " },"); +}; + +// Constants +single(1/3); +single(2/sqrt(pi)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/exp10.sollya b/contrib/arm-optimized-routines/pl/math/tools/exp10.sollya new file mode 100644 index 000000000000..9f30b4018209 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/exp10.sollya @@ -0,0 +1,55 @@ +// polynomial for approximating 10^x +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// exp10f parameters +deg = 5; // poly degree +N = 1; // Neon 1, SVE 64 +b = log(2)/(2 * N * log(10)); // interval +a = -b; +wp = single; + +// exp10 parameters +//deg = 4; // poly degree - bump to 5 for ~1 ULP +//N = 128; // table size +//b = log(2)/(2 * N * log(10)); // interval +//a = -b; +//wp = D; + + +// find polynomial with minimal relative error + +f = 10^x; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx_abs = proc(poly,d) { + return remez(f(x) - poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|wp ...|]); +// p = roundcoefficients(approx_abs(poly,i), [|wp ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/10^x, [a;b], 30)); +print("abs error:", accurateinfnorm(10^x-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); + +log10_2 = round(N * log(10) / log(2), wp, RN); +log2_10 = log(2) / (N * log(10)); +log2_10_hi = round(log2_10, wp, RN); +log2_10_lo = round(log2_10 - log2_10_hi, wp, RN); +print(log10_2); +print(log2_10_hi); +print(log2_10_lo); diff --git a/contrib/arm-optimized-routines/pl/math/tools/sincos.sollya b/contrib/arm-optimized-routines/pl/math/tools/sincos.sollya new file mode 100644 index 000000000000..7d36266b446b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/sincos.sollya @@ -0,0 +1,33 @@ +// polynomial for approximating cos(x) +// +// Copyright (c) 2023, Arm Limited. 
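// Annotation, not part of the upstream script: the fitting loop below
// fixes the leading coefficient at 1 and then alternates a weighted remez
// fit of the remaining error with rounding of the newest coefficient to
// the working precision, so each printed coefficient is optimal given
// that all earlier ones are already rounded. Only even powers are fitted,
// matching the symmetry of cos on [-pi/4, pi/4].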
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// This script only finds the coeffs for cos - see math/aarch64/v_sin.c for sin coeffs + +deg = 14; // polynomial degree +a = -pi/4; // interval +b = pi/4; + +// find even polynomial with minimal abs error compared to cos(x) + +f = cos(x); + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg/2 do { + p = roundcoefficients(approx(poly,2*i), [|double ...|]); + poly = poly + x^(2*i)*coeff(p,0); +}; + +display = hexadecimal; +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +//print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/sincosf.sollya b/contrib/arm-optimized-routines/pl/math/tools/sincosf.sollya new file mode 100644 index 000000000000..178ee83ac196 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/sincosf.sollya @@ -0,0 +1,33 @@ +// polynomial for approximating cos(x) +// +// Copyright (c) 2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// This script only finds the coeffs for cos - see math/tools/sin.sollya for sin coeffs. + +deg = 8; // polynomial degree +a = -pi/4; // interval +b = pi/4; + +// find even polynomial with minimal abs error compared to cos(x) + +f = cos(x); + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg/2 do { + p = roundcoefficients(approx(poly,2*i), [|single ...|]); + poly = poly + x^(2*i)*coeff(p,0); +}; + +display = hexadecimal; +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +//print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/sinpi.sollya b/contrib/arm-optimized-routines/pl/math/tools/sinpi.sollya new file mode 100644 index 000000000000..62cc87e7697d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/sinpi.sollya @@ -0,0 +1,33 @@ +// polynomial for approximating sinpi(x) +// +// Copyright (c) 2023, Arm Limited. 
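// Annotation, not part of the upstream script: rather than passing
// sin(pi*x) to remez directly, the loop below materialises its Maclaurin
// series, f = sum over i of (-1)^i * (pi*x)^(2i+1) / (2i+1)!, with c
// accumulating the factorial via c = 2i*(2i+1)*c; 80 terms is far more
// than the interval [-1/2, 1/2] needs, so the substitute is exact to
// working precision.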
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 19; // polynomial degree +a = -1/2; // interval +b = 1/2; + +// find even polynomial with minimal abs error compared to sinpi(x) + +// f = sin(pi* x); +f = pi*x; +c = 1; +for i from 1 to 80 do { c = 2*i*(2*i + 1)*c; f = f + (-1)^i*(pi*x)^(2*i+1)/c; }; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)| +approx = proc(poly,d) { + return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); +}; + +// first coeff is predefined, iteratively find optimal double prec coeffs +poly = pi*x; +for i from 0 to (deg-1)/2 do { + p = roundcoefficients(approx(poly,2*i+1), [|D ...|]); + poly = poly + x^(2*i+1)*coeff(p,0); +}; + +display = hexadecimal; +print("abs error:", accurateinfnorm(sin(pi*x)-poly(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/trigpi_references.c b/contrib/arm-optimized-routines/pl/math/trigpi_references.c new file mode 100644 index 000000000000..4b0514b6766a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/trigpi_references.c @@ -0,0 +1,57 @@ +/* + * Extended precision scalar reference functions for trigpi. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "math_config.h" +#include "mathlib.h" + +long double +sinpil (long double x) +{ + /* sin(inf) should return nan, as defined by C23. */ + if (isinf (x)) + return __math_invalid (x); + + long double ax = fabsl (x); + + /* Return 0 for all values above 2^64 to prevent + overflow when casting to uint64_t. */ + if (ax >= 0x1p64) + return 0; + + /* All integer cases should return 0. */ + if (ax == (uint64_t) ax) + return 0; + + return sinl (x * M_PIl); +} + +long double +cospil (long double x) +{ + /* cos(inf) should return nan, as defined by C23. */ + if (isinf (x)) + return __math_invalid (x); + + long double ax = fabsl (x); + + if (ax >= 0x1p64) + return 1; + + uint64_t m = (uint64_t) ax; + + /* Integer values of cospi(x) should return +/-1. + The sign depends on whether x is odd or even. */ + if (m == ax) + return (m & 1) ? -1 : 1; + + /* Values of integer + 0.5 should always return 0. */ + if (ax - 0.5 == m || ax + 0.5 == m) + return 0; + + return cosl (ax * M_PIl); +} \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/v_acos_2u.c b/contrib/arm-optimized-routines/pl/math/v_acos_2u.c new file mode 100644 index 000000000000..581f8506c0d6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acos_2u.c @@ -0,0 +1,122 @@ +/* + * Double-precision vector acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[12]; + float64x2_t pi, pi_over_2; + uint64x2_t abs_mask; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.
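     Annotation, not from the upstream source: the fitted function is
     f(z) = (asin(sqrt(z)) - sqrt(z)) / (z * sqrt(z)), i.e. the forward
     transform of tools/asin.sollya above, so with y = sqrt(z) it rebuilds
     asin(y) = y + y^3 * P(y^2), an odd polynomial in y. Evaluating P at
     z = x^2 for |x| < 0.5, or at z = (1 - |x|) / 2 otherwise, is what
     lets one coefficient set serve both branches below.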
*/ + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, + .pi = V2 (0x1.921fb54442d18p+1), + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define Oneu (0x3ff0000000000000) +#define Small (0x3e50000000000000) /* 2^-53. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (acos, x, y, special); +} +#endif + +/* Double-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 + want 0x1.edbbedf8a7d6cp-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (Oneu - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x), + vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax)); + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
*/ + float64x2_t y = vbslq_f64 (d->abs_mask, p, x); + + uint64x2_t is_neg = vcltzq_f64 (x); + float64x2_t off = vreinterpretq_f64_u64 ( + vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi))); + float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0)); + float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off); + + return vfmaq_f64 (add, mul, y); +} + +PL_SIG (V, D, 1, acos, -1.0, 1.0) +PL_TEST_ULP (V_NAME_D1 (acos), 1.02) +PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_acosf_1u4.c b/contrib/arm-optimized-routines/pl/math/v_acosf_1u4.c new file mode 100644 index 000000000000..bb17b1df18f3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acosf_1u4.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector acos(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f, pif; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), + .pif = V4 (0x1.921fb6p+1f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x32800000 /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (acosf, x, y, special); +} +#endif + +/* Single-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.26 ulps, + _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. 
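     Annotation, not from the upstream source: this is the standard
     unsigned out-of-range trick. ia - Small wraps to a huge value for
     ia < Small, so (ia - Small) > (One - Small) holds exactly when
     ia < Small or ia > One; everything above asuint(1.0f), including
     infinities and NaN encodings, lands in the second case, so tiny,
     out-of-domain and NaN lanes are all flagged by one compare.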
*/ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x); + + uint32x4_t is_neg = vcltzq_f32 (x); + float32x4_t off = vreinterpretq_f32_u32 ( + vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg)); + float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0)); + float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off); + + return vfmaq_f32 (add, mul, y); +} + +PL_SIG (V, F, 1, acos, -1.0, 1.0) +PL_TEST_ULP (V_NAME_F1 (acos), 0.82) +PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c index 22f69d7636e4..42fa2616d562 100644 --- a/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c @@ -1,51 +1,66 @@ /* * Double-precision vector acosh(x) function. * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" #define WANT_V_LOG1P_K0_SHORTCUT 1 #include "v_log1p_inline.h" -#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ - -#if V_SUPPORTED +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one, thresh; +} data = { + .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */ }; -static NOINLINE VPCS_ATTR v_f64_t -special_case (v_f64_t x) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special, + const struct v_log1p_data *d) { - return v_call_f64 (acosh, x, x, v_u64 (-1)); + return v_call_f64 (acosh, x, log1p_inline (y, d), special); } /* Vector approximation for double-precision acosh, based on log1p. The largest observed error is 3.02 ULP in the region where the argument to log1p falls in the k=0 interval, i.e. x close to 1: - __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 - want 0x1.f2d6d823bc9e2p-5. */ -VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x) + _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 + want 0x1.f2d6d823bc9e2p-5.
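     Annotation, not from the upstream source: the reduction used below is
     acosh(x) = log(x + sqrt(x^2 - 1))
              = log1p((x - 1) + sqrt((x - 1) * (x + 1))),
     matching xm1 + sqrt(xm1 * (x + 1)) as the argument to log1p_inline.
     Feeding the shifted argument to log1p avoids the cancellation a plain
     log would suffer as x -> 1.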
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) { - v_u64_t itop = v_as_u64_f64 (x) >> 52; - v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop)); + const struct data *d = ptr_barrier (&data); + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh); + float64x2_t special_arg = x; - /* Fall back to scalar routine for all lanes if any of them are special. */ +#if WANT_SIMD_EXCEPT if (unlikely (v_any_u64 (special))) - return special_case (x); + x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); +#endif - v_f64_t xm1 = x - 1; - v_f64_t u = xm1 * (x + 1); - return log1p_inline (xm1 + v_sqrt_f64 (u)); + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1)); + float64x2_t y; + y = vaddq_f64 (x, v_f64 (1)); + y = vmulq_f64 (y, xm1); + y = vsqrtq_f64 (y); + y = vaddq_f64 (xm1, y); + + if (unlikely (v_any_u64 (special))) + return special_case (special_arg, y, special, &d->log1p_consts); + return log1p_inline (y, &d->log1p_consts); } -VPCS_ALIAS PL_SIG (V, D, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (V_NAME (acosh), 2.53) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh)) -PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000) -PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000) -PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000) -PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000) -#endif +PL_TEST_ULP (V_NAME_D1 (acosh), 2.53) +PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c index 2b5aff591a74..a2ff0f02635b 100644 --- a/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c +++ b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c @@ -1,68 +1,78 @@ /* * Single-precision vector acosh(x) function. * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "v_log1pf_inline.h" -#define SignMask 0x80000000 -#define One 0x3f800000 -#define SquareLim 0x5f800000 /* asuint(0x1p64). */ - -#if V_SUPPORTED +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; + uint16x4_t thresh; +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .one = V4 (0x3f800000), + .thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1). */ +}; -#include "v_log1pf_inline.h" +#define SignMask 0x80000000 -static NOINLINE VPCS_ATTR v_f32_t -special_case (v_f32_t x, v_f32_t y, v_u32_t special) +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint16x4_t special, + const struct v_log1pf_data d) { - return v_call_f32 (acoshf, x, y, special); + return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); } /* Vector approximation for single-precision acosh, based on log1p. Maximum error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it is 2.78 ULP: __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 want 0x1.ef9ea2p-3. With exceptions disabled, we can compute u with a shorter dependency chain, which gives maximum error of 3.07 ULP: __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 want 0x1.fbc7f4p-4. 
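     Annotation, not from the upstream source: the special-lane test below
     also narrows the compare: vsubhn_u32 keeps only the top 16 bits of
     ix - asuint(1.0), and comparing them with 0x2000, the top 16 bits of
     asuint(0x1p64) - asuint(1.0), reproduces the usual unsigned range
     check at half the width, so the any-lane test runs on a 64-bit mask.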
*/ -VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One)); + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh); #if WANT_SIMD_EXCEPT /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use - only xm1 to calculate u, as operating on x will trigger invalid for NaN. */ - v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1); - v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1); + only xm1 to calculate u, as operating on x will trigger invalid for NaN. + Widening sign-extend special predicate in order to mask with it. */ + uint32x4_t p + = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special))); + float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); + float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); #else - v_f32_t xm1 = x - 1; - v_f32_t u = xm1 * (x + 1.0f); + float32x4_t xm1 = vsubq_f32 (x, v_f32 (1)); + float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f))); #endif - v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u)); - if (unlikely (v_any_u32 (special))) - return special_case (x, y, special); - return y; + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, special, d->log1pf_consts); + return log1pf_inline (y, d->log1pf_consts); } -VPCS_ALIAS PL_SIG (V, F, 1, acosh, 1.0, 10.0) #if WANT_SIMD_EXCEPT -PL_TEST_ULP (V_NAME (acoshf), 2.29) +PL_TEST_ULP (V_NAME_F1 (acosh), 2.29) #else -PL_TEST_ULP (V_NAME (acoshf), 2.58) -#endif -PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500) -PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000) -PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000) -PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000) +PL_TEST_ULP (V_NAME_F1 (acosh), 2.58) #endif +PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000) +PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_asin_3u.c b/contrib/arm-optimized-routines/pl/math/v_asin_3u.c new file mode 100644 index 000000000000..756443c6b320 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asin_3u.c @@ -0,0 +1,113 @@ +/* + * Double-precision vector asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[12]; + float64x2_t pi_over_2; + uint64x2_t abs_mask; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
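     Annotation, not from the upstream source: these are the same 12
     coefficients as in v_acos_2u.c above. Both kernels evaluate the fit
     produced by tools/asin.sollya earlier in this patch, whose backward
     form d + d^3 * poly(d^2) rebuilds asin(d), so a single Remez fit
     serves both the asin and acos kernels.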
*/ + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define One (0x3ff0000000000000) +#define Small (0x3e50000000000000) /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (asin, x, y, special); +} +#endif + +/* Double-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-26 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.01 ulps, + _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2 + want 0x1.ed78525a927eep-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 + want 0x1.110d7e85fdd53p-1. */ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) { + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate exceptions are raised. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (One - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x), + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5)); + float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0)); + + /* Copy sign.
*/ + return vbslq_f64 (d->abs_mask, y, x); +} + +PL_SIG (V, D, 1, asin, -1.0, 1.0) +PL_TEST_ULP (V_NAME_D1 (asin), 2.19) +PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_asinf_2u5.c b/contrib/arm-optimized-routines/pl/math/v_asinf_2u5.c new file mode 100644 index 000000000000..eb978cd956ab --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asinf_2u5.c @@ -0,0 +1,104 @@ +/* + * Single-precision vector asin(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x39800000 /* 2^-12. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (asinf, x, y, special); +} +#endif + +/* Single-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate fp exceptions are raised. */ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. 
*/ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + + /* Copy sign. */ + return vbslq_f32 (v_u32 (AbsMask), y, x); +} + +PL_SIG (V, F, 1, asin, -1.0, 1.0) +PL_TEST_ULP (V_NAME_F1 (asin), 1.91) +PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c index fd329b6b7f69..4862bef94861 100644 --- a/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c @@ -1,175 +1,175 @@ /* * Double-precision vector asinh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +#define A(i) v_f64 (__v_log_data.poly[i]) +#define N (1 << V_LOG_TABLE_BITS) -#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ -#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ -#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */ -#define AbsMask v_u64 (0x7fffffffffffffff) -#define C(i) v_f64 (__asinh_data.poly[i]) - -/* Constants & data for log. */ -#define OFF 0x3fe6000000000000 -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define A(i) v_f64 (__sv_log_data.poly[i]) -#define T(i) __log_data.tab[i] -#define N (1 << LOG_TABLE_BITS) +const static struct data +{ + float64x2_t poly[18]; + uint64x2_t off, huge_bound, abs_mask; + float64x2_t ln2, tiny_bound; +} data = { + .off = V2 (0x3fe6900900000000), + .ln2 = V2 (0x1.62e42fefa39efp-1), + .huge_bound = V2 (0x5fe0000000000000), + .tiny_bound = V2 (0x1p-26), + .abs_mask = V2 (0x7fffffffffffffff), + /* Even terms of polynomial s.t. asinh(x) is approximated by + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). 
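     Annotation, not from the upstream source: "even terms" means P is
     evaluated at x^2. With f as above, asinh(sqrt(z)) = sqrt(z) +
     z * sqrt(z) * f(z), so substituting z = x^2 gives
     asinh(x) = x + x^3 * P(x^2) for |x| < 1, which is exactly how
     option_2 is assembled below: p = P(x2), then ax + p * x3 via fma.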
*/ + .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4), + V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6), + V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6), + V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7), + V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7), + V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8), + V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9), + V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12), + V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) }, +}; -static NOINLINE v_f64_t -special_case (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (asinh, x, y, special); } struct entry { - v_f64_t invc; - v_f64_t logc; + float64x2_t invc; + float64x2_t logc; }; static inline struct entry -lookup (v_u64_t i) +lookup (uint64x2_t i) { - struct entry e; -#ifdef SCALAR - e.invc = T (i).invc; - e.logc = T (i).logc; -#else - e.invc[0] = T (i[0]).invc; - e.logc[0] = T (i[0]).logc; - e.invc[1] = T (i[1]).invc; - e.logc[1] = T (i[1]).logc; -#endif - return e; + float64x2_t e0 = vld1q_f64 ( + &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); + float64x2_t e1 = vld1q_f64 ( + &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); + return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) }; } -static inline v_f64_t -log_inline (v_f64_t x) +static inline float64x2_t +log_inline (float64x2_t x, const struct data *d) { - /* Double-precision vector log, copied from math/v_log.c with some cosmetic - modification and special-cases removed. See that file for details of the - algorithm used. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t tmp = ix - OFF; - v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; - v_s64_t k = v_as_s64_u64 (tmp) >> 52; - v_u64_t iz = ix - (tmp & 0xfffULL << 52); - v_f64_t z = v_as_f64_u64 (iz); - struct entry e = lookup (i); - v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - v_f64_t kd = v_to_f64_s64 (k); - v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); - v_f64_t r2 = r * r; - v_f64_t y = v_fma_f64 (A (3), r, A (2)); - v_f64_t p = v_fma_f64 (A (1), r, A (0)); - y = v_fma_f64 (A (4), r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); + /* Double-precision vector log, copied from ordinary vector log with some + cosmetic modification and special-cases removed. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t tmp = vsubq_u64 (ix, d->off); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz + = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52))); + float64x2_t z = vreinterpretq_f64_u64 (iz); + struct entry e = lookup (tmp); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_f64 (A (2), A (3), r); + float64x2_t p = vfmaq_f64 (A (0), A (1), r); + y = vfmaq_f64 (y, A (4), r2); + y = vfmaq_f64 (p, y, r2); + y = vfmaq_f64 (hi, y, r2); return y; } /* Double-precision implementation of vector asinh(x). asinh is very sensitive around 1, so it is impractical to devise a single low-cost algorithm which is sufficiently accurate on a wide range of input. 
Instead we use two different algorithms: asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1 = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise where log(x) is an optimized log approximation, and P(x) is a polynomial shared with the scalar routine. The greatest observed error is 3.29 ULP, in |x| >= 1: __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 want 0x1.ffffcfd0e2352p-1. */ -VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) +VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_f64_t ax = v_as_f64_u64 (iax); - v_u64_t top12 = iax >> 52; + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t iax = vreinterpretq_u64_f64 (ax); - v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); - v_u64_t special = v_cond_u64 (top12 >= HugeBound); + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); + uint64x2_t special = vcgeq_u64 (iax, d->huge_bound); #if WANT_SIMD_EXCEPT - v_u64_t tiny = v_cond_u64 (top12 < TinyBound); - special |= tiny; + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); + special = vorrq_u64 (special, tiny); #endif /* Option 1: |x| >= 1. Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will overflow, by setting special lanes to 1. These will be fixed later. */ - v_f64_t option_1 = v_f64 (0); + float64x2_t option_1 = v_f64 (0); if (likely (v_any_u64 (gt1))) { #if WANT_SIMD_EXCEPT - v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); + float64x2_t xm = v_zerofy_f64 (ax, special); #else - v_f64_t xm = ax; + float64x2_t xm = ax; #endif - option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1)); + option_1 = log_inline ( + vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); } /* Option 2: |x| < 1. Compute asinh(x) using a polynomial. If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will overflow, and tiny lanes, which will underflow, by setting them to 0. They will be fixed later, either by selecting x or falling back to the scalar special-case. The largest observed error in this region is 1.47 ULPs: __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 want 0x1.c1d6bf874019cp-1. */ - v_f64_t option_2 = v_f64 (0); - if (likely (v_any_u64 (~gt1))) + float64x2_t option_2 = v_f64 (0); + if (likely (v_any_u64 (vceqzq_u64 (gt1)))) { #if WANT_SIMD_EXCEPT - ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); #endif - v_f64_t x2 = ax * ax; - v_f64_t z2 = x2 * x2; - v_f64_t z4 = z2 * z2; - v_f64_t z8 = z4 * z4; - v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); - option_2 = v_fma_f64 (p, x2 * ax, ax); + float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2), + z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2), + z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly); + option_2 = vfmaq_f64 (ax, p, x3); #if WANT_SIMD_EXCEPT - option_2 = v_sel_f64 (tiny, x, option_2); + option_2 = vbslq_f64 (tiny, x, option_2); #endif } /* Choose the right option for each lane. */ - v_f64_t y = v_sel_f64 (gt1, option_1, option_2); + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); /* Copy sign.
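   asinh is odd, so the result simply takes the sign bit of x.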
*/ - y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); + y = vbslq_f64 (d->abs_mask, y, x); if (unlikely (v_any_u64 (special))) return special_case (x, y, special); return y; } -VPCS_ALIAS PL_SIG (V, D, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (asinh), 2.80) -PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) +PL_TEST_ULP (V_NAME_D1 (asinh), 2.80) +PL_TEST_EXPECT_FENV (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT) /* Test vector asinh 3 times, with control lane < 1, > 1 and special. Ensures the v_sel is choosing the right option in all cases. */ -#define V_ASINH_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ - PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) +#define V_ASINH_INTERVAL(lo, hi, n) \ + PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0.5) \ + PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 2) \ + PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0x1p600) V_ASINH_INTERVAL (0, 0x1p-26, 50000) V_ASINH_INTERVAL (0x1p-26, 1, 50000) V_ASINH_INTERVAL (1, 0x1p511, 50000) V_ASINH_INTERVAL (0x1p511, inf, 40000) -V_ASINH_INTERVAL (-0, -0x1p-26, 50000) -V_ASINH_INTERVAL (-0x1p-26, -1, 50000) -V_ASINH_INTERVAL (-1, -0x1p511, 50000) -V_ASINH_INTERVAL (-0x1p511, -inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c index 9d8c8a936ae3..1723ba90d2f3 100644 --- a/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c +++ b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c @@ -1,70 +1,80 @@ /* * Single-precision vector asinh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "include/mathlib.h" #include "pl_sig.h" #include "pl_test.h" - -#if V_SUPPORTED +#include "v_log1pf_inline.h" #define SignMask v_u32 (0x80000000) -#define One v_f32 (1.0f) -#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */ -#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */ -#include "v_log1pf_inline.h" +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t big_bound; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ +#endif +}; -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { return v_call_f32 (asinhf, x, y, special); } /* Single-precision implementation of vector asinh(x), using vector log1p. Worst-case error is 2.66 ULP, at roughly +/-0.25: __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ -VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (asinh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & ~SignMask; - v_u32_t sign = ix & SignMask; - v_f32_t ax = v_as_f32_u32 (iax); - v_u32_t special = v_cond_u32 (iax >= BigBound); + const struct data *dat = ptr_barrier (&data); + uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask); + float32x4_t ax = vreinterpretq_f32_u32 (iax); + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); + float32x4_t special_arg = x; #if WANT_SIMD_EXCEPT /* Sidestep tiny and large values to avoid inadvertently triggering under/overflow. 
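   The affected lanes are zeroed here and the original argument is saved in special_arg, so the scalar fallback can recompute them exactly.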
*/ - special |= v_cond_u32 (iax < TinyBound); + special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); if (unlikely (v_any_u32 (special))) - ax = v_sel_f32 (special, One, ax); + { + ax = v_zerofy_f32 (ax, special); + x = v_zerofy_f32 (x, special); + } #endif /* asinh(x) = log(x + sqrt(x * x + 1)). For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ - v_f32_t d = One + v_sqrt_f32 (ax * ax + One); - v_f32_t y = log1pf_inline (ax + ax * ax / d); - y = v_as_f32_u32 (sign | v_as_u32_f32 (y)); + float32x4_t d + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x))); + float32x4_t y = log1pf_inline ( + vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts); if (unlikely (v_any_u32 (special))) - return specialcase (x, y, special); - return y; + return special_case (special_arg, vbslq_f32 (SignMask, x, y), special); + return vbslq_f32 (SignMask, x, y); } -VPCS_ALIAS PL_SIG (V, F, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (asinhf), 2.17) -PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000) -PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000) -PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000) -PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000) -PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000) -#endif +PL_TEST_ULP (V_NAME_F1 (asinh), 2.17) +PL_TEST_EXPECT_FENV (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000) +PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c index 6327fea8eb2c..f24667682dec 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c @@ -1,90 +1,121 @@ /* * Double-precision vector atan2(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#include "atan_common.h" +static const struct data +{ + float64x2_t pi_over_2; + float64x2_t poly[20]; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + the interval [2**-1022, 1.0]. 
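+     The first coefficient is -1/3, in line with the Maclaurin series
+     atan(x) = x - x^3/3 + x^5/5 - ..., which sanity-checks the table.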
*/ + .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), + V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), + V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), + V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), + V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), + V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), + V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), + V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), + V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), + V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), +}; -#define PiOver2 v_f64 (0x1.921fb54442d18p+0) #define SignMask v_u64 (0x8000000000000000) /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) { return v_call2_f64 (atan2, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ -static inline v_u64_t -zeroinfnan (v_u64_t i) +static inline uint64x2_t +zeroinfnan (uint64x2_t i) { - return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1)); + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), + v_u64 (2 * asuint64 (INFINITY) - 1)); } /* Fast implementation of vector atan2. Maximum observed error is 2.8 ulps: - v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) got 0x1.92d628ab678ccp-1 want 0x1.92d628ab678cfp-1. */ -VPCS_ATTR -v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x) +float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iy = v_as_u64_f64 (y); + const struct data *data_ptr = ptr_barrier (&data); - v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); - v_u64_t sign_x = ix & SignMask; - v_u64_t sign_y = iy & SignMask; - v_u64_t sign_xy = sign_x ^ sign_y; + uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); - v_f64_t ax = v_abs_f64 (x); - v_f64_t ay = v_abs_f64 (y); + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); - v_u64_t pred_xlt0 = x < 0.0; - v_u64_t pred_aygtax = ay > ax; + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); /* Set up z for call to atan. */ - v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay); - v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax); - v_f64_t z = v_div_f64 (n, d); + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, d); /* Work out the correct shift. */ - v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0)); - shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift); - shift *= PiOver2; - - v_f64_t ret = eval_poly (z, z, shift); + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); + shift = vmulq_f64 (shift, data_ptr->pi_over_2); + + /* Calculate the polynomial approximation. 
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t ret + = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), + v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); /* Account for the sign of x and y. */ - ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy); + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); if (unlikely (v_any_u64 (special_cases))) - { - return specialcase (y, x, ret, special_cases); - } + return special_case (y, x, ret, special_cases); return ret; } -VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ PL_SIG (V, D, 2, atan2) // TODO tighten this once __v_atan2 is fixed -PL_TEST_ULP (V_NAME (atan2), 2.9) -PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (V_NAME_D2 (atan2), 2.9) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c index 5d1e6ca4488e..bbfc3cb552f6 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c @@ -1,89 +1,115 @@ /* * Single-precision vector atan2(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f32.h" -#if V_SUPPORTED - -#include "atanf_common.h" +static const struct data +{ + float32x4_t poly[8]; + float32x4_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, + .pi_over_2 = V4 (0x1.921fb6p+0f), +}; -/* Useful constants. */ -#define PiOver2 v_f32 (0x1.921fb6p+0f) #define SignMask v_u32 (0x80000000) /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) { return v_call2_f32 (atan2f, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ -static inline v_u32_t -zeroinfnan (v_u32_t i) +static inline uint32x4_t +zeroinfnan (uint32x4_t i) { - return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. 
*/ + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), + v_u32 (2 * 0x7f800000lu - 1)); } /* Fast implementation of vector atan2f. Maximum observed error is 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: - v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -VPCS_ATTR -v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) + _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iy = v_as_u32_f32 (y); + const struct data *data_ptr = ptr_barrier (&data); - v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); - v_u32_t sign_x = ix & SignMask; - v_u32_t sign_y = iy & SignMask; - v_u32_t sign_xy = sign_x ^ sign_y; + uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); - v_f32_t ax = v_abs_f32 (x); - v_f32_t ay = v_abs_f32 (y); + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); + uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y); - v_u32_t pred_xlt0 = x < 0.0f; - v_u32_t pred_aygtax = ay > ax; + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); /* Set up z for call to atanf. */ - v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); - v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); - v_f32_t z = v_div_f32 (n, d); + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, d); /* Work out the correct shift. */ - v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); - shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); - shift *= PiOver2; - - v_f32_t ret = eval_poly (z, z, shift); + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); + shift = vmulq_f32 (shift, data_ptr->pi_over_2); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t ret = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); + + /* y = shift + z * P(z^2). */ + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); /* Account for the sign of y. */ - ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); if (unlikely (v_any_u32 (special_cases))) { - return specialcase (y, x, ret, special_cases); + return special_case (y, x, ret, special_cases); } return ret; } -VPCS_ALIAS /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ PL_SIG (V, F, 2, atan2) -PL_TEST_ULP (V_NAME (atan2f), 2.46) -PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000) -#endif +PL_TEST_ULP (V_NAME_F2 (atan2), 2.46) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c index 0f3c2ccf2606..ba68cc3cc720 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c @@ -1,74 +1,104 @@ /* * Double-precision vector atan(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#include "atan_common.h" +static const struct data +{ + float64x2_t pi_over_2; + float64x2_t poly[20]; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), + V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), + V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), + V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), + V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), + V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), + V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), + V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), + V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), + V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), +}; -#define PiOver2 v_f64 (0x1.921fb54442d18p+0) -#define AbsMask v_u64 (0x7fffffffffffffff) -#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ -#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ +#define SignMask v_u64 (0x8000000000000000) +#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */ +#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */ /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: - __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ -VPCS_ATTR -v_f64_t V_NAME (atan) (v_f64_t x) + _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { + const struct data *d = ptr_barrier (&data); + /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need fenv. 
*/ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t sign = ix & ~AbsMask; + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t sign = vandq_u64 (ix, SignMask); #if WANT_SIMD_EXCEPT - v_u64_t ia12 = (ix >> 52) & 0x7ff; - v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound); + uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000)); + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)), + v_u64 (BigBound - TinyBound)); /* If any lane is special, fall back to the scalar routine for all lanes. */ if (unlikely (v_any_u64 (special))) return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); #endif /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); + uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); - v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); /* Use absolute value only when needed (odd powers of z). */ - v_f64_t az = v_abs_f64 (z); - az = v_sel_f64 (red, -az, az); + float64x2_t az = vbslq_f64 ( + SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); - /* Calculate the polynomial approximation. */ - v_f64_t y = eval_poly (z, az, shift); + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t y + = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), + v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); + y = vaddq_f64 (y, shift); /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); return y; } -VPCS_ALIAS PL_SIG (V, D, 1, atan, -10.0, 10.0) -PL_TEST_ULP (V_NAME (atan), 1.78) -PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000) -PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000) -PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000) -PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000) -PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000) -PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000) - -#endif +PL_TEST_ULP (V_NAME_D1 (atan), 1.78) +PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000) +PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c index 67d90b94f5d3..f522d957c1cc 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c @@ -1,83 +1,107 @@ /* * Single-precision vector atan(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f32.h" -#if V_SUPPORTED +static const struct data +{ + float32x4_t poly[8]; + float32x4_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, + .pi_over_2 = V4 (0x1.921fb6p+0f), +}; + +#define SignMask v_u32 (0x80000000) -#include "atanf_common.h" +#define P(i) d->poly[i] -#define PiOver2 v_f32 (0x1.921fb6p+0f) -#define AbsMask v_u32 (0x7fffffff) -#define TinyBound 0x308 /* top12(asuint(0x1p-30)). */ -#define BigBound 0x4e8 /* top12(asuint(0x1p30)). */ +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). */ #if WANT_SIMD_EXCEPT -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { return v_call_f32 (atanf, x, y, special); } #endif /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: - v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ -VPCS_ATTR -v_f32_t V_NAME (atanf) (v_f32_t x) + _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x) { + const struct data *d = ptr_barrier (&data); + /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need fenv. 
*/ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t sign = ix & ~AbsMask; + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t sign = vandq_u32 (ix, SignMask); #if WANT_SIMD_EXCEPT - v_u32_t ia12 = (ix >> 20) & 0x7ff; - v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound); + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), + v_u32 (BigBound - TinyBound)); /* If any lane is special, fall back to the scalar routine for all lanes. */ if (unlikely (v_any_u32 (special))) - return specialcase (x, x, v_u32 (-1)); + return special_case (x, x, v_u32 (-1)); #endif /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - v_u32_t red = v_cagt_f32 (x, v_f32 (1.0)); + uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x); - v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f)); + float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); /* Use absolute value only when needed (odd powers of z). */ - v_f32_t az = v_abs_f32 (z); - az = v_sel_f32 (red, -az, az); + float32x4_t az = vbslq_f32 ( + SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); - /* Calculate the polynomial approximation. */ - v_f32_t y = eval_poly (z, az, shift); + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t y = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); + + /* y = shift + z * P(z^2). */ + y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); return y; } -VPCS_ALIAS PL_SIG (V, F, 1, atan, -10.0, 10.0) -PL_TEST_ULP (V_NAME (atanf), 2.5) -PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000) -PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000) -PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000) -PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000) -PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_F1 (atan), 2.5) +PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c index bfaf5c2b917f..f282826a3f32 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c @@ -1,61 +1,66 @@ /* * Double-precision vector atanh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pairwise_horner.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - #define WANT_V_LOG1P_K0_SHORTCUT 0 #include "v_log1p_inline.h" -#define AbsMask 0x7fffffffffffffff -#define Half 0x3fe0000000000000 -#define One 0x3ff0000000000000 +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one, half; +} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .half = V2 (0x3fe0000000000000) }; -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (atanh, x, y, special); } /* Approximation for vector double-precision atanh(x) using modified log1p. The greatest observed error is 3.31 ULP: - __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 - want 0x1.ffd8ff31b501cp-6. */ + _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ VPCS_ATTR -v_f64_t V_NAME (atanh) (v_f64_t x) +float64x2_t V_NAME_D1 (atanh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t sign = ix & ~AbsMask; - v_u64_t ia = ix & AbsMask; - v_u64_t special = v_cond_u64 (ia >= One); - v_f64_t halfsign = v_as_f64_u64 (sign | Half); + const struct data *d = ptr_barrier (&data); - /* Mask special lanes with 0 to prevent spurious underflow. 
*/ - v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia)); - v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (ax); + uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia); + uint64x2_t special = vcgeq_u64 (ia, d->one); + float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half)); + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, special); +#endif + + float64x2_t y; + y = vaddq_f64 (ax, ax); + y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax)); + y = log1p_inline (y, &d->log1p_consts); if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; + return special_case (x, vmulq_f64 (y, halfsign), special); + return vmulq_f64 (y, halfsign); } -VPCS_ALIAS PL_SIG (V, D, 1, atanh, -1.0, 1.0) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh)) -PL_TEST_ULP (V_NAME (atanh), 3.32) -PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0) -PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0) -#endif +PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT) +PL_TEST_ULP (V_NAME_D1 (atanh), 3.32) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0) diff --git a/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c index cd3069661142..f6a5f25eca9a 100644 --- a/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c +++ b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c @@ -1,62 +1,77 @@ /* * Single-precision vector atanh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "v_log1pf_inline.h" -#if V_SUPPORTED +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .one = V4 (0x3f800000), +#if WANT_SIMD_EXCEPT + /* 0x1p-12, below which atanhf(x) rounds to x. */ + .tiny_bound = V4 (0x39800000), +#endif +}; -#include "v_log1pf_inline.h" +#define AbsMask v_u32 (0x7fffffff) +#define Half v_u32 (0x3f000000) -#define AbsMask 0x7fffffff -#define Half 0x3f000000 -#define One 0x3f800000 -#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (atanhf, x, y, special); +} /* Approximation for vector single-precision atanh(x) using modified log1p. The maximum error is 3.08 ULP: __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 want 0x1.ffcb82p-5. 
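   The identity used is atanh(|x|) = 0.5 * log1p(2|x| / (1 - |x|)), with the sign of x restored through the halfsign factor; an equivalent scalar sketch (illustrative only) is copysignf (0.5f, x) * log1pf (2 * fabsf (x) / (1 - fabsf (x))).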
*/ -VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_f32_t halfsign - = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix)); - v_u32_t iax = ix & AbsMask; + const struct data *d = ptr_barrier (&data); - v_f32_t ax = v_as_f32_u32 (iax); + float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); #if WANT_SIMD_EXCEPT - v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); + uint32x4_t special + = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound)); /* Side-step special cases by setting those lanes to 0, which will trigger no exceptions. These will be fixed up later. */ if (unlikely (v_any_u32 (special))) - ax = v_sel_f32 (special, v_f32 (0), ax); + ax = v_zerofy_f32 (ax, special); #else - v_u32_t special = v_cond_u32 (iax >= One); + uint32x4_t special = vcgeq_u32 (iax, d->one); #endif - v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax)); + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax)); + y = log1pf_inline (y, d->log1pf_consts); if (unlikely (v_any_u32 (special))) - return v_call_f32 (atanhf, x, y, special); - return y; + return special_case (x, vmulq_f32 (halfsign, y), special); + return vmulq_f32 (halfsign, y); } -VPCS_ALIAS PL_SIG (V, F, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (V_NAME (atanhf), 2.59) -PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0) -PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0) -#endif +PL_TEST_ULP (V_NAME_F1 (atanh), 2.59) +PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0) diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c index d5abe41024bc..cc7cff15dc0f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c +++ b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c @@ -1,98 +1,116 @@ /* * Double-precision vector cbrt(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED +const static struct data +{ + float64x2_t poly[4], one_third, shift; + int64x2_t exp_bias; + uint64x2_t abs_mask, tiny_bound; + uint32x4_t thresh; + double table[5]; +} data = { + .shift = V2 (0x1.8p52), + .poly = { /* Generated with fpminimax in [0.5, 1]. */ + V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1), + V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) }, + .exp_bias = V2 (1022), + .abs_mask = V2(0x7fffffffffffffff), + .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */ + .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. 
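+		Stored as a 32-bit constant because vsubhn_u64 narrows each
+		lane's difference to its top 32 bits before the comparison.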
*/ + .one_third = V2(0x1.5555555555555p-2), + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 } +}; -#define AbsMask 0x7fffffffffffffff -#define TwoThirds v_f64 (0x1.5555555555555p-1) -#define TinyBound 0x001 /* top12 (smallest_normal). */ -#define BigBound 0x7ff /* top12 (infinity). */ #define MantissaMask v_u64 (0x000fffffffffffff) -#define HalfExp v_u64 (0x3fe0000000000000) - -#define C(i) v_f64 (__cbrt_data.poly[i]) -#define T(i) v_lookup_f64 (__cbrt_data.table, i) -static NOINLINE v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint32x2_t special) { - return v_call_f64 (cbrt, x, y, special); + return v_call_f64 (cbrt, x, y, vmovl_u32 (special)); } /* Approximation for double-precision vector cbrt(x), using low-order polynomial and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat according to the exponent, for instance an error observed for double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an integer. __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 want 0x1.965fe72821e99p+0. */ -VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) +VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_u64_t ia12 = iax >> 52; + const struct data *d = ptr_barrier (&data); + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); /* Subnormal, +/-0 and special values. */ - v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound)); + uint32x2_t special + = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh)); /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector version of frexp, which gets subnormal values wrong - these have to be special-cased as a result. */ - v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp)); - v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022; + float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5)); + int64x2_t exp_bias = d->exp_bias; + uint64x2_t ia12 = vshrq_n_u64 (iax, 52); + int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias); /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for Newton iterations. */ - v_f64_t p_01 = v_fma_f64 (C (1), m, C (0)); - v_f64_t p_23 = v_fma_f64 (C (3), m, C (2)); - v_f64_t p = v_fma_f64 (m * m, p_23, p_01); - + float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly); + float64x2_t one_third = d->one_third; /* Two iterations of Newton's method for iteratively approximating cbrt. */ - v_f64_t m_by_3 = m / 3; - v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p)); - a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a)); + float64x2_t m_by_3 = vmulq_f64 (m, one_third); + float64x2_t two_thirds = vaddq_f64 (one_third, one_third); + float64x2_t a + = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p); + a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a); /* Assemble the result by the following: cbrt(x) = cbrt(m) * 2 ^ (e / 3). We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is not necessarily a multiple of 3 we lose some information. Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is an integer in [-2, 2], and can be looked up in the table T. 
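     For example, e = 7 gives round(e / 3) = 2 and remainder i = 1, so t = 2^(1/3), stored at index i + 2 = 3 of the table.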
Hence the result is assembled as: cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ - v_s64_t ey = e / 3; - v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2)); + float64x2_t ef = vcvtq_f64_s64 (e); + float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third)); + int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3))); + int64x2_t ey = vcvtq_s64_f64 (eb3f); + + float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] }; + my = vmulq_f64 (my, a); /* Vector version of ldexp. */ - v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my; - /* Copy sign. */ - y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix)); + float64x2_t y = vreinterpretq_f64_s64 ( + vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52)); + y = vmulq_f64 (y, my); + + if (unlikely (v_any_u32h (special))) + return special_case (x, vbslq_f64 (d->abs_mask, y, x), special); - if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); } -VPCS_ALIAS -PL_TEST_ULP (V_NAME (cbrt), 1.30) +PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30) PL_SIG (V, D, 1, cbrt, -10.0, 10.0) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt)) -PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000) -PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000) -#endif +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt)) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c deleted file mode 100644 index 62fa37505834..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Single-precision vector cbrt(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define AbsMask 0x7fffffff -#define SignMask v_u32 (0x80000000) -#define TwoThirds v_f32 (0x1.555556p-1f) -#define SmallestNormal 0x00800000 -#define MantissaMask 0x007fffff -#define HalfExp 0x3f000000 - -#define C(i) v_f32 (__cbrtf_data.poly[i]) -#define T(i) v_lookup_f32 (__cbrtf_data.table, i) - -static NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) -{ - return v_call_f32 (cbrtf, x, y, special); -} - -/* Approximation for vector single-precision cbrt(x) using Newton iteration with - initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP. - This is observed for every value where the mantissa is 0x1.81410e and the - exponent is a multiple of 3, for example: - __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 - want 0x1.255d92p+10. */ -VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) -{ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - - /* Subnormal, +/-0 and special values. */ - v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000)); - - /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector - version of frexpf, which gets subnormal values wrong - these have to be - special-cased as a result. */ - v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp); - v_s32_t e = v_as_s32_u32 (iax >> 23) - 126; - - /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, - the less accurate the next stage of the algorithm needs to be. An order-4 - polynomial is enough for one Newton iteration. 
*/ - v_f32_t p_01 = v_fma_f32 (C (1), m, C (0)); - v_f32_t p_23 = v_fma_f32 (C (3), m, C (2)); - v_f32_t p = v_fma_f32 (m * m, p_23, p_01); - - /* One iteration of Newton's method for iteratively approximating cbrt. */ - v_f32_t m_by_3 = m / 3; - v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p)); - - /* Assemble the result by the following: - - cbrt(x) = cbrt(m) * 2 ^ (e / 3). - - We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is - not necessarily a multiple of 3 we lose some information. - - Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. - - Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is - an integer in [-2, 2], and can be looked up in the table T. Hence the - result is assembled as: - - cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ - - v_s32_t ey = e / 3; - v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2)); - - /* Vector version of ldexpf. */ - v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my; - /* Copy sign. */ - y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y))); - - if (unlikely (v_any_u32 (special))) - return specialcase (x, y, special); - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, cbrt, -10.0, 10.0) -PL_TEST_ULP (V_NAME (cbrtf), 1.03) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf)) -PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000) -PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u7.c b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u7.c new file mode 100644 index 000000000000..74918765209f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u7.c @@ -0,0 +1,116 @@ +/* + * Single-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_advsimd_f32.h" + +const static struct data +{ + float32x4_t poly[4], one_third; + float table[5]; +} data = { + .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with + FPMinimax. */ + V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1), + V4 (0x1.2c74c2p-3) }, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = V4 (0x1.555556p-2f), +}; + +#define SignMask v_u32 (0x80000000) +#define SmallestNormal v_u32 (0x00800000) +#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */ +#define MantissaMask v_u32 (0x007fffff) +#define HalfExp v_u32 (0x3f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint16x4_t special) +{ + return v_call_f32 (cbrtf, x, y, vmovl_u16 (special)); +} + +static inline float32x4_t +shifted_lookup (const float *table, int32x4_t i) +{ + return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2], + table[i[3] + 2] }; +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. */ +VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + /* Subnormal, +/-0 and special values. 
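+     The check narrows each 32-bit difference to its top 16 bits with
+     vsubhn_u32, so all four lanes can be tested with a single 64-bit
+     comparison mask.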
*/ + uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5)); + int32x4_t e + = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126)); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly); + + float32x4_t one_third = d->one_third; + float32x4_t two_thirds = vaddq_f32 (one_third, one_third); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + float32x4_t m_by_3 = vmulq_f32 (m, one_third); + float32x4_t a + = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third); + int32x4_t ey = vcvtq_s32_f32 (ef); + int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3))); + + float32x4_t my = shifted_lookup (d->table, em3); + my = vmulq_f32 (my, a); + + /* Vector version of ldexpf. */ + float32x4_t y + = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23)); + y = vmulq_f32 (y, my); + + if (unlikely (v_any_u16h (special))) + return special_case (x, vbslq_f32 (SignMask, x, y), special); + + /* Copy sign. */ + return vbslq_f32 (SignMask, x, y); +} + +PL_SIG (V, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt)) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cexpi_3u5.c b/contrib/arm-optimized-routines/pl/math/v_cexpi_3u5.c new file mode 100644 index 000000000000..5163b15926b8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cexpi_3u5.c @@ -0,0 +1,45 @@ +/* + * Double-precision vector sincos function - return-by-value interface. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincos_common.h" +#include "v_math.h" +#include "pl_test.h" + +static float64x2x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y) +{ + return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special), + v_call_f64 (cos, x, y.val[1], special) }; +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. 
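+   The packed return value holds sin(x) in val[0] and cos(x) in val[1],
+   i.e. the imaginary and real parts of e^(ix).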
*/ +VPCS_ATTR float64x2x2_t +_ZGVnN2v_cexpi (float64x2_t x) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73) +PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73) +#define V_CEXPI_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n) +V_CEXPI_INTERVAL (0, 0x1p23, 500000) +V_CEXPI_INTERVAL (-0, -0x1p23, 500000) +V_CEXPI_INTERVAL (0x1p23, inf, 10000) +V_CEXPI_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cexpif_1u8.c b/contrib/arm-optimized-routines/pl/math/v_cexpif_1u8.c new file mode 100644 index 000000000000..4897018d3090 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cexpif_1u8.c @@ -0,0 +1,47 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "pl_test.h" + +static float32x4x2_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y) +{ + return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special), + v_call_f32 (cosf, x, y.val[1], special) }; +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +VPCS_ATTR float32x4x2_t +_ZGVnN4v_cexpif (float32x4_t x) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, sc); + return sc; +} + +PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17) +PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31) +#define V_CEXPIF_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n) +V_CEXPIF_INTERVAL (0, 0x1p20, 500000) +V_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +V_CEXPIF_INTERVAL (0x1p20, inf, 10000) +V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c index 0a9fbf817a10..649c390f4622 100644 --- a/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c +++ b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c @@ -1,96 +1,104 @@ /* * Double-precision vector cosh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" -#include "v_exp_tail.h" - -#define C1 v_f64 (C1_scal) -#define C2 v_f64 (C2_scal) -#define C3 v_f64 (C3_scal) -#define InvLn2 v_f64 (InvLn2_scal) -#define Ln2hi v_f64 (Ln2hi_scal) -#define Ln2lo v_f64 (Ln2lo_scal) -#define IndexMask v_u64 (IndexMask_scal) -#define Shift v_f64 (Shift_scal) -#define Thres v_f64 (Thres_scal) - -#define AbsMask 0x7fffffffffffffff -#define Half v_f64 (0.5) -#define SpecialBound \ - 0x4086000000000000 /* 0x1.6p9, above which exp overflows. 
*/ - -#if V_SUPPORTED - -static inline v_f64_t -exp_inline (v_f64_t x) + +static const struct data +{ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2, shift, thres; + uint64x2_t index_mask, special_bound; +} data = { + .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), + V2 (0x1.5555576a59599p-5), }, + + .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ + /* -ln2/N. */ + .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64}, + .shift = V2 (0x1.8p+52), + .thres = V2 (704.0), + + .index_mask = V2 (0xff), + /* 0x1.6p9, above which exp overflows. */ + .special_bound = V2 (0x4086000000000000), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ +static inline float64x2_t +exp_inline (float64x2_t x) { - /* Helper for approximating exp(x). Copied from v_exp_tail, with no - special-case handling or tail. */ + const struct data *d = ptr_barrier (&data); /* n = round(x/(ln2/N)). */ - v_f64_t z = v_fma_f64 (x, InvLn2, Shift); - v_u64_t u = v_as_u64_f64 (z); - v_f64_t n = z - Shift; + float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); /* r = x - n*ln2/N. */ - v_f64_t r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); + float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0); + r = vfmaq_laneq_f64 (r, n, d->ln2, 1); - v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS); - v_u64_t i = u & IndexMask; + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, d->index_mask); /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - v_f64_t y = v_fma_f64 (C3, r, C2); - y = v_fma_f64 (y, r, C1); - y = v_fma_f64 (y, r, v_f64 (1)) * r; + float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r); + y = vfmaq_f64 (d->poly[0], y, r); + y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r); /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - v_f64_t s = v_as_f64_u64 (u + e); + u = v_lookup_u64 (__v_exp_tail_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); - return v_fma_f64 (y, s, s); + return vfmaq_f64 (s, y, s); } /* Approximation for vector double-precision cosh(x) using exp_inline. cosh(x) = (exp(x) + exp(-x)) / 2. - The greatest observed error is in the scalar fall-back region, so is the same - as the scalar routine, 1.93 ULP: - __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 - want 0x1.fdf28623ef923p+1021. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. The greatest observed error in the non-special region is 1.54 ULP: - __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 - want 0x1.f711dcb0c77b1p+7. */ -VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) + _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_u64_t special = v_cond_u64 (iax > SpecialBound); + const struct data *d = ptr_barrier (&data); - /* If any inputs are special, fall back to scalar for all lanes. 
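Stepping back to the new exp_inline above: in scalar terms it performs the classic table-based reduction, writing x = n * (ln2/N) + r with N = 2^V_EXP_TAIL_TABLE_BITS (256 here, matching index_mask = 0xff), approximating exp(r) - 1 with a short polynomial, and scaling by 2^(n/N). A sketch, with exp2 standing in for the __v_exp_tail_data lookup plus exponent add, and plain Taylor coefficients in place of the tuned C1..C3:

#include <math.h>

static double
exp_sketch (double x)
{
  const double ln2 = 0x1.62e42fefa39efp-1;
  const int N = 256;
  double n = round (x * N / ln2); /* the code rounds via the 0x1.8p52 shift trick. */
  double r = x - n * ln2 / N;     /* |r| <= ln2/(2N), about 1.35e-3. */
  /* exp(r) - 1 ~ r + r^2/2 + r^3/6 + r^4/24 on this tiny interval. */
  double y = r * (1.0 + r * (0.5 + r * (1.0 / 6.0 + r / 24.0)));
  double s = exp2 (n / N);        /* table entry with the integer part of n/N
                                     folded into the exponent bits. */
  return s + y * s;               /* vfmaq_f64 (s, y, s). */
}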
*/ - if (unlikely (v_any_u64 (special))) - return v_call_f64 (cosh, x, x, v_u64 (-1)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t special + = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound); - v_f64_t ax = v_as_f64_u64 (iax); /* Up to the point that exp overflows, we can use it to calculate cosh by exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ - v_f64_t t = exp_inline (ax); - return t * Half + Half / t; + float64x2_t t = exp_inline (ax); + float64x2_t half_t = vmulq_n_f64 (t, 0.5); + float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t); + + /* Fall back to scalar for any special cases. */ + if (unlikely (v_any_u64 (special))) + return special_case (x, vaddq_f64 (half_t, half_over_t), special); + + return vaddq_f64 (half_t, half_over_t); } -VPCS_ALIAS PL_SIG (V, D, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (cosh), 1.43) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh)) -PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000) -PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000) -PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000) -PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_D1 (cosh), 1.43) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh)) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c index 1422d4d12b31..c622b0b183f1 100644 --- a/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c +++ b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c @@ -1,74 +1,80 @@ /* * Single-precision vector cosh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "v_expf_inline.h" #include "v_math.h" #include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" -#define AbsMask 0x7fffffff -#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ -#define SpecialBound \ - 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ - special case. */ -#define Half v_f32 (0.5) - -#if V_SUPPORTED +static const struct data +{ + struct v_expf_data expf_consts; + uint32x4_t tiny_bound, special_bound; +} data = { + .expf_consts = V_EXPF_DATA, + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = V4 (0x42ad496c), +}; -v_f32_t V_NAME (expf) (v_f32_t); +#if !WANT_SIMD_EXCEPT +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (coshf, x, y, special); +} +#endif /* Single-precision vector cosh, using vector expf. Maximum error is 2.38 ULP: - __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ -VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) + _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 + want 0x1.6a4922p+4. 
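Both precisions then reduce to the same identity. A scalar model, using the C library exp in place of exp_inline / v_expf_inline and ignoring the overflow bound policed by special_bound:

#include <math.h>

static double
cosh_sketch (double x)
{
  double t = exp (fabs (x)); /* cosh is even, so |x| suffices. */
  return 0.5 * t + 0.5 / t;  /* (exp(|x|) + exp(-|x|)) / 2. */
}

Computing 0.5 / t rather than calling the exponential a second time trades one division for the much larger cost of another exp evaluation, which is why both routines are structured this way.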
*/ +float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - v_f32_t ax = v_as_f32_u32 (iax); - v_u32_t special = v_cond_u32 (iax >= SpecialBound); + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered correctly, fall back to the scalar variant for all inputs if any input is a special value or above the bound - at which expf overflows. */ + at which expf overflows. */ if (unlikely (v_any_u32 (special))) return v_call_f32 (coshf, x, x, v_u32 (-1)); - v_u32_t tiny = v_cond_u32 (iax <= TinyBound); + uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound); /* If any input is tiny, avoid underflow exception by fixing tiny lanes of - input to 1, which will generate no exceptions, and then also fixing tiny - lanes of output to 1 just before return. */ + input to 0, which will generate no exceptions. */ if (unlikely (v_any_u32 (tiny))) - ax = v_sel_f32 (tiny, v_f32 (1), ax); + ax = v_zerofy_f32 (ax, tiny); #endif /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ - v_f32_t t = V_NAME (expf) (ax); - v_f32_t y = t * Half + Half / t; + float32x4_t t = v_expf_inline (ax, &d->expf_consts); + float32x4_t half_t = vmulq_n_f32 (t, 0.5); + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); #if WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (tiny))) - return v_sel_f32 (tiny, v_f32 (1), y); + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); #else if (unlikely (v_any_u32 (special))) - return v_call_f32 (coshf, x, y, special); + return special_case (x, vaddq_f32 (half_t, half_over_t), special); #endif - return y; + return vaddq_f32 (half_t, half_over_t); } -VPCS_ALIAS PL_SIG (V, F, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (coshf), 1.89) -PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100) -PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000) -PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100) -PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000) -PL_TEST_INTERVAL (V_NAME (coshf), -0x1.5a92d8p+6, -inf, 2000) -#endif +PL_TEST_ULP (V_NAME_F1 (cosh), 1.89) +PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cospi_3u1.c b/contrib/arm-optimized-routines/pl/math/v_cospi_3u1.c new file mode 100644 index 000000000000..3c2ee0b74c8e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cospi_3u1.c @@ -0,0 +1,86 @@ +/* + * Double-precision vector cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[10]; + float64x2_t range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. 
*/ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cospi, x, y, cmp); +} + +/* Approximation for vector double-precision cospi(x). + Maximum Error 3.06 ULP: + _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1 + want 0x1.fa854babfb6c1p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + float64x2_t r = vabsq_f64 (x); + uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd + to avoid them overflowing and throwing exceptions. */ + r = v_zerofy_f64 (r, cmp); + uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63); + +#else + float64x2_t r = x; + uint64x2_t cmp = vcageq_f64 (r, d->range_val); + uint64x2_t odd + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); + +#endif + + r = vsubq_f64 (r, vrndaq_f64 (r)); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r)); + + /* y = sin(r). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} + +PL_SIG (V, D, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (V_NAME_D1 (cospi), 2.56) +PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_cospif_3u2.c b/contrib/arm-optimized-routines/pl/math/v_cospif_3u2.c new file mode 100644 index 000000000000..d88aa828439d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cospif_3u2.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector cospi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "poly_advsimd_f32.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t poly[6]; + float32x4_t range_val; +} data = { + /* Taylor series coefficents for sin(pi * x). 
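The same reduction carries over to the single-precision variant that follows. As a scalar model (sin stands in for the sinpi polynomial, and the huge-|x| path policed by range_val and the WANT_SIMD_EXCEPT zeroing is omitted):

#include <math.h>

static double
cospi_sketch (double x)
{
  const double pi = 0x1.921fb54442d18p1;
  double n = round (x);          /* nearest, ties away, like vrndaq/vcvtaq. */
  double r = 0.5 - fabs (x - n); /* cospi(x) = sinpi(1/2 - |x - n|) up to sign. */
  double y = sin (pi * r);
  /* cos(pi * (n + d)) = (-1)^n * cos(pi * d): negate when x rounds to odd. */
  return ((long long) n & 1) ? -y : y;
}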
*/ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, + .range_val = V4 (0x1p31f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cospif, x, y, cmp); +} + +/* Approximation for vector single-precision cospi(x) + Maximum Error: 3.17 ULP: + _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1 + want 0x1.f7cd5p-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + float32x4_t r = vabsq_f32 (x); + uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd + to avoid them overflowing and throwing exceptions. */ + r = v_zerofy_f32 (r, cmp); + uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31); + +#else + float32x4_t r = x; + uint32x4_t cmp = vcageq_f32 (r, d->range_val); + + uint32x4_t odd + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + +#endif + + /* r = x - rint(x). */ + r = vsubq_f32 (r, vrndaq_f32 (r)); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +PL_SIG (V, F, 1, cospi, -0.9, 0.9) +PL_TEST_ULP (V_NAME_F1 (cospi), 2.67) +PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_2u.c b/contrib/arm-optimized-routines/pl/math/v_erf_2u.c deleted file mode 100644 index 1d7ddbb1ee3e..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erf_2u.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Double-precision vector erf(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "include/mathlib.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define AbsMask v_u64 (0x7fffffffffffffff) -#define AbsXMax v_f64 (0x1.8p+2) -#define Scale v_f64 (0x1p+3) - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (erf, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter tables. 
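Before the erf changes, one detail shared by both cospi variants deserves a note: the (-1)^n factor is applied without a select by converting the rounded input with ties-away rounding (vcvtaq), shifting its low bit into the sign position, and XORing it into the result. A scalar sketch of the single-precision form (hypothetical helper; llrintf follows the current rounding mode rather than ties-away, which is close enough for illustration):

#include <math.h>
#include <stdint.h>
#include <string.h>

static float
apply_odd_sign (float y, float x)
{
  uint32_t odd = (uint32_t) (int32_t) llrintf (x) << 31; /* low bit -> sign bit. */
  uint32_t u;
  memcpy (&u, &y, sizeof u); /* the vreinterpretq in the vector code. */
  u ^= odd;                  /* veorq. */
  memcpy (&y, &u, sizeof u);
  return y;
}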
*/ -struct entry -{ - v_f64_t P[V_ERF_NCOEFFS]; - v_f64_t shift; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j < V_ERF_NCOEFFS; ++j) - e.P[j] = __v_erf_data.coeffs[j][i]; - e.shift = __v_erf_data.shifts[i]; -#else - for (int j = 0; j < V_ERF_NCOEFFS; ++j) - { - e.P[j][0] = __v_erf_data.coeffs[j][i[0]]; - e.P[j][1] = __v_erf_data.coeffs[j][i[1]]; - } - e.shift[0] = __v_erf_data.shifts[i[0]]; - e.shift[1] = __v_erf_data.shifts[i[1]]; -#endif - return e; -} - -/* Optimized double precision vector error function erf. Maximum - observed error is 1.75 ULP, in [0.110, 0.111]: - verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4 - want 0x1.fe0ed62a54985p-4. */ -VPCS_ATTR -v_f64_t V_NAME (erf) (v_f64_t x) -{ - /* Handle both inf/nan as well as small values (|x|<2^-28) - If any condition in the lane is true then a loop over - scalar calls will be performed. */ - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t atop = (ix >> 48) & v_u64 (0x7fff); - v_u64_t special_case - = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30)); - - /* Get sign and absolute value. */ - v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask; - v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax); - - /* Compute index by truncating 8 * a with a=|x| saturated to 6.0. */ - -#ifdef SCALAR - v_u64_t i = v_trunc_u64 (a * Scale); -#else - v_u64_t i = vcvtq_n_u64_f64 (a, 3); -#endif - /* Get polynomial coefficients and shift parameter using lookup. */ - struct entry dat = lookup (i); - - /* Evaluate polynomial on transformed argument. */ - v_f64_t z = v_fma_f64 (a, Scale, dat.shift); - - v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]); - v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]); - v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]); - v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]); - v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]); - - v_f64_t z2 = z * z; - v_f64_t y = v_fma_f64 (z2, r5, r4); - y = v_fma_f64 (z2, y, r3); - y = v_fma_f64 (z2, y, r2); - y = v_fma_f64 (z2, y, r1); - - /* y=erf(x) if x>0, -erf(-x) otherwise. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); - - if (unlikely (v_any_u64 (special_case))) - return specialcase (x, y, special_case); - return y; -} -VPCS_ALIAS - -PL_SIG (V, D, 1, erf, -6.0, 6.0) -PL_TEST_ULP (V_NAME (erf), 1.26) -PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_2u5.c b/contrib/arm-optimized-routines/pl/math/v_erf_2u5.c new file mode 100644 index 000000000000..e581ec5bb8a7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erf_2u5.c @@ -0,0 +1,158 @@ +/* + * Double-precision vector erf(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t third; + float64x2_t tenth, two_over_five, two_over_fifteen; + float64x2_t two_over_nine, two_over_fortyfive; + float64x2_t max, shift; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound, huge_bound, scale_minus_one; +#endif +} data = { + .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. 
*/ + .two_over_fifteen = V2 (0x1.1111111111111p-3), + .tenth = V2 (-0x1.999999999999ap-4), + .two_over_five = V2 (-0x1.999999999999ap-2), + .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), + .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5), + .max = V2 (5.9921875), /* 6 - 1/128. */ + .shift = V2 (0x1p45), +#if WANT_SIMD_EXCEPT + .huge_bound = V2 (0x1p205), + .tiny_bound = V2 (0x1p-226), + .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffffffffffff + +struct entry +{ + float64x2_t erf; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])), + e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1])); + e.erf = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +/* Double-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3)) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + - 1/90 (4 r^4 - 20 r^2 + 15) d^5 + ] + + Maximum measure error: 2.29 ULP + V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 + want -0x1.20dd59132ebafp-8. */ +float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + + float64x2_t a = vabsq_f64 (x); + /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs + to return expected results. */ + uint64x2_t a_le_max = vcleq_f64 (a, dat->max); + uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max); + +#if WANT_SIMD_EXCEPT + /* |x| huge or tiny. */ + uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound); + uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound); + uint64x2_t cmp = vorrq_u64 (cmp1, cmp2); + /* If any lanes are special, mask them with 1 for small x or 8 for large + values and retain a copy of a to allow special case handler to fix special + lanes later. This is only necessary if fenv exceptions are to be triggered + correctly. */ + if (unlikely (v_any_u64 (cmp))) + { + a = vbslq_f64 (cmp1, v_f64 (8.0), a); + a = vbslq_f64 (cmp2, v_f64 (1.0), a); + } +#endif + + /* Set r to multiple of 1/128 nearest to |x|. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Lookup erf(r) and scale(r) in table, without shortcut for small values, + but with saturated indices for large values and NaNs in order to avoid + segfault. */ + uint64x2_t i + = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); + i = vbslq_u64 (a_le_max, i, v_u64 (768)); + struct entry e = lookup (i); + + float64x2_t r = vsubq_f64 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. 
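A scalar transcription of this expansion may be easier to follow; note that, as the code's p5 (and the fuller erfc comment later in this patch) shows, the d^5 term also carries a factor of r. Here erf(r) and the scale 2/sqrt(pi) * exp(-r^2) are computed directly rather than read from __erf_data.tab, and the clamp of r to 6 - 1/128 is omitted:

#include <math.h>

static double
erf_sketch (double x)
{
  const double pi = 0x1.921fb54442d18p1;
  double a = fabs (x);
  double r = round (a * 128.0) / 128.0; /* nearest multiple of 1/128. */
  double d = a - r, r2 = r * r, r4 = r2 * r2;
  double scale = 2.0 / sqrt (pi) * exp (-r2);
  double p1 = -r;
  double p2 = (2.0 * r2 - 1.0) / 3.0;
  double p3 = -r * (2.0 * r2 - 3.0) / 6.0;
  double p4 = (4.0 * r4 - 12.0 * r2 + 3.0) / 30.0;
  double p5 = -r * (4.0 * r4 - 20.0 * r2 + 15.0) / 90.0;
  double poly = 1.0 + d * (p1 + d * (p2 + d * (p3 + d * (p4 + d * p5))));
  return copysign (erf (r) + scale * d * poly, x);
}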
*/ + float64x2_t p1 = r; + float64x2_t p2 + = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); + float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen); + p4 = vfmsq_f64 (dat->tenth, r2, p4); + float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); + + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p34, d2, p5); + y = vfmaq_f64 (p12, d2, y); + + y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y)); + + /* Solves the |x| = inf and NaN cases. */ + y = vbslq_f64 (a_gt_max, v_f64 (1.0), y); + + /* Copy sign. */ + y = vbslq_f64 (v_u64 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp2))) + { + /* Neutralise huge values of x before fixing small values. */ + x = vbslq_f64 (cmp1, v_f64 (1.0), x); + /* Fix tiny values that trigger spurious underflow. */ + return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y); + } +#endif + return y; +} + +PL_SIG (V, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (V_NAME_D1 (erf), 1.79) +PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_data.c b/contrib/arm-optimized-routines/pl/math/v_erf_data.c deleted file mode 100644 index 7bbb281ad912..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erf_data.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Polynomial coefficients and shifts for double-precision erf(x) vector - * function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for - i=1,...,47 (x_0 = 2^-1022). There is an extra dummy interval for - [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1 - above 6. - - Coefficients for each interval generated using fpminimax algorithm. See - v_erf.sollya for details. Note the array is transposed, so for a set of - coefficients C generated on interval i, C[j] is at coeffs[j][i]. 
*/ - -const struct v_erf_data __v_erf_data - = {.shifts - = {-0x1p-1019, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, - -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, - -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, - -39, -40, -41, -42, -43, -44, -45, -46, -47, 0}, - .coeffs = { - // clang-format off - -{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1, - 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1, - 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1, - 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1, - 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1, - 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1, - 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1, - 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0}, - -{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4, - 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6, - 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10, - 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15, - 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22, - 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31, - 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41, - 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0}, - -{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8, - -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9, - -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12, - -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17, - -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, -0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23, - -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32, - -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42, - 
-0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0}, - -{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14, - 0x1.b6e85963275c5p-15, 0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12, - 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14, - 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19, - 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25, - 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33, - 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49, - 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0}, - -{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15, - 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18, - -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18, - -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, -0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22, - -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27, - -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35, - -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41, - -0x1.2aef07c393759p-40, -0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0}, - -{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21, - -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21, - -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22, - 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25, - 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30, - 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38, - 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40, - 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 
0x1.72eaeac065cc2p-40, 0}, - -{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23, - -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24, - 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28, - -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28, - -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33, - -0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43, - -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39, - -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0}, - -{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28, - 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31, - -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31, - -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32, - 0x1.a9515bcf95bb0p-33, 0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37, - 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40, - 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39, - 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0}, - -{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32, - 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32, - -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 0x1.6ff53581ac827p-34, - 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37, - -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40, - -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41, - -0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40, - -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0}, - -{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 
0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35, - -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37, - 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39, - -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43, - 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48, - 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43, - 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42, - 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0} - // clang-format on - }}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_1u8.c b/contrib/arm-optimized-routines/pl/math/v_erfc_1u8.c new file mode 100644 index 000000000000..10ef7e6a3c34 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfc_1u8.c @@ -0,0 +1,198 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint64x2_t offset, table_scale; + float64x2_t max, shift; + float64x2_t p20, p40, p41, p42; + float64x2_t p51, p52; + float64x2_t qr5, qr6, qr7, qr8, qr9; +#if WANT_SIMD_EXCEPT + float64x2_t uflow_bound; +#endif +} data = { + /* Set an offset so the range of the index used for lookup is 3487, and it + can be clamped using a saturated add on an offset index. + Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */ + .offset = V2 (0xbd3ffffffffff260), + .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */ + .max = V2 (0x1.b3ep+4), /* 3487/128. */ + .shift = V2 (0x1p45), + .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ + .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ + .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ + .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */ + .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ + .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */ + /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, + .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, + .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 }, + .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 }, + .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 }, +#if WANT_SIMD_EXCEPT + .uflow_bound = V2 (0x1.a8b12fc6e4892p+4), +#endif +}; + +#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */ +#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. 
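The offset / Off pair implements a clamp without a compare-and-select. In scalar terms (leaving aside the shift bits that .offset additionally folds in):

#include <stdint.h>

static uint64_t
clamped_index (uint64_t i)
{
  const uint64_t off = 0xfffffffffffff260; /* 0xffffffffffffffff - 3487. */
  uint64_t s = i + off;
  if (s < i)      /* wrapped around: saturate, as vqaddq_u64 does per lane. */
    s = UINT64_MAX;
  return s - off; /* equals i <= 3487 ? i : 3487. */
}

The vector code never subtracts the bias back out; instead the table pointer itself is pre-biased (tab - Off in lookup), so the saturated sum indexes the final entry directly.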
*/ + +struct entry +{ + float64x2_t erfc; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])), + e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1])); + e.erfc = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} +#endif + +/* Optimized double-precision vector erfc(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (erfc) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound)); + /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x), + vreinterpretq_s64_f64 (dat->uflow_bound)); + cmp = vorrq_u64 (cmp, uflow); + float64x2_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u64 (cmp))) + x = v_zerofy_f64 (x, cmp); +#endif + + float64x2_t a = vabsq_f64 (x); + a = vminq_f64 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Clamp index to a range of 3487. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). 
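The recurrence is easy to sanity-check against the stored constants. A throwaway scalar program that regenerates the Q and R values of qr5..qr9, assuming nothing beyond the relation quoted above:

#include <stdio.h>

int
main (void)
{
  /* Rearranging 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0 gives
     p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}, with
     Q_i = (i+1)/i and R_i = -2i/((i+1)(i+2)). */
  for (int i = 5; i <= 9; i++)
    printf ("qr%d = { %a, %a }\n", i, (double) (i + 1) / i,
            -2.0 * i / ((double) (i + 1) * (i + 2)));
  return 0;
}

For i = 5 this prints 0x1.3333333333333p+0 and -0x1.e79e79e79e79ep-3, matching qr5 in the data struct.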
*/ + float64x2_t r = vsubq_f64 (z, shift); + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t p1 = r; + float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); + float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42); + p4 = vfmsq_f64 (dat->p40, r2, p4); + float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0)); + p6 = vmulq_laneq_f64 (p6, dat->qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0)); + p7 = vmulq_laneq_f64 (p7, dat->qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0)); + p8 = vmulq_laneq_f64 (p8, dat->qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0)); + p9 = vmulq_laneq_f64 (p9, dat->qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0)); + p10 = vmulq_laneq_f64 (p10, dat->qr9, 1); + /* Compute polynomial in d using pairwise Horner scheme. */ + float64x2_t p90 = vfmaq_f64 (p9, d, p10); + float64x2_t p78 = vfmaq_f64 (p7, d, p8); + float64x2_t p56 = vfmaq_f64 (p5, d, p6); + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p78, d2, p90); + y = vfmaq_f64 (p56, d2, y); + y = vfmaq_f64 (p34, d2, y); + y = vfmaq_f64 (p12, d2, y); + + y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y)); + + /* Offset equals 2.0 if sign, else 0.0. */ + uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63); + float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. */ + float64x2_t fac = vreinterpretq_f64_u64 ( + vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + return special_case (xm, vfmaq_f64 (off, fac, y), cmp); +#endif + + return vfmaq_f64 (off, fac, y); +} + +PL_SIG (V, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME_D1 (erfc), 1.21) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c deleted file mode 100644 index c30635153a20..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Double-precision vector erfc(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "horner.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -/* Accurate exponential (vector variant of exp_dd). */ -v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); - -#define One v_f64 (1.0) -#define AbsMask v_u64 (0x7fffffffffffffff) -#define Scale v_f64 (0x1.0000002p27) - -/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. 
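The d-polynomial above is evaluated with a pairwise Horner scheme: adjacent terms are paired with one fma each, and the pairs are then folded in powers of d^2, roughly halving the dependency chain of a plain Horner walk. The generic scalar shape (a hypothetical helper mirroring the poly_advsimd v_pw_horner_* routines used elsewhere in this patch):

#include <math.h>

static double
pw_horner_9 (double d, const double c[10])
{
  double d2 = d * d;
  double p01 = fma (d, c[1], c[0]);
  double p23 = fma (d, c[3], c[2]);
  double p45 = fma (d, c[5], c[4]);
  double p67 = fma (d, c[7], c[6]);
  double p89 = fma (d, c[9], c[8]);
  double y = fma (d2, p89, p67);
  y = fma (d2, y, p45);
  y = fma (d2, y, p23);
  return fma (d2, y, p01); /* c0 + c1 d + ... + c9 d^9. */
}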
*/ -#define PX __v_erfc_data.poly -#define xint __v_erfc_data.interval_bounds - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (erfc, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter - tables. */ -struct entry -{ - v_f64_t P[ERFC_POLY_ORDER + 1]; - v_f64_t xi; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j <= ERFC_POLY_ORDER; ++j) - e.P[j] = PX[i][j]; - e.xi = xint[i]; -#else - for (int j = 0; j <= ERFC_POLY_ORDER; ++j) - { - e.P[j][0] = PX[i[0]][j]; - e.P[j][1] = PX[i[1]][j]; - } - e.xi[0] = xint[i[0]]; - e.xi[1] = xint[i[1]]; -#endif - return e; -} - -/* Accurate evaluation of exp(x^2) using compensated product - (x^2 ~ x*x + e2) and custom exp(y+d) routine for small - corrections d<> 63) << 62); - /* Use 12-bit for small, nan and inf case detection. */ - atop = (ix >> 52) & 0x7ff; - cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd)); - - struct entry dat; - - /* All entries of the vector are out of bounds, take a short path. - Use smallest possible number above 28 representable in 12 bits. */ - v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); - - /* Use sign to produce either 0 if x > 0, 2 otherwise. */ - if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) - return fac; - - /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ - - v_f64_t a = v_abs_f64 (x); - - /* Interval bounds are a logarithmic scale, i.e. interval n has - lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain - the interval index. */ - v_f64_t xp1 = a + v_f64 (1.0); - xp1 = xp1 * xp1; - xp1 = xp1 * xp1; - v_u64_t ixp1 = v_as_u64_f64 (xp1); - i = (ixp1 >> 52) - v_u64 (1023); - - /* Index cannot exceed number of polynomials. */ -#ifdef SCALAR - i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; -#else - i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, - i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; -#endif - /* Get coeffs of i-th polynomial. */ - dat = lookup (i); - - /* Evaluate Polynomial: P(|x|-x_i). */ - z = a - dat.xi; -#define C(i) dat.P[i] - p = HORNER_12 (z, C); - - /* Evaluate Gaussian: exp(-x^2). */ - v_f64_t e = v_eval_gauss (a); - - /* Copy sign. */ - sign = v_as_u64_f64 (x) & ~AbsMask; - p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); - - /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ - y = v_fma_f64 (p, e, fac); - - /* No need to fix value of y if x is out of bound, as - P[ERFC_NUM_INTERVALS]=0. */ - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS - -PL_SIG (V, D, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (V_NAME (erfc), 3.15) -PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_data.c b/contrib/arm-optimized-routines/pl/math/v_erfc_data.c deleted file mode 100644 index 3c47033c1170..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erfc_data.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Polynomial coefficients for double-precision erfc(x) vector function. - * - * Copyright (c) 2020-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have - the same bounds as the scalar algorithm, with the exception of the lower - bound of the first interval which is larger. This is because the vector - variants fall back to the scalar for tiny arguments, meaning that we can use - a slightly different approach which is more precise for larger inputs but - unacceptably imprecise for tiny inputs. */ - -const struct v_erfc_data __v_erfc_data = { - -/* Bounds for 20 intervals spanning [0x1.0p-28., 31.]. Interval bounds are a - logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the - exception of the first interval. */ -.interval_bounds = { - 0x1p-28, /* If xmin=2^-28, 0 otherwise. */ - 0x1.837f0518db8a9p-3, /* 0.189. */ - 0x1.a827999fcef32p-2, /* 0.414. */ - 0x1.5d13f32b5a75bp-1, /* 0.682. */ - 0x1.0p0, /* 1.000. */ - 0x1.60dfc14636e2ap0, /* 1.378. */ - 0x1.d413cccfe779ap0, /* 1.828. */ - 0x1.2e89f995ad3adp1, /* 2.364. */ - 0x1.8p1, /* 3.000. */ - 0x1.e0dfc14636e2ap1, /* 3.757. */ - 0x1.2a09e667f3bcdp2, /* 4.657. */ - 0x1.6e89f995ad3adp2, /* 5.727. */ - 0x1.cp2, /* 7.000. */ - 0x1.106fe0a31b715p3, /* 8.514. */ - 0x1.4a09e667f3bcdp3, /* 10.31. */ - 0x1.8e89f995ad3adp3, /* 12.45. */ - 0x1.ep3, /* 15.00. */ - 0x1.206fe0a31b715p4, /* 18.03. */ - 0x1.5a09e667f3bcdp4, /* 21.63. */ - 0x1.9e89f995ad3adp4, /* 25.91. */ - 0x1.fp4 /* 31.00. */ -}, - -/* Generated using fpminimax algorithm on each interval separately. The - polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval - [0;b-a], where [a;b] is the interval in which the input lies. Note this is - slightly different from the scalar polynomial, which approximates - erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details. */ -.poly = { -/* 3.725290298461914e-9 < x < 0.18920711500272103. */ -{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9}, -/* 0.18920711500272103 < x < 0.41421356237309515. */ -{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16}, -/* 0.41421356237309515 < x < 0.681792830507429. */ -{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12}, -/* 0.681792830507429 < x < 1. */ -{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15}, -/* 1 < x < 1.378414230005442. 
*/ -{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18}, -/* 1.378414230005442 < x < 1.8284271247461903. */ -{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20}, -/* 1.8284271247461903 < x < 2.363585661014858. */ -{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22}, -/* 2.363585661014858 < x < 3. */ -{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25}, -/* 3 < x < 3.756828460010884. */ -{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28}, -/* 3.756828460010884 < x < 4.656854249492381. */ -{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31}, -/* 4.656854249492381 < x < 5.727171322029716. */ -{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34}, -/* 5.727171322029716 < x < 7. */ -{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37}, -/* 7 < x < 8.513656920021768. */ -{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41}, -/* 8.513656920021768 < x < 10.313708498984761. */ -{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44}, -/* 10.313708498984761 < x < 12.454342644059432. 
*/ -{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47}, -/* 12.454342644059432 < x < 15. */ -{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51}, -/* 15 < x < 18.027313840043536. */ -{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54}, -/* 18.027313840043536 < x < 21.627416997969522. */ -{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58}, -/* 21.627416997969522 < x < 25.908685288118864. */ -{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61}, -/* 25.908685288118864 < x < 31. */ -{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64}, -/* Dummy interval for x>31 */ -{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, - 0x0p0, 0x0p0, 0x0p0} -} -}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c deleted file mode 100644 index 963490d789bd..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Single-precision vector erfc(x) function. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "erfcf.h" -#include "estrin.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)] - -VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); - -static VPCS_ATTR NOINLINE v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t special) -{ - return v_call_f32 (erfcf, x, y, special); -} - -static inline uint32_t -interval_index (uint32_t ia12) -{ - // clang-format off - return (ia12 < 0x400 ? 0 : - (ia12 < 0x408 ? 1 : - (ia12 < 0x410 ? 2 : - 3))); - // clang-format on -} - -/* The C macro wraps the coeffs argument in order to make the - poynomial evaluation more readable. In the scalarised variant the - second pointer is ignored. 
*/ -#ifdef SCALAR -#define C(i) coeff1[i] -#else -#define C(i) ((v_f64_t){coeff1[i], coeff2[i]}) -#endif - -static inline v_f64_t -v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, - const double *coeff2) -{ - v_f64_t x2 = x * x; - v_f64_t x4 = x2 * x2; - v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); - v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); - return poly * gauss; -} - -static inline float -approx_poly_gauss (float abs_x, const double *coeff) -{ - return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); -} - -static v_f32_t -v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes) -{ -#ifdef SCALAR - float y = approx_poly_gauss (abs_x, P (ia12)); - return sign ? 2 - y : y; -#else - float32x2_t lo32 = {0, 0}; - float32x2_t hi32 = {0, 0}; - /* The polynomial and Gaussian components must be calculated in - double precision in order to meet the required ULP error. This - means we have to promote low and high halves of the - single-precision input vector to two separate double-precision - input vectors. This incurs some overhead, and there is also - overhead to loading the polynomial coefficients as this cannot be - done in a vector fashion. This would be wasted effort for - elements which lie in the 'boring' zone, as they will be - overwritten later. Hence we use the lanes parameter to only do - the promotion on a pair of lanes if both of those lanes are - interesting and not special cases. If one lane is inactive, we - use a scalar routine which is shared with the scalar variant. */ - if (lanes[0] & lanes[1]) - { - lo32 = vcvt_f32_f64 ( - v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), - P (ia12[0]), P (ia12[1]))); - } - else if (lanes[0]) - { - lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); - } - else if (lanes[1]) - { - lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); - } - - if (lanes[2] & lanes[3]) - { - hi32 - = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), - P (ia12[2]), P (ia12[3]))); - } - else if (lanes[2]) - { - hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); - } - else if (lanes[3]) - { - hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); - } - - v_f32_t y = vcombine_f32 (lo32, hi32); - - if (v_any_u32 (sign)) - { - y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); - } - - return y; -#endif -} - -/* Optimized single-precision vector complementary error function - erfcf. Max measured error: 0.750092 at various values between - -0x1.06521p-20 and -0x1.add1dap-17. For example: - __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 - +0.249908 ulp err 0.250092. */ -VPCS_ATTR -v_f32_t V_NAME (erfcf) (v_f32_t x) -{ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t ia = ix & 0x7fffffff; - v_u32_t ia12 = ia >> 20; - v_u32_t sign = ix >> 31; - v_u32_t inf_ia12 = v_u32 (0x7f8); - - v_u32_t special_cases - = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); - v_u32_t in_bounds - = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); - v_f32_t boring_zone = v_as_f32_u32 (sign << 30); - -#ifdef SCALAR - if (unlikely (special_cases)) - { - if (ia12 >= 0x7f8) - return (float) (sign << 1) + 1.0f / x; /* Special cases. */ - else - return 1.0f - x; /* Small case. */ - } - else if (likely (!in_bounds)) - { - return sign ? 
boring_zone : __math_uflowf (boring_zone); - } -#endif - - v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, - in_bounds & ~special_cases); - -#ifndef SCALAR - y = vbslq_f32 (~in_bounds, boring_zone, y); - - if (unlikely (v_any_u32 (special_cases))) - { - return specialcase (x, y, special_cases); - } -#endif - - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (V_NAME (erfcf), 0.26) -PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u7.c b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u7.c new file mode 100644 index 000000000000..c361d0704438 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u7.c @@ -0,0 +1,166 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + uint32x4_t offset, table_scale; + float32x4_t max, shift; + float32x4_t coeffs, third, two_over_five, tenth; +#if WANT_SIMD_EXCEPT + float32x4_t uflow_bound; +#endif + +} data = { + /* Set an offset so the range of the index used for lookup is 644, and it can + be clamped using a saturated add. */ + .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */ + .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */ + .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */ + .shift = V4 (0x1p17f), + /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and + fmas. */ + .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .third = V4 (0x1.555556p-2f), + .two_over_five = V4 (-0x1.99999ap-2f), + .tenth = V4 (-0x1.99999ap-4f), +#if WANT_SIMD_EXCEPT + .uflow_bound = V4 (0x1.2639cp+3f), +#endif +}; + +#define TinyBound 0x41000000 /* 0x1p-62f << 1. */ +#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */ +#define Off 0xfffffd7b /* 0xffffffff - 644. */ + +struct entry +{ + float32x4_t erfc; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0])); + float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1])); + float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2])); + float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3])); + float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); + float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + e.erfc = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + return v_call_f32 (erfcf, x, y, cmp); +} +#endif + +/* Optimized single-precision vector erfcf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). 
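
The offset constant above folds the scaling by 64 and the clamp to an index range of 644 into one saturating add. A minimal host-side sketch of the same index arithmetic (a hypothetical standalone program, not part of the patch; asuint is a local helper):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
asuint (float x)
{
  uint32_t u;
  memcpy (&u, &x, 4);
  return u;
}

int main (void)
{
  const float shift = 0x1p17f;        /* ulp (shift) = 1/64. */
  const uint32_t offset = 0xb7fffd7b; /* 0xffffffff - asuint (shift) - 644. */
  const uint32_t off = 0xfffffd7b;    /* 0xffffffff - 644. */
  float xs[] = { 0.0f, 0.26f, 5.5f, 10.0625f };
  for (int k = 0; k < 4; k++)
    {
      /* round (64 * x) lands in the low mantissa bits of z. */
      float z = xs[k] + shift;
      /* The kernel uses vqaddq_u32; a plain add suffices here because the
         inputs are already clamped to max = 10.0625. */
      uint32_t i = asuint (z) + offset;
      printf ("x = %g -> table index %u\n", xs[k], i - off);
    }
  return 0;
}

This prints indices 0, 17, 352 and 644.
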
For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +VPCS_ATTR +float32x4_t V_NAME_F1 (erfc) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound)); + /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x), + vreinterpretq_s32_f32 (dat->uflow_bound)); + cmp = vorrq_u32 (cmp, uflow); + float32x4_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + float32x4_t a = vabsq_f32 (x); + a = vminq_f32 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + /* Clamp index to a range of 644. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + float32x4_t r = vsubq_f32 (z, shift); + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t p1 = r; + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1); + float32x4_t p3 + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2); + p4 = vfmsq_f32 (dat->tenth, r2, p4); + + float32x4_t y = vfmaq_f32 (p3, d, p4); + y = vfmaq_f32 (p2, d, y); + y = vfmaq_f32 (p1, d, y); + y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y)); + + /* Offset equals 2.0f if sign, else 0.0f. */ + uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31); + float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. 
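
The expansion above can be restated in scalar form, which also makes the sign conventions of the fused operations visible. A sketch (hypothetical helper, double precision throughout), assuming erfc_r and scale were already fetched from the table for r = round(64 * |x|) / 64:

static double
erfcf_kernel (double a, double erfc_r, double scale, double r)
{
  double d = a - r, d2 = d * d, r2 = r * r;
  /* Coefficients of d^0..d^3 in (poly (r, d) - 1) / (-d). */
  double p1 = r;
  double p2 = 1.0 / 3 - r2 * (2.0 / 3);
  double p3 = r * (r2 / 3 - 0.5);
  double p4 = -0.1 + r2 * (0.4 - r2 * (2.0 / 15));
  double y = p1 + d * (p2 + d * (p3 + d * p4));
  /* erfc(x) ~ erfc(r) - scale * d * poly (r, d), with poly = 1 - d * y. */
  return erfc_r - scale * (d - d2 * y);
}
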
*/ + float32x4_t fac = vreinterpretq_f32_u32 ( + vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (xm, vfmaq_f32 (off, fac, y), cmp); +#endif + + return vfmaq_f32 (off, fac, y); +} + +PL_SIG (V, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (V_NAME_F1 (erfc), 1.14) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) +PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c deleted file mode 100644 index 3a25cc8751d1..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Single-precision vector erf(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "include/mathlib.h" -#include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#if V_SUPPORTED - -VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t); - -#define AbsMask v_u32 (0x7fffffff) - -/* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - return v_call_f32 (erff, x, y, cmp); -} - -/* A structure to perform look-up in coeffs and other parameter tables. */ -struct entry -{ - v_f32_t P[V_ERFF_NCOEFFS]; -}; - -static inline struct entry -lookup (v_u32_t i) -{ - struct entry e; -#ifdef SCALAR - for (int j = 0; j < V_ERFF_NCOEFFS; ++j) - e.P[j] = __v_erff_data.coeffs[j][i]; -#else - for (int j = 0; j < V_ERFF_NCOEFFS; ++j) - { - e.P[j][0] = __v_erff_data.coeffs[j][i[0]]; - e.P[j][1] = __v_erff_data.coeffs[j][i[1]]; - e.P[j][2] = __v_erff_data.coeffs[j][i[2]]; - e.P[j][3] = __v_erff_data.coeffs[j][i[3]]; - } -#endif - return e; -} - -/* Optimized single precision vector error function erf. - Maximum measured at +/- 0.931, 1.25ULP: - v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1 - want -0x1.9f9c8ap-1. */ -VPCS_ATTR -v_f32_t V_NAME (erff) (v_f32_t x) -{ - /* Handle both inf/nan as well as small values (|x|<2^-28). If any condition - in the lane is true then a loop over scalar calls will be performed. */ - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t atop = (ix >> 16) & v_u32 (0x7fff); - v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180)); - - /* Get sign and absolute value. */ - v_u32_t sign = ix & ~AbsMask; - /* |x| < 0.921875. */ - v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f)); - /* |x| > 4.0. */ - v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f)); - /* Avoid dependency in abs(x) in division (and comparison). */ - v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1)); - - /* Get polynomial coefficients. */ - struct entry dat = lookup (i); - - v_f32_t a = v_abs_f32 (x); - v_f32_t z = v_sel_f32 (red, x * x, a); - - /* Evaluate Polynomial of |x| or x^2. */ - v_f32_t r = dat.P[6]; - r = v_fma_f32 (z, r, dat.P[5]); - r = v_fma_f32 (z, r, dat.P[4]); - r = v_fma_f32 (z, r, dat.P[3]); - r = v_fma_f32 (z, r, dat.P[2]); - r = v_fma_f32 (z, r, dat.P[1]); - r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0])); - r = v_fma_f32 (a, r, a); - - /* y = |x| + |x|*P(|x|) if |x| < 0.921875 - 1 - exp (-(|x|+|x|*P(x^2))) otherwise. 
*/ - v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r)); - - /* Boring domain (absolute value is required to get the sign of erf(-nan) - right). */ - y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y)); - - /* y=erf(x) if x>0, -erf(-x) otherwise. */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS - -PL_SIG (V, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (V_NAME (erff), 0.76) -PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000) -PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_2u.c b/contrib/arm-optimized-routines/pl/math/v_erff_2u.c new file mode 100644 index 000000000000..502526407df2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erff_2u.c @@ -0,0 +1,118 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float32x4_t max, shift, third; +#if WANT_SIMD_EXCEPT + float32x4_t tiny_bound, scale_minus_one; +#endif +} data = { + .max = V4 (3.9375), /* 4 - 8/128. */ + .shift = V4 (0x1p16f), + .third = V4 (0x1.555556p-2f), /* 1/3. */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x1p-62f), + .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffff + +struct entry +{ + float32x4_t erf; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float64_t t0 = *((float64_t *) (__erff_data.tab + i[0])); + float64_t t1 = *((float64_t *) (__erff_data.tab + i[1])); + float64_t t2 = *((float64_t *) (__erff_data.tab + i[2])); + float64_t t3 = *((float64_t *) (__erff_data.tab + i[3])); + float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); + float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + e.erf = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +/* Single-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error: 1.93 ULP + _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 + want 0x1.fd6868p-9. */ +float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. */ + uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + float32x4_t a = vabsq_f32 (x); + uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max); + + /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. 
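
The same lookup-plus-correction scheme as erfcf applies here, with a shorter polynomial. A scalar sketch of this step (hypothetical helper; erf_r and scale looked up for r = round(128 * |x|) / 128):

static float
erff_kernel (float a, float erf_r, float scale, float r)
{
  float d = a - r;
  /* erf(x) ~ erf(r) + scale * d * (1 - r * d - d * d / 3). */
  return erf_r + scale * (d - d * d * (r + d / 3.0f));
}
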
*/ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + uint32x4_t i + = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift)); + i = vminq_u32 (i, v_u32 (512)); + struct entry e = lookup (i); + + float32x4_t r = vsubq_f32 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t y = vfmaq_f32 (r, dat->third, d); + y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y)); + + /* Solves the |x| = inf case. */ + y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y); + + /* Copy sign. */ + y = vbslq_f32 (v_u32 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y); +#endif + return y; +} + +PL_SIG (V, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (V_NAME_F1 (erf), 1.43) +PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_data.c b/contrib/arm-optimized-routines/pl/math/v_erff_data.c deleted file mode 100644 index 73ccb5cbcfa8..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_erff_data.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Data for approximation of vector erff. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Minimax approximation of erff. */ -const struct v_erff_data __v_erff_data - = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f}, - {0x1.06eba6p-03f, 0x1.450aa0p-1}, - {-0x1.8126e0p-02f, 0x1.b55cb0p-4f}, - {0x1.ce1a46p-04f, -0x1.8d6300p-6f}, - {-0x1.b68bd2p-06f, 0x1.fd1336p-9f}, - {0x1.473f48p-08f, -0x1.91d2ccp-12f}, - {-0x1.3a1a82p-11f, 0x1.222900p-16f}}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c b/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c new file mode 100644 index 000000000000..654a7336e85b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c @@ -0,0 +1,161 @@ +/* + * Double-precision inverse error function (AdvSIMD variant). + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "pl_test.h" +#include "mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "poly_advsimd_f64.h" +#define V_LOG_INLINE_POLY_ORDER 4 +#include "v_log_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. P is interleaved P_17 and P_37, similar for Q. P17 + and Q17 are provided as homogenous vectors as well for when the shortcut + can be taken. 
*/
+  double P[8][2], Q[7][2];
+  float64x2_t tailshift;
+  uint8x16_t idx;
+  struct v_log_inline_data log_tbl;
+  float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
+} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
+                  { -0x1.6b23cc5c6c6d7p+6, 0x1.60b8fe375999ep-2 },
+                  { 0x1.74e5f6ceb3548p+7, -0x1.779bb9bef7c0fp+1 },
+                  { -0x1.5200bb15cc6bbp+7, 0x1.786ea384470a2p+3 },
+                  { 0x1.05d193233a849p+6, -0x1.6a7c1453c85d3p+4 },
+                  { -0x1.148c5474ee5e1p+3, 0x1.31f0fc5613142p+4 },
+                  { 0x1.689181bbafd0cp-3, -0x1.5ea6c007d4dbbp+2 },
+                  { 0, 0x1.e66f265ce9e5p-3 } },
+           .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
+                  { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
+                  { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
+                  { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
+                  { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
+                  { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
+                  { 0x1p+0, -0x1.4075c56404eecp+3 } },
+           .P_57 = { V2 (0x1.b874f9516f7f1p-14), V2 (0x1.5921f2916c1c4p-7),
+                     V2 (0x1.145ae7d5b8fa4p-2), V2 (0x1.29d6dcc3b2fb7p+1),
+                     V2 (0x1.cabe2209a7985p+2), V2 (0x1.11859f0745c4p+3),
+                     V2 (0x1.b7ec7bc6a2ce5p+2), V2 (0x1.d0419e0bb42aep+1),
+                     V2 (0x1.c5aa03eef7258p-1) },
+           .Q_57 = { V2 (0x1.b8747e12691f1p-14), V2 (0x1.59240d8ed1e0ap-7),
+                     V2 (0x1.14aef2b181e2p-2), V2 (0x1.2cd181bcea52p+1),
+                     V2 (0x1.e6e63e0b7aa4cp+2), V2 (0x1.65cf8da94aa3ap+3),
+                     V2 (0x1.7e5c787b10a36p+3), V2 (0x1.0626d68b6cea3p+3),
+                     V2 (0x1.065c5f193abf6p+2), V2 (0x1p+0) },
+           .P_17 = { V2 (0x1.007ce8f01b2e8p+4), V2 (-0x1.6b23cc5c6c6d7p+6),
+                     V2 (0x1.74e5f6ceb3548p+7), V2 (-0x1.5200bb15cc6bbp+7),
+                     V2 (0x1.05d193233a849p+6), V2 (-0x1.148c5474ee5e1p+3),
+                     V2 (0x1.689181bbafd0cp-3) },
+           .Q_17 = { V2 (0x1.d8fb0f913bd7bp+3), V2 (-0x1.6d7f25a3f1c24p+6),
+                     V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
+                     V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
+           .tailshift = V2 (-0.87890625),
+           .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
+           .log_tbl = V_LOG_CONSTANTS };
+
+static inline float64x2_t
+special (float64x2_t x, const struct data *d)
+{
+  /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
+     By using log here, instead of log1p, we return finite values for both
+     these inputs, and values outside [-1, 1]. This is non-compliant, but is
+     an acceptable optimisation at Ofast. To get correct behaviour for all
+     finite values use the log1p_inline helper on -abs(x) - note that
+     erfinv(inf) will still be finite. */
+  float64x2_t t = vnegq_f64 (
+      v_log_inline (vsubq_f64 (v_f64 (1), vabsq_f64 (x)), &d->log_tbl));
+  t = vdivq_f64 (v_f64 (1), vsqrtq_f64 (t));
+  float64x2_t ts = vbslq_f64 (v_u64 (0x7fffffffffffffff), t, x);
+  return vdivq_f64 (v_horner_8_f64 (t, d->P_57),
+                    vmulq_f64 (ts, v_horner_9_f64 (t, d->Q_57)));
+}
+
+static inline float64x2_t
+lookup (const double *c, uint8x16_t idx)
+{
+  float64x2_t x = vld1q_f64 (c);
+  return vreinterpretq_f64_u8 (vqtbl1q_u8 (vreinterpretq_u8_f64 (x), idx));
+}
+
+static inline float64x2_t VPCS_ATTR
+notails (float64x2_t x, const struct data *d)
+{
+  /* Shortcut when no input is in a tail region - no need to gather shift or
+     coefficients. */
+  float64x2_t t = vfmaq_f64 (v_f64 (-0.5625), x, x);
+  float64x2_t p = vmulq_f64 (v_horner_6_f64 (t, d->P_17), x);
+  float64x2_t q = vaddq_f64 (d->Q_17[5], t);
+  for (int i = 4; i >= 0; i--)
+    q = vfmaq_f64 (d->Q_17[i], q, t);
+  return vdivq_f64 (p, q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+   error function in double precision.
Largest observed error is 24.75 ULP: + _ZGVnN2v_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0 + want 0x1.ea0547268660cp+0. */ +float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + /* Calculate inverse error using algorithm described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. + + Algorithm has 3 intervals: + - 'Normal' region [-0.75, 0.75] + - Tail region [0.75, 0.9375] U [-0.9375, -0.75] + - Extreme tail [-1, -0.9375] U [0.9375, 1] + Normal and tail are both rational approximation of similar order on + shifted input - these are typically performed in parallel using gather + loads to obtain correct coefficients depending on interval. */ + uint64x2_t is_tail = vcagtq_f64 (x, v_f64 (0.75)); + + if (unlikely (!v_any_u64 (is_tail))) + /* If input is normally distributed in [-1, 1] then likelihood of this is + 0.75^2 ~= 0.56. */ + return notails (x, d); + + uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375)); + + uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8)); + uint8x16_t idx = vaddq_u8 (d->idx, off); + + float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625)); + t = vfmaq_f64 (t, x, x); + + float64x2_t p = lookup (&d->P[7][0], idx); + /* Last coeff of q is either 0 or 1 - use mask instead of load. */ + float64x2_t q = vreinterpretq_f64_u64 ( + vandq_u64 (is_tail, vreinterpretq_u64_f64 (v_f64 (1)))); + for (int i = 6; i >= 0; i--) + { + p = vfmaq_f64 (lookup (&d->P[i][0], idx), p, t); + q = vfmaq_f64 (lookup (&d->Q[i][0], idx), q, t); + } + p = vmulq_f64 (p, x); + + if (unlikely (v_any_u64 (extreme_tail))) + return vbslq_f64 (extreme_tail, special (x, d), vdivq_f64 (p, q)); + + return vdivq_f64 (p, q); +} + +PL_SIG (V, D, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8) +/* Test with control lane in each interval. */ +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, + 0.5) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, + 0.8) +PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, + 0.95) diff --git a/contrib/arm-optimized-routines/pl/math/v_erfinvf_5u.c b/contrib/arm-optimized-routines/pl/math/v_erfinvf_5u.c new file mode 100644 index 000000000000..5a6800b86ae9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfinvf_5u.c @@ -0,0 +1,163 @@ +/* + * Single-precision inverse error function (AdvSIMD variant). + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "poly_advsimd_f32.h" +#include "v_logf_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. Coefficients are stored in various interleaved + formats to allow for table-based (vector-to-vector) lookup. + + Plo is first two coefficients of P_10 and P_29 interleaved. + PQ is third coeff of P_10 and first of Q_29 interleaved. + Qhi is second and third coeffs of Q_29 interleaved. + P29_3 is a homogenous vector with fourth coeff of P_29. + + P_10 and Q_10 are also stored in homogenous vectors to allow better + memory access when no lanes are in a tail region. 
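
Both erfinv variants share this structure; on the central interval the whole computation collapses to one fixed rational function of the shifted square. A scalar sketch (hypothetical helper) of the no-tails path, taking the P_17/Q_17 arrays above and treating Q as monic via its implicit leading term:

static double
erfinv_notails (double x, const double p[7], const double q[6])
{
  double t = x * x - 0.5625; /* |x| <= 0.75, so t lies in [-0.5625, 0]. */
  double num = p[6];
  for (int i = 5; i >= 0; i--)
    num = num * t + p[i];
  double den = q[5] + t; /* monic: leading coefficient is 1. */
  for (int i = 4; i >= 0; i--)
    den = den * t + q[i];
  /* erfinv(x) ~ x * P(t) / Q(t). */
  return x * num / den;
}
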
*/
+  float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
+  float32x4_t P_50[6], Q_50[2];
+  float32x4_t P_10[3], Q_10[3];
+  uint8x16_t idxhi, idxlo;
+  struct v_logf_data logf_tbl;
+} data = {
+  .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 },
+  .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 },
+  .P29_3 = V4 (0x1.b13626p-2),
+  .tailshift = V4 (-0.87890625),
+  .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
+  .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 },
+  .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 },
+  .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1),
+            V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) },
+  .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) },
+  .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) },
+  .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) },
+  .logf_tbl = V_LOGF_CONSTANTS
+};
+
+static inline float32x4_t
+special (float32x4_t x, const struct data *d)
+{
+  /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf.
+     By using log here, instead of log1p, we return finite values for both
+     these inputs, and values outside [-1, 1]. This is non-compliant, but is
+     an acceptable optimisation at Ofast. To get correct behaviour for all
+     finite values use the log1pf_inline helper on -abs(x) - note that
+     erfinvf(inf) will still be finite. */
+  float32x4_t t = vdivq_f32 (
+      v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline (
+                     vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl))));
+  float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x);
+  float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t);
+  return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q));
+}
+
+static inline float32x4_t
+notails (float32x4_t x, const struct data *d)
+{
+  /* Shortcut when no input is in a tail region - no need to gather shift or
+     coefficients. */
+  float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x);
+  float32x4_t q = vaddq_f32 (t, d->Q_10[2]);
+  q = vfmaq_f32 (d->Q_10[1], t, q);
+  q = vfmaq_f32 (d->Q_10[0], t, q);
+
+  return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q);
+}
+
+static inline float32x4_t
+lookup (float32x4_t tbl, uint8x16_t idx)
+{
+  return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx));
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+   error function in single-precision. Worst-case error is 4.98 ULP, in the
+   tail region:
+   _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
+				   want 0x1.b4793ap+0. */
+float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Calculate inverse error using algorithm described in
+     J. M. Blair, C. A. Edwards, and J. H. Johnson,
+     "Rational Chebyshev approximations for the inverse of the error
+     function", Math. Comp. 30, pp. 827--830 (1976).
+     https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+
+     Algorithm has 3 intervals:
+     - 'Normal' region [-0.75, 0.75]
+     - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+     - Extreme tail [-1, -0.9375] U [0.9375, 1]
+     Normal and tail are both rational approximation of similar order on
+     shifted input - these are typically performed in parallel using gather
+     loads to obtain correct coefficients depending on interval.
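
The byte-wise tbl selection performed by lookup above is easiest to see in isolation. A minimal sketch (hypothetical function, not from the patch): with base indices {0, 1, 2, 3} repeated, every output lane copies element 0 of an interleaved pair, and adding 4 on masked lanes moves them to element 1:

#include <arm_neon.h>

static float32x4_t
select_pair (float32x4_t pair, uint32x4_t take_second)
{
  const uint8x16_t base = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
  /* take_second lanes are all-ones or all-zero, so masking with 4 offsets
     the byte indices of selected lanes by one 32-bit element. */
  uint8x16_t off
      = vandq_u8 (vreinterpretq_u8_u32 (take_second), vdupq_n_u8 (4));
  uint8x16_t idx = vaddq_u8 (base, off);
  return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (pair), idx));
}
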
*/ + uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75)); + uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375)); + + if (unlikely (!v_any_u32 (is_tail))) + /* Shortcut for if all lanes are in [-0.75, 0.75] - can avoid having to + gather coefficients. If input is uniform in [-1, 1] then likelihood of + this is 0.75^4 ~= 0.31. */ + return notails (x, d); + + /* Select requisite shift depending on interval: polynomial is evaluated on + x * x - shift. + Normal shift = 0.5625 + Tail shift = 0.87890625. */ + float32x4_t t + = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x); + + /* Calculate indexes for tbl: tbl is byte-wise, so: + [0, 1, 2, 3, 4, 5, 6, ....] copies the vector + Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores + two pairs of coeffs, so we need two idx vectors - one for each pair. */ + uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4)); + uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off); + uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off); + + /* Load the tables. */ + float32x4_t p_lo = d->Plo; + float32x4_t pq = d->PQ; + float32x4_t qhi = d->Qhi; + + /* Do the lookup (and calculate p3 by masking non-tail lanes). */ + float32x4_t p3 = vreinterpretq_f32_u32 ( + vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3))); + float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi), + p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi), + q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi); + + float32x4_t p = vfmaq_f32 (p2, p3, t); + p = vfmaq_f32 (p1, p, t); + p = vfmaq_f32 (p0, p, t); + p = vmulq_f32 (x, p); + + float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t); + q = vfmaq_f32 (q0, q, t); + + if (unlikely (v_any_u32 (extreme_tail))) + /* At least one lane is in the extreme tail - if input is uniform in + [-1, 1] the likelihood of this is ~0.23. */ + return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q)); + + return vdivq_f32 (p, q); +} + +PL_SIG (V, F, 1, erfinv, -0.99, 0.99) +PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49) +/* Test with control lane in each interval. */ +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8) +PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp10_2u.c b/contrib/arm-optimized-routines/pl/math/v_exp10_2u.c new file mode 100644 index 000000000000..29072a60fb3a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp10_2u.c @@ -0,0 +1,144 @@ +/* + * Double-precision vector 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */ +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 163840.0 /* 1280.0 * N. */ + +const static struct data +{ + float64x2_t poly[4]; + float64x2_t log10_2, log2_10_hi, log2_10_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* Coefficients generated using Remez algorithm. + rel error: 0x1.5ddf8f28p-54 + abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ] + maxerr: 1.14432 +0.5 ulp. 
*/
+  .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1),
+            V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) },
+  .log10_2 = V2 (0x1.a934f0979a371p8),	    /* N/log10(2). */
+  .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log10(2)/N. */
+  .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66),
+  .shift = V2 (0x1.8p+52),
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V2 (ScaleBound),
+  .special_bound = V2 (SpecialBound),
+#endif
+};
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask v_u64 (N - 1)
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4070000000000000)  /* asuint64 (0x1p8). */
+# define Thres v_u64 (0x2070000000000000)     /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes. */
+  return v_call_f64 (exp10, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+              const struct data *d)
+{
+  /* 2^(n/N) may overflow, break it up into s1*s2. */
+  uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (
+      vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+  uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
+  float64x2_t r1 = vmulq_f64 (s1, s1);
+  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+  return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp10.
+   Maximum measured error is 1.64 ulp.
+   _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
+					want 0x1.f8dab6d7fed0ap+5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special_case to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly. */
+  float64x2_t xm = x;
+  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
+  if (unlikely (v_any_u64 (cmp)))
+    x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+  cmp = vcageq_f64 (x, d->special_bound);
+#endif
+
+  /* n = round(x/(log10(2)/N)). */
+  float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
+  uint64x2_t u = vreinterpretq_u64_f64 (z);
+  float64x2_t n = vsubq_f64 (z, d->shift);
+
+  /* r = x - n*log10(2)/N. */
+  float64x2_t r = x;
+  r = vfmsq_f64 (r, d->log2_10_hi, n);
+  r = vfmsq_f64 (r, d->log2_10_lo, n);
+
+  uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+  uint64x2_t i = vandq_u64 (u, IndexMask);
+
+  /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]);
+  float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]);
+  p = vfmaq_f64 (p, y, r2);
+  y = vmulq_f64 (r, p);
+
+  /* s = 2^(n/N). */
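
The bias constants make this split exact: SpecialBias1 - SpecialBias2 = asuint(1.0), so s1 * s2 always reconstructs s while both factors stay representable. A host-side spot check for the n >= 0 branch, where b = 0 (a sketch, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
asdouble (uint64_t u)
{
  double d;
  memcpy (&d, &u, 8);
  return d;
}

static uint64_t
asuint64 (double d)
{
  uint64_t u;
  memcpy (&u, &d, 8);
  return u;
}

int main (void)
{
  double s = 0x1.6p1010; /* scale close to the overflow threshold. */
  double s1 = asdouble (0x7000000000000000ULL); /* 0x1p769. */
  double s2 = asdouble (asuint64 (s) - 0x3010000000000000ULL); /* s * 0x1p-769. */
  printf ("%a * %a = %a (s = %a)\n", s1, s2, s1 * s2, s);
  return 0;
}

The product is exact because only exponent bits move; no mantissa bits are lost.
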
+  u = v_lookup_u64 (__v_exp_data, i);
+  float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+    return special_case (s, y, n, d);
+#endif
+
+  return vfmaq_f64 (s, y, s);
+}
+
+PL_SIG (S, D, 1, exp10, -9.9, 9.9)
+PL_SIG (V, D, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (V_NAME_D1 (exp10), 1.15)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_exp10f_2u4.c b/contrib/arm-optimized-routines/pl/math/v_exp10f_2u4.c
new file mode 100644
index 000000000000..0e91becfa612
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_exp10f_2u4.c
@@ -0,0 +1,138 @@
+/*
+ * Single-precision vector 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_advsimd_f32.h"
+
+#define ScaleBound 192.0f
+
+static const struct data
+{
+  float32x4_t poly[5];
+  float32x4_t log10_2_and_inv, shift;
+
+#if !WANT_SIMD_EXCEPT
+  float32x4_t scale_thresh;
+#endif
+} data = {
+  /* Coefficients generated using Remez algorithm with minimisation of relative
+     error.
+     rel error: 0x1.89dafa3p-24
+     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+     maxerr: 1.85943 +0.5 ulp. */
+  .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
+            V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
+  .shift = V4 (0x1.8p23f),
+
+  /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
+  .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V4 (ScaleBound)
+#endif
+};
+
+#define ExponentBias v_u32 (0x3f800000)
+
+#if WANT_SIMD_EXCEPT
+
+# define SpecialBound 38.0f	      /* rint(log10(2^127)). */
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42180000)  /* asuint (SpecialBound). */
+# define Thres v_u32 (0x22180000)     /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes. */
+  return v_call_f32 (exp10f, x, y, cmp);
+}
+
+#else
+
+# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+              float32x4_t scale, const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2. */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+  float32x4_t r2 = vmulq_f32 (s1, s1);
+  float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+  /* Similar to r1 but avoids double rounding in the subnormal range.
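
The 0x1.8p23f shift used by the kernel below rounds to the nearest integer and leaves n in the low mantissa bits, so the exponent field of 2^n falls out of a plain left shift. A host-side sketch of the trick (not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  float x = 4.3f;
  float shift = 0x1.8p23f; /* 1.5 * 2^23: ulp here is exactly 1. */
  float z = x + shift;
  float n = z - shift; /* round-to-nearest of x. */
  uint32_t u;
  memcpy (&u, &z, 4);
  uint32_t e = u << 23; /* n moved into the exponent field. */
  uint32_t sbits = e + 0x3f800000; /* add ExponentBias. */
  float scale;
  memcpy (&scale, &sbits, 4);
  printf ("n = %g, scale = 2^n = %g\n", n, scale); /* n = 4, scale = 16. */
  return 0;
}
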
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +/* Fast vector implementation of single-precision exp10. + Algorithm is accurate to 2.36 ULP. + _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11 + want 0x1.7e79cp+11. */ +float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t cmp = vcgeq_u32 ( + vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ + float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0); + float32x4_t n = vsubq_f32 (z, d->shift); + float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1); + r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + +#if !WANT_SIMD_EXCEPT + uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); +#endif + + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t poly + = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), + v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} + +PL_SIG (S, F, 1, exp10, -9.9, 9.9) +PL_SIG (V, F, 1, exp10, -9.9, 9.9) +PL_TEST_ULP (V_NAME_F1 (exp10), 1.86) +PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp2_2u.c b/contrib/arm-optimized-routines/pl/math/v_exp2_2u.c new file mode 100644 index 000000000000..de59779689f5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp2_2u.c @@ -0,0 +1,128 @@ +/* + * Double-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define BigBound 1022.0 +#define UOFlowBound 1280.0 + +static const struct data +{ + float64x2_t poly[4]; + float64x2_t shift, scale_big_bound, scale_uoflow_bound; +} data = { + /* Coefficients are computed using Remez algorithm with + minimisation of the absolute error. 
*/
+  .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3),
+            V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) },
+  .shift = V2 (0x1.8p52 / N),
+  .scale_big_bound = V2 (BigBound),
+  .scale_uoflow_bound = V2 (UOFlowBound),
+};
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+  return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+                       __v_exp_data[i[1] & IndexMask] };
+}
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
+# define Thres 0x2080000000000000     /* asuint64(512.0) - TinyBound. */
+
+/* Call scalar exp2 as a fallback. */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
+{
+  return v_call_f64 (exp2, x, y, is_special);
+}
+
+#else
+
+# define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+              const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2. */
+  uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
+  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
+      vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
+  uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
+  float64x2_t r1 = vmulq_f64 (s1, s1);
+  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
+  return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp2.
+   Maximum measured error is 1.65 ulp.
+   _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
+				       want 0x1.f8db0d4df721dp-1. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+  uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
+  /* Mask special lanes and retain a copy of x for passing to special-case
+     handler. */
+  float64x2_t xc = x;
+  x = v_zerofy_f64 (x, cmp);
+#else
+  cmp = vcagtq_f64 (x, d->scale_big_bound);
+#endif
+
+  /* n = round(x*N)/N, i.e. x rounded to the nearest multiple of 1/N. */
+  float64x2_t z = vaddq_f64 (d->shift, x);
+  uint64x2_t u = vreinterpretq_u64_f64 (z);
+  float64x2_t n = vsubq_f64 (z, d->shift);
+
+  /* r = x - n, in [-1/(2N), 1/(2N)]. */
+  float64x2_t r = vsubq_f64 (x, n);
+
+  /* s = 2^n. */
+  uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+  u = lookup_sbits (u);
+  float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  /* y ~ exp2(r) - 1.
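
v_pairwise_poly_3_f64 evaluates the two coefficient pairs independently, so the FMA chains can execute in parallel rather than serially as in plain Horner form. A scalar sketch of the same scheme (hypothetical helper):

static double
pairwise_poly_3 (double r, double r2, const double c[4])
{
  double p01 = c[0] + c[1] * r; /* these two lines are independent */
  double p23 = c[2] + c[3] * r; /* and can issue back to back. */
  return p01 + p23 * r2;
}
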
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly); + y = vmulq_f64 (r, y); + + if (unlikely (v_any_u64 (cmp))) +#if !WANT_SIMD_EXCEPT + return special_case (s, y, n, d); +#else + return special_case (xc, vfmaq_f64 (s, s, y), cmp); +#endif + return vfmaq_f64 (s, s, y); +} + +PL_SIG (V, D, 1, exp2, -9.9, 9.9) +PL_TEST_ULP (V_NAME_D1 (exp2), 1.15) +PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_data.c b/contrib/arm-optimized-routines/pl/math/v_exp_data.c new file mode 100644 index 000000000000..fd01cf27606f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_data.c @@ -0,0 +1,55 @@ +/* + * Scale values for vector exp and exp2 + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 2^(j/N), j=0..N, N=2^7=128. Copied from math/v_exp_data.c. */ +const uint64_t __v_exp_data[] = { + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 
0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail.c deleted file mode 100644 index fd38aa8ae6ea..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_exp_tail.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Double-precision vector e^(x+tail) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "math_config.h" -#if V_SUPPORTED -#include "v_exp_tail.h" - -#define C1 v_f64 (C1_scal) -#define C2 v_f64 (C2_scal) -#define C3 v_f64 (C3_scal) -#define InvLn2 v_f64 (InvLn2_scal) -#define Ln2hi v_f64 (Ln2hi_scal) -#define Ln2lo v_f64 (Ln2lo_scal) - -#define IndexMask v_u64 (IndexMask_scal) -#define Shift v_f64 (Shift_scal) -#define Thres v_f64 (Thres_scal) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -VPCS_ATTR -v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail) -{ - v_f64_t n, r, s, y, z; - v_u64_t cmp, u, e, i; - - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TAIL_TABLE_BITS); - i = u & IndexMask; - - /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - y = v_fma_f64 (C3, r, C2); - y = v_fma_f64 (y, r, C1); - y = v_fma_f64 (y, r, v_f64 (1.0)); - y = v_fma_f64 (y, r, xtail); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (s, y, n); - return v_fma_f64 (y, s, s); -} -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c index 675eb769bf07..989dd41d949a 100644 --- a/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c @@ -1,97 +1,98 @@ /* - * Lookup table for double-precision e^(x+tail) vector function. + * Lookup table for double-precision e^x vector function. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* 2^(j/N), j=0..N (where N = 256). 
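
Each entry stores the bits of 2^(j/N) with the index-dependent part of the exponent removed, so that adding back e = u << (52 - table bits) in the kernels reconstructs the full scale. A host-side spot check for the N = 128 table above, where the shift is 45 (a sketch, not part of the patch; for the N = 256 table below it would be 44):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <math.h> /* link with -lm. */

int main (void)
{
  for (int j = 1; j < 128; j *= 2)
    {
      double v = exp2 (j / 128.0);
      uint64_t u;
      memcpy (&u, &v, 8);
      u -= (uint64_t) j << 45; /* strip the j-dependent exponent part. */
      printf ("tab[%3d] = 0x%016llx\n", j, (unsigned long long) u);
    }
  return 0;
}

For j = 1 this prints 0x3feff63da9fb3335, matching the table.
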
*/ -const uint64_t __v_exp_tail_data[] - = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, - 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, - 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, - 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, - 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, - 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, - 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, - 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, - 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, - 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, - 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, - 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, - 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, - 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, - 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, - 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, - 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, - 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, - 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, - 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, - 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, - 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, - 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, - 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, - 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, - 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, - 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, - 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, - 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, - 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, - 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, - 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, - 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, - 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, - 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, - 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, - 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, - 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, - 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, - 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, - 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, - 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, - 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, - 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, - 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, - 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, - 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, - 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, - 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, - 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, - 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, - 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, - 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, - 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, - 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, - 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, - 0x3feeb737b0cdc5e5, 
0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, - 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, - 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, - 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, - 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, - 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, - 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, - 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, - 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, - 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, - 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, - 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, - 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, - 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, - 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, - 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, - 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, - 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, - 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, - 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, - 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, - 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, - 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, - 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, - 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, - 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, - 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, - 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, - 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, - 0x3feff9d96b2a23d9}; +/* 2^(j/N), j=0..N, N=2^8=256. Copied from math/v_exp_data.c. 
*/ +const uint64_t __v_exp_tail_data[] = { + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 
0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+    0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+    0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+    0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+    0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+    0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+    0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+    0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+    0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+    0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+    0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+    0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+    0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+    0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+    0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+    0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+    0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+    0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+    0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+    0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+    0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+    0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+    0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+    0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+    0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+    0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+    0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+    0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+    0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+    0x3feff9d96b2a23d9,
+};
diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail_inline.h b/contrib/arm-optimized-routines/pl/math/v_exp_tail_inline.h
new file mode 100644
index 000000000000..76ecc6b0a33a
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail_inline.h
@@ -0,0 +1,102 @@
+/*
+ * Double-precision vector e^(x+tail) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef PL_MATH_V_EXP_TAIL_INLINE_H
+#define PL_MATH_V_EXP_TAIL_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+#ifndef WANT_V_EXP_TAIL_SPECIALCASE
+#error \
+  "Cannot use v_exp_tail_inline.h without specifying whether you need the special case computation."
+#endif
+
+#define N (1 << V_EXP_TAIL_TABLE_BITS)
+
+static const struct data
+{
+  float64x2_t poly[4];
+#if WANT_V_EXP_TAIL_SPECIALCASE
+  float64x2_t big_bound, huge_bound;
+#endif
+  float64x2_t shift, invln2, ln2_hi, ln2_lo;
+} data = {
+#if WANT_V_EXP_TAIL_SPECIALCASE
+  .big_bound = V2 (704.0),
+  .huge_bound = V2 (1280.0 * N),
+#endif
+  .shift = V2 (0x1.8p52),
+  .invln2 = V2 (0x1.71547652b82fep8),  /* N/ln2. */
+  .ln2_hi = V2 (0x1.62e42fefa39efp-9), /* ln2/N. */
+  .ln2_lo = V2 (0x1.abc9e3b39803f3p-64),
+  .poly = { V2 (1.0), V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+	    V2 (0x1.5555576a59599p-5) },
+};
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+  return (uint64x2_t){ __v_exp_tail_data[i[0]], __v_exp_tail_data[i[1]] };
+}
+
+#if WANT_V_EXP_TAIL_SPECIALCASE
+#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* The following 2 biases, when combined, form the exponent bias:
+   SpecialBias1 - SpecialBias2 = asuint64(1.0).
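The identity is easy to sanity-check with plain integer arithmetic, since 0x7000000000000000 - 0x3010000000000000 == 0x3ff0000000000000, the bit pattern of 1.0. A minimal check (illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  /* SpecialBias1 - SpecialBias2 must equal asuint64 (1.0) so that the two
     reinterpreted scale factors multiply back to the intended power of 2.  */
  double one = 1.0;
  uint64_t one_bits;
  memcpy (&one_bits, &one, sizeof one_bits);
  assert (0x7000000000000000ULL - 0x3010000000000000ULL == one_bits);
  return 0;
}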
*/ +#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ +static float64x2_t VPCS_ATTR +v_exp_tail_special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t oflow = vcagtq_f64 (n, d->huge_bound); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + float64x2_t r1 = vmulq_f64 (s1, s1); + return vbslq_f64 (oflow, r1, r0); +} +#endif + +static inline float64x2_t VPCS_ATTR +v_exp_tail_inline (float64x2_t x, float64x2_t xtail) +{ + const struct data *d = ptr_barrier (&data); +#if WANT_V_EXP_TAIL_SPECIALCASE + uint64x2_t special = vcgtq_f64 (vabsq_f64 (x), d->big_bound); +#endif + /* n = round(x/(ln2/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->invln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*ln2/N. */ + float64x2_t r = x; + r = vfmsq_f64 (r, d->ln2_hi, n); + r = vfmsq_f64 (r, d->ln2_lo, n); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, v_u64 (N - 1)); + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4, using Horner. */ + float64x2_t y = v_horner_3_f64 (r, d->poly); + y = vfmaq_f64 (xtail, y, r); + + /* s = 2^(n/N). */ + u = lookup_sbits (i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + +#if WANT_V_EXP_TAIL_SPECIALCASE + if (unlikely (v_any_u64 (special))) + return v_exp_tail_special_case (s, y, n, d); +#endif + return vfmaq_f64 (s, y, s); +} +#endif // PL_MATH_V_EXP_TAIL_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_expf.c b/contrib/arm-optimized-routines/pl/math/v_expf.c deleted file mode 100644 index a422e69feb62..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_expf.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "mathlib.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expf_inline.h b/contrib/arm-optimized-routines/pl/math/v_expf_inline.h new file mode 100644 index 000000000000..166683726b4d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expf_inline.h @@ -0,0 +1,60 @@ +/* + * Helper for single-precision routines which calculate exp(x) and do not + * need special-case handling + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_EXPF_INLINE_H +#define PL_MATH_V_EXPF_INLINE_H + +#include "v_math.h" + +struct v_expf_data +{ + float32x4_t poly[5]; + float32x4_t shift, invln2_and_ln2; +}; + +/* maxerr: 1.45358 +0.5 ulp. */ +#define V_EXPF_DATA \ + { \ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ + .shift = V4 (0x1.8p23f), \ + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + } + +#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ +#define C(i) d->poly[i] + +static inline float32x4_t +v_expf_inline (float32x4_t x, const struct v_expf_data *d) +{ + /* Helper routine for calculating exp(x). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t n, r, z; + z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0); + n = vsubq_f32 (z, d->shift); + r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1); + r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + + /* Custom order-4 Estrin avoids building high order monomial. 
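A scalar model of this evaluation order (illustrative only): the same degree-5 polynomial C4*r + C3*r^2 + ... + C0*r^5 is built from two independent order-1 pieces plus two r^2 steps, giving three dependent FMAs where a Horner chain would need five.

#include <math.h>

/* Scalar sketch of the Estrin-style scheme used in v_expf_inline; c[]
   holds the five coefficients in the same order as d->poly.  */
static float
expf_poly_sketch (float r, const float c[5])
{
  float r2 = r * r;
  float p = fmaf (c[0], r, c[1]); /* C0*r + C1.  */
  float q = fmaf (c[2], r, c[3]); /* C2*r + C3.  */
  q = fmaf (p, r2, q);		  /* C0*r^3 + C1*r^2 + C2*r + C3.  */
  p = c[4] * r;			  /* C4*r (no constant term: poly ~ expm1).  */
  return fmaf (q, r2, p);	  /* Full degree-5 result.  */
}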
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p, q, poly; + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + return vfmaq_f32 (scale, poly, scale); +} + +#endif // PL_MATH_V_EXPF_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c index 4b491d17feef..dd255472cec0 100644 --- a/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c @@ -1,113 +1,118 @@ /* * Double-precision vector exp(x) - 1 function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - -#define InvLn2 v_f64 (0x1.71547652b82fep0) -#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) -#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) -#define Shift v_f64 (0x1.8p52) -#define TinyBound \ - 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ -#define SpecialBound \ - 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \ - final stage of the algorithm overflows so fall back to \ - scalar. */ -#define AbsMask 0x7fffffffffffffff -#define One 0x3ff0000000000000 - -#define C(i) v_f64 (__expm1_poly[i]) - -static inline v_f64_t -eval_poly (v_f64_t f, v_f64_t f2) +static const struct data { - /* Evaluate custom polynomial using Estrin scheme. */ - v_f64_t p_01 = v_fma_f64 (f, C (1), C (0)); - v_f64_t p_23 = v_fma_f64 (f, C (3), C (2)); - v_f64_t p_45 = v_fma_f64 (f, C (5), C (4)); - v_f64_t p_67 = v_fma_f64 (f, C (7), C (6)); - v_f64_t p_89 = v_fma_f64 (f, C (9), C (8)); - - v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01); - v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45); - v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89); + float64x2_t poly[11]; + float64x2_t invln2, ln2, shift; + int64x2_t exponent_bias; +#if WANT_SIMD_EXCEPT + uint64x2_t thresh, tiny_bound; +#else + float64x2_t oflow_bound; +#endif +} data = { + /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */ + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) }, + .invln2 = V2 (0x1.71547652b82fep0), + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, + .shift = V2 (0x1.8p52), + .exponent_bias = V2 (0x3ff0000000000000), +#if WANT_SIMD_EXCEPT + /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs + compare. */ + .thresh = V2 (0x78c56fa6d34b552), + /* asuint64(0x1p-51) << 1. */ + .tiny_bound = V2 (0x3cc0000000000000 << 1), +#else + /* Value above which expm1(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V2 (0x1.62b7d369a5aa9p+9), +#endif +}; - v_f64_t f4 = f2 * f2; - v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03); - return v_fma_f64 (f4 * f4, p_8a, p_07); +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (expm1, x, y, special); } /* Double-precision vector exp(x) - 1 function. 
   The maximum observed error is 2.18 ULP:
-   __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
-                                  want 0x1.a8b9ea8d66e2p-2. */
-VPCS_ATTR
-v_f64_t V_NAME (expm1) (v_f64_t x)
+   _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+                                         want 0x1.a8b9ea8d66e2p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
 {
-  v_u64_t ix = v_as_u64_f64 (x);
-  v_u64_t ax = ix & AbsMask;
+  const struct data *d = ptr_barrier (&data);
+
+  uint64x2_t ix = vreinterpretq_u64_f64 (x);
 
 #if WANT_SIMD_EXCEPT
-  /* If fp exceptions are to be triggered correctly, fall back to the scalar
-     variant for all lanes if any of them should trigger an exception.  */
-  v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound));
+  /* If fp exceptions are to be triggered correctly, fall back to scalar for
+     |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+     shift-left by 1, and compare with thresh which was left-shifted offline -
+     this is effectively an absolute compare.  */
+  uint64x2_t special
+      = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
   if (unlikely (v_any_u64 (special)))
-    return v_call_f64 (expm1, x, x, v_u64 (-1));
+    x = v_zerofy_f64 (x, special);
 #else
   /* Large input, NaNs and Infs.  */
-  v_u64_t special
-      = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000));
+  uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
 #endif
 
   /* Reduce argument to smaller range:
      Let i = round(x / ln2)
      and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
      exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
      where 2^i is exact because i is an integer.  */
-  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
-  v_s64_t i = v_to_s64_f64 (j);
-  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
-  f = v_fma_f64 (j, MLn2lo, f);
+  float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
+  int64x2_t i = vcvtq_s64_f64 (n);
+  float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
+  f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
 
   /* Approximate expm1(f) using polynomial.
      Taylor expansion for expm1(x) has the form:
         x + ax^2 + bx^3 + cx^4 ....
      So we calculate the polynomial P(f) = a + bf + cf^2 + ...
      and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
-  v_f64_t f2 = f * f;
-  v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f);
+  float64x2_t f2 = vmulq_f64 (f, f);
+  float64x2_t f4 = vmulq_f64 (f2, f2);
+  float64x2_t f8 = vmulq_f64 (f4, f4);
+  float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
 
   /* Assemble the result.
      expm1(x) ~= 2^i * (p + 1) - 1
      Let t = 2^i.  */
-  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
-  /* expm1(x) ~= p * t + (t - 1).  */
-  v_f64_t y = v_fma_f64 (p, t, t - 1);
+  int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+  float64x2_t t = vreinterpretq_f64_s64 (u);
 
-#if !WANT_SIMD_EXCEPT
   if (unlikely (v_any_u64 (special)))
-    return v_call_f64 (expm1, x, y, special);
-#endif
+    return special_case (vreinterpretq_f64_u64 (ix),
+                         vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
+                         special);
 
-  return y;
+  /* expm1(x) ~= p * t + (t - 1).
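The reconstruction deserves a note: writing 2^i * (p + 1) - 1 as p*t + (t - 1) keeps the final step in one FMA and avoids rounding p + 1, which matters when p is tiny. A scalar model (illustrative; assumes i is already in the range the filtered reduction produces):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Given p ~= expm1(f) and i = round(x/ln2), assemble
   expm1(x) ~= p * 2^i + (2^i - 1), mirroring the vector code's
   (i << 52) + exponent_bias construction of t.  */
static double
expm1_assemble_sketch (double p, int64_t i)
{
  uint64_t u = ((uint64_t) (i + 1023)) << 52; /* asuint64 (2^i).  */
  double t;
  memcpy (&t, &u, sizeof t);
  return fma (p, t, t - 1.0);
}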
*/ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); } -VPCS_ALIAS PL_SIG (V, D, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (V_NAME (expm1), 1.68) -PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000) -PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000) -PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000) -PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) -PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100) -PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100) -#endif +PL_TEST_ULP (V_NAME_D1 (expm1), 1.68) +PL_TEST_EXPECT_FENV (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c index ab132427e58d..6b282d0cc00f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c @@ -1,94 +1,117 @@ /* * Single-precision vector exp(x) - 1 function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float32x4_t poly[5]; + float32x4_t invln2_and_ln2; + float32x4_t shift; + int32x4_t exponent_bias; +#if WANT_SIMD_EXCEPT + uint32x4_t thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */ + .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), + V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, + /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */ + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, + .shift = V4 (0x1.8p23f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + /* Value above which expm1f(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V4 (0x1.5ebc4p+6), +#else + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute + compare. */ + .thresh = V4 (0x1d5ebc40), +#endif +}; -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define MLn2hi v_f32 (-0x1.62e4p-1f) -#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) -#define AbsMask (0x7fffffff) -#define One (0x3f800000) -#define SpecialBound \ - (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x) \ - should round to -1. */ -#define TinyBound (0x34000000) /* asuint(0x1p-23). */ +/* asuint(0x1p-23), shifted by 1 for abs compare. */ +#define TinyBound v_u32 (0x34000000 << 1) -#define C(i) v_f32 (__expm1f_poly[i]) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (expm1f, x, y, special); +} /* Single-precision vector exp(x) - 1 function. The maximum error is 1.51 ULP: - expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 - want 0x1.e2fb94p-2. */ -VPCS_ATTR -v_f32_t V_NAME (expm1f) (v_f32_t x) + _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. 
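The got/want pairs quoted in these headers measure error in ULP of the want value; for two nearby finite floats of the same sign, that distance is simply the difference of their bit patterns. A rough sketch (a simplification; the project's ulp tool also handles signs, zeros and rounding modes):

#include <stdint.h>
#include <string.h>

/* Approximate ULP distance between two finite, same-sign floats.  */
static int32_t
ulp_distance_sketch (float got, float want)
{
  int32_t g, w;
  memcpy (&g, &got, sizeof g);
  memcpy (&w, &want, sizeof w);
  return g - w;
}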
*/ +float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t ax = ix & AbsMask; + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); #if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, fall back to the scalar - variant for all lanes if any of them should trigger an exception. */ - v_u32_t special - = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound)); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); if (unlikely (v_any_u32 (special))) - return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff)); + x = v_zerofy_f32 (x, special); #else - /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0. */ - v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000)); + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); #endif /* Reduce argument to smaller range: Let i = round(x / ln2) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; - v_s32_t i = v_to_s32_f32 (j); - v_f32_t f = v_fma_f32 (j, MLn2hi, x); - f = v_fma_f32 (j, MLn2lo, f); + float32x4_t j = vsubq_f32 ( + vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); + int32x4_t i = vcvtq_s32_f32 (j); + float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); + f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); /* Approximate expm1(f) using polynomial. Taylor expansion for expm1(x) has the form: x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - - v_f32_t p = v_fma_f32 (C (4), f, C (3)); - p = v_fma_f32 (p, f, C (2)); - p = v_fma_f32 (p, f, C (1)); - p = v_fma_f32 (p, f, C (0)); - p = v_fma_f32 (f * f, p, f); + float32x4_t p = v_horner_4_f32 (f, d->poly); + p = vfmaq_f32 (f, vmulq_f32 (f, f), p); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. */ - v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); - /* expm1(x) ~= p * t + (t - 1). */ - v_f32_t y = v_fma_f32 (p, t, t - 1); + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); -#if !WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (special))) - return v_call_f32 (expm1f, x, y, special); -#endif + return special_case (vreinterpretq_f32_u32 (ix), + vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t), + special); - return y; + /* expm1(x) ~= p * t + (t - 1). 
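The j computation above relies on the usual shift trick: adding 0x1.8p23f pushes the fraction bits of x/ln2 out of the significand, so j is x/ln2 rounded to an integer while still held as a float. Scalar model (illustrative; assumes round-to-nearest and |x/ln2| well below 2^22):

#include <math.h>

static float
round_via_shift (float x, float invln2)
{
  const float shift = 0x1.8p23f;     /* 1.5 * 2^23.  */
  float j = fmaf (x, invln2, shift); /* Integer now in the low bits.  */
  return j - shift;		     /* Rounded value, back as a float.  */
}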
 */
+  return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
 }
-VPCS_ALIAS
 
 PL_SIG (V, F, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (V_NAME (expm1f), 1.02)
-PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000)
-PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (expm1), 1.02)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h
index c261941ebed6..6ae94c452de2 100644
--- a/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h
+++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h
@@ -1,49 +1,63 @@
 /*
  * Helper for single-precision routines which calculate exp(x) - 1 and do not
  * need special-case handling
  *
  * Copyright (c) 2022-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef PL_MATH_V_EXPM1F_INLINE_H
 #define PL_MATH_V_EXPM1F_INLINE_H
 
 #include "v_math.h"
 #include "math_config.h"
-#include "estrinf.h"
+#include "poly_advsimd_f32.h"
 
-#define One 0x3f800000
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define MLn2hi v_f32 (-0x1.62e4p-1f)
-#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
-
-#define C(i) v_f32 (__expm1f_poly[i])
-
-static inline v_f32_t
-expm1f_inline (v_f32_t x)
+struct v_expm1f_data
+{
+  float32x4_t poly[5];
+  float32x4_t invln2_and_ln2, shift;
+  int32x4_t exponent_bias;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+   log(2)/2]. Exponent bias is asuint(1.0f).
+   invln2_and_ln2 stores constants: invln2, ln2_hi, ln2_lo, 0.  */
+#define V_EXPM1F_DATA                                                         \
+  {                                                                           \
+    .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),     \
+	      V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },                      \
+    .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000),                \
+    .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },   \
+  }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
 {
   /* Helper routine for calculating exp(x) - 1.
      Copied from v_expm1f_1u6.c, with all special-case handling removed - the
      calling routine should handle special values if required.  */
 
   /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
-  v_s32_t i = v_to_s32_f32 (j);
-  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
-  f = v_fma_f32 (j, MLn2lo, f);
+  float32x4_t j = vsubq_f32 (
+      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+  int32x4_t i = vcvtq_s32_f32 (j);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
 
   /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
-     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
-  v_f32_t f2 = f * f;
-  v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
-  p = v_fma_f32 (f2, p, f);
+     Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
+     Horner.
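A hypothetical caller shape (a sketch only, with all special-case handling omitted): a routine such as a vector sinh could instantiate the data once and feed the helper twice, using sinh(x) = (expm1(x) - expm1(-x)) / 2.

static const struct v_expm1f_data expm1f_data = V_EXPM1F_DATA;

/* Illustrative core of a hypothetical caller; not a complete routine.  */
static float32x4_t
sinhf_core_sketch (float32x4_t x)
{
  float32x4_t p = expm1f_inline (x, &expm1f_data);
  float32x4_t m = expm1f_inline (vnegq_f32 (x), &expm1f_data);
  return vmulq_f32 (vsubq_f32 (p, m), v_f32 (0.5f));
}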
*/ + float32x4_t f2 = vmulq_f32 (f, f); + float32x4_t f4 = vmulq_f32 (f2, f2); + float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly); + p = vfmaq_f32 (f, f2, p); /* t = 2^i. */ - v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); /* expm1(x) ~= p * t + (t - 1). */ - return v_fma_f32 (p, t, t - 1); + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); } #endif // PL_MATH_V_EXPM1F_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_hypot_1u5.c b/contrib/arm-optimized-routines/pl/math/v_hypot_1u5.c new file mode 100644 index 000000000000..d4ff7be89a8f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_hypot_1u5.c @@ -0,0 +1,95 @@ +/* + * Double-precision vector hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint64x2_t tiny_bound, thres; +} data = { + .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */ + .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */ +}; +#else +static const struct data +{ + uint64x2_t tiny_bound; + uint32x4_t thres; +} data = { + .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, + uint32x2_t special) +{ + return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special)); +} + +/* Vector implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222) + got 0x1.6a1b19400964ep-204 + want 0x1.6a1b19400964dp-204. */ +#if WANT_SIMD_EXCEPT + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t ix = vreinterpretq_u64_f64 (ax); + uint64x2_t iy = vreinterpretq_u64_f64 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
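The two compares below fold "too small", "too large", Inf and NaN into a single unsigned test each: subtracting tiny_bound makes tiny inputs wrap around to huge unsigned values, so one >= catches both tails. Scalar model of one lane (illustrative; constants copied from the data struct above):

#include <stdbool.h>
#include <stdint.h>

/* True iff |x| < 0x1p-511 or |x| >= 0x1p511 (including Inf and NaN),
   given ix = asuint64 (fabs (x)).  */
static bool
hypot_lane_is_special (uint64_t ix)
{
  return ix - 0x2000000000000000ULL >= 0x3fe0000000000000ULL;
}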
*/ + uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres); + uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f64 (ax, specialx); + ay = v_zerofy_f64 (ay, specialy); + uint32x2_t special = vaddhn_u64 (specialx, specialy); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#else + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y); + + uint32x2_t special = vcge_u32 ( + vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), + vget_low_u32 (d->thres)); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#endif + +PL_SIG (V, D, 2, hypot, -10.0, 10.0) +PL_TEST_ULP (V_NAME_D2 (hypot), 1.21) +PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_hypotf_1u5.c b/contrib/arm-optimized-routines/pl/math/v_hypotf_1u5.c new file mode 100644 index 000000000000..3227b0a3fd8b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_hypotf_1u5.c @@ -0,0 +1,94 @@ +/* + * Single-precision vector hypot(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint32x4_t tiny_bound, thres; +} data = { + .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ + .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ +}; +#else +static const struct data +{ + uint32x4_t tiny_bound; + uint16x8_t thres; +} data = { + .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ + .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, + uint16x4_t special) +{ + return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special)); +} + +/* Vector implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 + want 0x1.6a41dp-13. */ +#if WANT_SIMD_EXCEPT + +float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t ix = vreinterpretq_u32_f32 (ax); + uint32x4_t iy = vreinterpretq_u32_f32 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
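v_zerofy_* clears the flagged lanes so the main path's squaring cannot raise spurious flags on them; a plausible shape for it (an assumption on my part, the real definition lives in v_math.h) is a single BIC:

/* Assumed shape of v_zerofy_f32: zero the lanes selected by the all-ones
   mask, leave the others untouched.  */
static inline float32x4_t
zerofy_f32_sketch (float32x4_t x, uint32x4_t special)
{
  return vreinterpretq_f32_u32 (
      vbicq_u32 (vreinterpretq_u32_f32 (x), special));
}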
*/ + uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres); + uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f32 (ax, specialx); + ay = v_zerofy_f32 (ay, specialy); + uint16x4_t special = vaddhn_u32 (specialx, specialy); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#else + +float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); + + uint16x4_t special = vcge_u16 ( + vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), + vget_low_u16 (d->thres)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#endif + +PL_SIG (V, F, 2, hypot, -10.0, 10.0) +PL_TEST_ULP (V_NAME_F2 (hypot), 1.21) +PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c index 86d398ca13a9..35dd62fe5e3e 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c @@ -1,110 +1,120 @@ /* * Double-precision vector log10(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "include/mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#define A(i) v_f64 (__v_log10_data.poly[i]) -#define T(s, i) __v_log10_data.tab[i].s -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) #define N (1 << V_LOG10_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t invln10, log10_2, ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. */ + .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), + V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), + V2 (-0x1.287461742fee4p-4) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .invln10 = V2 (0x1.bcb7b1526e50ep-2), + .log10_2 = V2 (0x1.34413509f79ffp-2), + .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. 
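min_norm/special_bound apply the same wrap-around trick, but only to the top 32 bits of each lane: vsubhn_u64 keeps the high half of ix - min_norm, which is enough to flag zero, subnormal, negative, Inf and NaN inputs at half the compare width. Scalar model of one lane (illustrative; constants copied from the data struct above):

#include <stdbool.h>
#include <stdint.h>

/* True iff x is zero, subnormal, negative, Inf or NaN, given
   ix = asuint64 (x).  */
static bool
log10_lane_is_special (uint64_t ix)
{
  uint32_t top = (uint32_t) ((ix - 0x0010000000000000ULL) >> 32);
  return top >= 0x7fe00000;
}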
*/ + .sign_exp_mask = V2 (0xfff0000000000000), +}; + +#define Off v_u64 (0x3fe6900900000000) +#define IndexMask (N - 1) + +#define T(s, i) __v_log10_data.s[i] struct entry { - v_f64_t invc; - v_f64_t log10c; + float64x2_t invc; + float64x2_t log10c; }; static inline struct entry -lookup (v_u64_t i) +lookup (uint64x2_t i) { struct entry e; -#ifdef SCALAR - e.invc = T (invc, i); - e.log10c = T (log10c, i); -#else - e.invc[0] = T (invc, i[0]); - e.log10c[0] = T (log10c, i[0]); - e.invc[1] = T (invc, i[1]); - e.log10c[1] = T (log10c, i[1]); -#endif + uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log10c = vuzp2q_f64 (e0, e1); return e; } -VPCS_ATTR -inline static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + uint32x2_t special) { - return v_call_f64 (log10, x, y, cmp); + return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); } -/* Our implementation of v_log10 is a slight modification of v_log (1.660ulps). +/* Fast implementation of double-precision vector log10 + is a slight modification of double-precision vector log. Max ULP error: < 2.5 ulp (nearest rounding.) Maximum measured at 2.46 ulp for x in [0.96, 0.97] - __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 - want 0x1.fff6be3cae4b9p-6 - -0.459999 ulp err 1.96. */ -VPCS_ATTR -v_f64_t V_NAME (log10) (v_f64_t x) + _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) { - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); + uint64x2_t tmp = vsubq_u64 (ix, Off); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (tmp); /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); /* hi = r / log(10) + log10(c) + k*log10(2). - Constants in `v_log10_data.c` are computed (in extended precision) as + Constants in v_log10_data.c are computed (in extended precision) as e.log10c := e.logc * ivln10. */ - v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c); + float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); /* y = log10(1+r) + n * log10(2). 
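Spelling the decomposition out: with x = 2^k * z, z in [OFF, 2*OFF) and r = z*invc - 1 from the table, log10(x) = log1p(r)/ln(10) + log10(c) + k*log10(2). A scalar model of the assembly below (illustrative; constants copied from the data struct):

#include <math.h>

/* Assemble log10 from the reduced pieces, mirroring w, hi and the final
   fma in the vector code.  */
static double
log10_assemble_sketch (double r, double log10c, double kd, double r2,
		       double poly)
{
  const double invln10 = 0x1.bcb7b1526e50ep-2; /* 1/ln(10).  */
  const double log10_2 = 0x1.34413509f79ffp-2; /* log10(2).  */
  double w = fma (r, invln10, log10c);	/* r/ln(10) + log10(c).  */
  double hi = fma (kd, log10_2, w);	/* ... + k*log10(2).  */
  return fma (r2, poly, hi);		/* ... + r^2 * poly(r).  */
}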
*/ - hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w); + float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - r2 = r * r; - y = v_fma_f64 (A (3), r, A (2)); - p = v_fma_f64 (A (1), r, A (0)); - y = v_fma_f64 (A (4), r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, hi, r2, special); + return vfmaq_f64 (hi, r2, y); } -VPCS_ALIAS PL_SIG (V, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log10), 1.97) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10)) -PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000) -PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000) -PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000) -#endif +PL_TEST_ULP (V_NAME_D1 (log10), 1.97) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log10)) +PL_TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_data.c b/contrib/arm-optimized-routines/pl/math/v_log10_data.c index fda85c886963..d9a624dab9ce 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log10_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_log10_data.c @@ -1,167 +1,163 @@ /* * Lookup table for double-precision log10(x) vector function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#define N (1 << V_LOG10_TABLE_BITS) - -/* Algorithm: +const struct v_log10_data __v_log10_data = { + /* Computed from log's coefficients div by log(10) then rounded to double + precision. */ + .poly = { -0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, + 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4 }, + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + /* Algorithm: x = 2^k z log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10) -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: - - tab[i].invc = 1/c - tab[i].log10c = (double)log10(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. -NB: invc should be optimized to minimize error in (double)log10(c) instead. 
*/ -const struct v_log10_data __v_log10_data - = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, - {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, - {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, - {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, - {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, - {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, - {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, - {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, - {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, - {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, - {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, - {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, - {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, - {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, - {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, - {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, - {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, - {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, - {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, - {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, - {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, - {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, - {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, - {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, - {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, - {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, - {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, - {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, - {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, - {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, - {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, - {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, - {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, - {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, - {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, - {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, - {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, - {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, - {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, - {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4}, - {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, - {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, - {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, - {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, - {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, - {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, - {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, - {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, - {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, - {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, - {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, - {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, - {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, - {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, - {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, - {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, - {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, - {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, - {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, - {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, - {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, - {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, - {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, - {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, - {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, - {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, - {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, - {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, - {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, - {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, - {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7}, - {0x1.040ff6b5f5e9fp+0, 
-0x1.c01abc8cdc4e2p-8}, - {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, - {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, - {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, - {1.0, 0.0}, - {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, - {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, - {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, - {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, - {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, - {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, - {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, - {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, - {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, - {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, - {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, - {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, - {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, - {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, - {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, - {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, - {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, - {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, - {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, - {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, - {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, - {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, - {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, - {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, - {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, - {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, - {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, - {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, - {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, - {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, - {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, - {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, - {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, - {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, - {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, - {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, - {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, - {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, - {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, - {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, - {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, - {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, - {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, - {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, - {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, - {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, - {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3}, - {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, - {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, - {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, - {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, - {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}}, - - /* Computed from log coeffs div by log(10) then rounded to double - precision. */ - .poly - = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, - 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4}, + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from lookup + tables: - .invln10 = 0x1.bcb7b1526e50ep-2, - .log10_2 = 0x1.34413509f79ffp-2 + table[i].invc = 1/c + table[i].log10c = (double)log10(c) + where c is near the center of the subinterval and is chosen by trying + several floating point invc candidates around 1/center and selecting one + for which the error in (double)log(c) is minimized (< 0x1p-74), except the + subinterval that contains 1 and the previous one got tweaked to avoid + cancellation. 
NB: invc should be optimized to minimize error in + (double)log10(c) instead. */ + .table = { { 0x1.6a133d0dec120p+0, -0x1.345825f221684p-3 }, + { 0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3 }, + { 0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3 }, + { 0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3 }, + { 0x1.623f1d916f323p+0, -0x1.20e7081762193p-3 }, + { 0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3 }, + { 0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3 }, + { 0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3 }, + { 0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3 }, + { 0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3 }, + { 0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3 }, + { 0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4 }, + { 0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4 }, + { 0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4 }, + { 0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4 }, + { 0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4 }, + { 0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4 }, + { 0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4 }, + { 0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4 }, + { 0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4 }, + { 0x1.446f12b278001p+0, -0x1.a56c091954f87p-4 }, + { 0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4 }, + { 0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4 }, + { 0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4 }, + { 0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4 }, + { 0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4 }, + { 0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4 }, + { 0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4 }, + { 0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4 }, + { 0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4 }, + { 0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4 }, + { 0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4 }, + { 0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4 }, + { 0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4 }, + { 0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4 }, + { 0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4 }, + { 0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4 }, + { 0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4 }, + { 0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4 }, + { 0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5 }, + { 0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5 }, + { 0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5 }, + { 0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5 }, + { 0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5 }, + { 0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5 }, + { 0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5 }, + { 0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5 }, + { 0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5 }, + { 0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5 }, + { 0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5 }, + { 0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5 }, + { 0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5 }, + { 0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5 }, + { 0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5 }, + { 0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6 }, + { 0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6 }, + { 0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6 }, + { 0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6 }, + { 0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6 }, + { 0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6 }, + { 0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6 }, + { 0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7 }, + { 0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7 }, + 
{ 0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7 }, + { 0x1.062491aee9904p+0, -0x1.517249c15a75cp-7 }, + { 0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8 }, + { 0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8 }, + { 0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9 }, + { 0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9 }, + { 0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8 }, + { 0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7 }, + { 0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7 }, + { 0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6 }, + { 0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6 }, + { 0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6 }, + { 0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6 }, + { 0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6 }, + { 0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5 }, + { 0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5 }, + { 0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5 }, + { 0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5 }, + { 0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5 }, + { 0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5 }, + { 0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5 }, + { 0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5 }, + { 0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4 }, + { 0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4 }, + { 0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4 }, + { 0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4 }, + { 0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4 }, + { 0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4 }, + { 0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4 }, + { 0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4 }, + { 0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4 }, + { 0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4 }, + { 0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4 }, + { 0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4 }, + { 0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4 }, + { 0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4 }, + { 0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4 }, + { 0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4 }, + { 0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4 }, + { 0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4 }, + { 0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4 }, + { 0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4 }, + { 0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4 }, + { 0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3 }, + { 0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3 }, + { 0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3 }, + { 0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3 }, + { 0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3 }, + { 0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3 }, + { 0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3 }, + { 0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3 }, + { 0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3 }, + { 0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3 } } }; diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c index e9f7f0346ca2..92bc50ba5bd9 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c @@ -1,82 +1,82 @@ /* * Single-precision vector log10 function. * * Copyright (c) 2020-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "mathlib.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - -#define P(i) v_f32 (__v_log10f_poly[i]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ -#define InvLn10 v_f32 (0x1.bcb7b2p-2f) -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667. */ +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + float32x4_t poly[8]; + float32x4_t inv_ln10, ln2; + uint32x4_t off, mantissa_mask; +} data = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), + V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), + V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, + uint16x4_t cmp) { /* Fall back to scalar code. */ - return v_call_f32 (log10f, x, y, cmp); + return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); } -/* Our fast implementation of v_log10f uses a similar approach as v_logf. - With the same offset as v_logf (i.e., 2/3) it delivers about 3.3ulps with - order 9. This is more efficient than using a low order polynomial computed in - double precision. +/* Fast implementation of AdvSIMD log10f, + uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and + an order 9 polynomial. Maximum error: 3.305ulps (nearest rounding.) - __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 - want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492. */ -VPCS_ATTR -v_f32_t V_NAME (log10f) (v_f32_t x) + _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x) { - v_f32_t n, o, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); + u = vsubq_u32 (u, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - /* y = log10(1+r) + n*log10(2). */ - r2 = r * r; - /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 + - r2*(P6+r*P7))). */ - o = v_fma_f32 (P (7), r, P (6)); - p = v_fma_f32 (P (5), r, P (4)); - q = v_fma_f32 (P (3), r, P (2)); - y = v_fma_f32 (P (1), r, P (0)); - p = v_fma_f32 (o, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster - but less accurate. 
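The reduction used by this routine (shared with AdvSIMD logf) is easiest to follow in scalar form. The sketch below is illustrative only and not part of the patch: the helper name log10f_reduce_sketch is made up, finite positive normal x is assumed, and the polynomial tail is replaced by its first series term, so it is far less accurate than the real routine.

    #include <stdint.h>
    #include <string.h>

    /* Write x = 2^n * (1 + r) with 1 + r in [2/3, 4/3] by subtracting the
       bit pattern of 2/3 (0x3f2aaaab) before splitting exponent/mantissa,
       then log10(x) = (n*ln2 + r)/ln10 + r^2 * P(r).  */
    static float
    log10f_reduce_sketch (float x)
    {
      uint32_t u;
      memcpy (&u, &x, sizeof u);
      u -= 0x3f2aaaabu;                      /* Off = asuint32 (2/3) */
      int32_t n = (int32_t) u >> 23;         /* arithmetic shift */
      u = (u & 0x007fffffu) + 0x3f2aaaabu;   /* reassemble 1 + r */
      float t;
      memcpy (&t, &u, sizeof t);
      float r = t - 1.0f;                    /* r in [-1/3, 1/3] */
      float ln2 = 0x1.62e43p-1f, inv_ln10 = 0x1.bcb7b2p-2f;
      /* Stand-in for P(r): first series term of log10(1+r) - r/ln10.  */
      return ((float) n * ln2 + r) * inv_ln10 - 0.5f * r * r * inv_ln10;
    }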
*/ - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p * InvLn10); + /* y = log10(1+r) + n * log10(2). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); + /* y = Log10(2) * n + poly * InvLn(10). */ + float32x4_t y = vfmaq_f32 (r, d->ln2, n); + y = vmulq_f32 (y, d->inv_ln10); - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; + if (unlikely (v_any_u16h (special))) + return special_case (x, y, poly, r2, special); + return vfmaq_f32 (y, poly, r2); } -VPCS_ALIAS PL_SIG (V, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log10f), 2.81) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f)) -PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000) -#endif +PL_TEST_ULP (V_NAME_F1 (log10), 2.81) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log10)) +PL_TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_data.c b/contrib/arm-optimized-routines/pl/math/v_log10f_data.c deleted file mode 100644 index 537482a92017..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_log10f_data.c +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Coefficients for single-precision vector log10 function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "math_config.h" - -const float __v_log10f_poly[] = { - /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in - [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, - -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c index e48291081ab3..face02ddc6c3 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c @@ -1,120 +1,128 @@ /* * Double-precision vector log(1+x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - -#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) -#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) -#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ -#define OneMHfRt2Top \ - 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ - << 32. */ -#define OneTop12 0x3ff -#define BottomMask 0xffffffff -#define AbsMask 0x7fffffffffffffff -#define C(i) v_f64 (__log1p_data.coeffs[i]) - -static inline v_f64_t -eval_poly (v_f64_t f) +const static struct data { - v_f64_t f2 = f * f; - v_f64_t f4 = f2 * f2; - v_f64_t f8 = f4 * f4; - return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); -} - -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t special) + float64x2_t poly[19], ln2[2]; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one; + int64x2_t one_top; +} data = { + /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ + .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), + V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), + V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), + V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), + V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), + V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), + V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), + V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), + V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), + V2 (-0x1.cfa7385bdb37ep-6) }, + .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, + /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hf_rt2_top = V2 (0x3fe6a09e00000000), + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), + .umask = V2 (0x000fffff00000000), + .one_top = V2 (0x3ff), + .inf = V2 (0x7ff0000000000000), + .minus_one = V2 (0xbff0000000000000) +}; + +#define BottomMask v_u64 (0xffffffff) + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (log1p, x, y, special); } -/* Vector log1p approximation using polynomial on reduced interval. Routine is a - modification of the algorithm used in scalar log1p, with no shortcut for k=0 - and no narrowing for f and k. Maximum observed error is 2.46 ULP: - __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 - want 0x1.fd5565fb590f6p+2 . */ -VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) +/* Vector log1p approximation using polynomial on reduced interval. Routine is + a modification of the algorithm used in scalar log1p, with no shortcut for + k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP: + _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2 + want 0x1.fd61d0727429fp+2 . */ +VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t ia = ix & AbsMask; - v_u64_t special - = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) - | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + uint64x2_t special = vcgeq_u64 (ia, d->inf); #if WANT_SIMD_EXCEPT + special = vorrq_u64 (special, + vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1)))); if (unlikely (v_any_u64 (special))) - x = v_sel_f64 (special, v_f64 (0), x); + x = v_zerofy_f64 (x, special); +#else + special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1))); #endif /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f is in [sqrt(2)/2, sqrt(2)]): log1p(x) = k*log(2) + log1p(f). f may not be representable exactly, so we need a correction term: let m = round(1 + x), c = (1 + x) - m. c << m: at very small x, log1p(x) ~ x, hence: log(1+x) - log(m) ~ c/m. We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ /* Obtain correctly scaled k by manipulation in the exponent. The scalar algorithm casts down to 32-bit at this point to calculate k and u_red. We stay in double-width to obtain f and k, using the same constants as the scalar algorithm but shifted left by 32. 
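The exponent manipulation described here is compact but subtle, so a scalar model may help. The sketch below is illustrative only: the helper name reduce_log1p_sketch is made up and m is assumed finite, normal and positive. Adding one_m_hf_rt2_top makes the add carry into the exponent field exactly when the significand of m is at or above sqrt(2), so f + 1 always lands in [sqrt(2)/2, sqrt(2)).

    #include <stdint.h>
    #include <string.h>

    /* Given m = 1 + x, find k and f with m = 2^k * (f + 1).  */
    static void
    reduce_log1p_sketch (double m, int64_t *k, double *f)
    {
      uint64_t mi;
      memcpy (&mi, &m, sizeof mi);
      uint64_t u = mi + 0x00095f6200000000ULL;  /* one_m_hf_rt2_top */
      *k = (int64_t) (u >> 52) - 0x3ff;
      /* Keep m's low mantissa bits, splice in an exponent that puts the
         result in [sqrt(2)/2, sqrt(2)).  */
      uint64_t utop = (u & 0x000fffff00000000ULL) + 0x3fe6a09e00000000ULL;
      uint64_t u_red = utop | (mi & 0xffffffffULL);
      double t;
      memcpy (&t, &u_red, sizeof t);
      *f = t - 1.0;
    }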
*/ - v_f64_t m = x + 1; - v_u64_t mi = v_as_u64_f64 (m); - v_u64_t u = mi + OneMHfRt2Top; + float64x2_t m = vaddq_f64 (x, v_f64 (1)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; - v_f64_t k = v_to_f64_s64 (ki); + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ - v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; - v_u64_t u_red = utop | (mi & BottomMask); - v_f64_t f = v_as_f64_u64 (u_red) - 1; + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); /* Correction term c/m. */ - v_f64_t cm = (x - (m - 1)) / m; + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); /* Approximate log1p(x) on the reduced input using a polynomial. Because - log1p(0)=0 we choose an approximation of the form: - x + C0*x^2 + C1*x^3 + C2x^4 + ... - Hence approximation has the form f + f^2 * P(f) + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) where P(x) = C0 + C1*x + C2x^2 + ... - Assembling this all correctly is dealt with at the final step. */ - v_f64_t p = eval_poly (f); + Assembling this all correctly is dealt with at the final step. */ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); - v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); - v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); - v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); + float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); + float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); + float64x2_t y = vaddq_f64 (ylo, yhi); if (unlikely (v_any_u64 (special))) - return specialcase (v_as_f64_u64 (ix), y, special); + return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p), + special); - return y; + return vfmaq_f64 (y, f2, p); } -VPCS_ALIAS PL_SIG (V, D, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (V_NAME (log1p), 1.97) -PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) -PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000) -#endif +PL_TEST_ULP (V_NAME_D1 (log1p), 1.97) +PL_TEST_EXPECT_FENV (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000) +PL_TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500) diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h index e5c733964bc0..bd57bfc6fe6e 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h @@ -1,77 +1,91 @@ /* * Helper for vector double-precision routines which calculate log(1 + x) and do * not need special-case handling * * Copyright (c) 2022-2023, Arm 
Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef PL_MATH_V_LOG1P_INLINE_H #define PL_MATH_V_LOG1P_INLINE_H #include "v_math.h" -#include "pairwise_horner.h" +#include "poly_advsimd_f64.h" -#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) -#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) -#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ -#define OneMHfRt2Top \ - 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ - << 32. */ -#define OneTop 0x3ff -#define BottomMask 0xffffffff -#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ +struct v_log1p_data +{ + float64x2_t poly[19], ln2[2]; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; + int64x2_t one_top; +}; + +/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ +#define V_LOG1P_CONSTANTS_TABLE \ + { \ + .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \ + V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \ + V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \ + V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \ + V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \ + V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \ + V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \ + V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \ + V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \ + V2 (-0x1.cfa7385bdb37ep-6) }, \ + .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \ + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ + } -#define C(i) v_f64 (__log1p_data.coeffs[i]) +#define BottomMask v_u64 (0xffffffff) -static inline v_f64_t -log1p_inline (v_f64_t x) +static inline float64x2_t +log1p_inline (float64x2_t x, const struct v_log1p_data *d) { /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several modifications: - No special-case handling - this should be dealt with by the caller. - Pairwise Horner polynomial evaluation for improved accuracy. - Optionally simulate the shortcut for k=0, used in the scalar routine, using v_sel, for improved accuracy when the argument to log1p is close to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in the source of the caller before including this file. See v_log1pf_2u1.c for details of the algorithm. */ - v_f64_t m = x + 1; - v_u64_t mi = v_as_u64_f64 (m); - v_u64_t u = mi + OneMHfRt2Top; + float64x2_t m = vaddq_f64 (x, v_f64 (1)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop; - v_f64_t k = v_to_f64_s64 (ki); + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ - v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; - v_u64_t u_red = utop | (mi & BottomMask); - v_f64_t f = v_as_f64_u64 (u_red) - 1; + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); /* Correction term c/m. 
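Why a correction term is needed at all: m = fl(1 + x) may not equal 1 + x, and for small x the lost low bits dominate the result. The demonstration program below is not part of the patch; it picks an x where 1 + x rounds all the way back to 1, so log(m) is 0 and c/m carries the entire answer.

    #include <stdio.h>

    int
    main (void)
    {
      double x = 0x1p-53;                /* halfway case: 1 + x rounds to 1
                                            under ties-to-even */
      double m = x + 1.0;                /* m == 1.0 */
      double cm = (x - (m - 1.0)) / m;   /* c/m == x, recovered exactly */
      printf ("m = %a, c/m = %a\n", m, cm);
      return 0;
    }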
*/ - v_f64_t cm = (x - (m - 1)) / m; + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); #ifndef WANT_V_LOG1P_K0_SHORTCUT #error \ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" #elif WANT_V_LOG1P_K0_SHORTCUT /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is - that the approximation is solely the polynomial. */ - v_u64_t k0 = k == 0; - if (unlikely (v_any_u64 (k0))) - { - cm = v_sel_f64 (k0, v_f64 (0), cm); - f = v_sel_f64 (k0, x, f); - } + that the approximation is solely the polynomial. */ + uint64x2_t k0 = vceqzq_f64 (k); + cm = v_zerofy_f64 (cm, k0); + f = vbslq_f64 (k0, x, f); #endif /* Approximate log1p(f) on the reduced input using a polynomial. */ - v_f64_t f2 = f * f; - v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C); + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ - v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); - v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); - return v_fma_f64 (f2, p, ylo + yhi); + float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); + float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); } #endif // PL_MATH_V_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c index 4a7732b403ec..153c88da9c88 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c @@ -1,160 +1,126 @@ /* * Single-precision vector log(1+x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f32.h" -#if V_SUPPORTED - -#define AbsMask 0x7fffffff -#define TinyBound 0x340 /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ -#define MinusOne 0xbf800000 -#define Ln2 (0x1.62e43p-1f) -#define Four 0x40800000 -#define ThreeQuarters v_u32 (0x3f400000) - -#define C(i) v_f32 (__log1pf_data.coeffs[i]) - -static inline v_f32_t -eval_poly (v_f32_t m) +const static struct data { -#ifdef V_LOG1PF_1U3 - - /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. */ - v_f32_t p = v_fma_f32 (C (8), m, C (7)); - p = v_fma_f32 (p, m, C (6)); - p = v_fma_f32 (p, m, C (5)); - p = v_fma_f32 (p, m, C (4)); - p = v_fma_f32 (p, m, C (3)); - p = v_fma_f32 (p, m, C (2)); - p = v_fma_f32 (p, m, C (1)); - p = v_fma_f32 (p, m, C (0)); - return v_fma_f32 (m, m * p, m); - -#elif defined(V_LOG1PF_2U5) - - /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ - v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); - v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); - v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); - v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); - - v_f32_t m2 = m * m; - v_f32_t p_02 = v_fma_f32 (m2, p_12, m); - v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); - v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); - - v_f32_t m4 = m2 * m2; - v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - - return v_fma_f32 (m4, m4 * p_79, p_06); - -#else -#error No precision specified for v_log1pf -#endif + float32x4_t poly[8], ln2; + uint32x4_t tiny_bound, minus_one, four, thresh; + int32x4_t three_quarters; +} data = { + .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more + efficiently. 
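The implied leading terms are easy to miss when reading the evaluation code. A scalar sketch of where 1 and -0.5 re-enter (illustrative only; plain Horner instead of the split Estrin used in the routine, and the helper name is made up):

    /* With the eight stored tail coefficients c[0..7], evaluate
       log(1+m) ~= m + m^2 * (-0.5 + m*c0 + m^2*c1 + ...).  */
    static float
    log1pf_poly_sketch (float m, const float c[8])
    {
      float p = c[7];
      for (int i = 6; i >= 0; i--)
        p = p * m + c[i];              /* Horner for the stored tail */
      p = p * m - 0.5f;                /* splice in the implied -1/2 */
      return m + m * m * p;            /* implied leading coefficient 1 */
    }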
*/ + V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), + V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), + V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, + .ln2 = V4 (0x1.62e43p-1f), + .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */ + .minus_one = V4 (0xbf800000), + .four = V4 (0x40800000), + .three_quarters = V4 (0x3f400000) +}; + +static inline float32x4_t +eval_poly (float32x4_t m, const float32x4_t *p) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */ + float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]); + float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]); + float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]); + float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]); + + float32x4_t m2 = vmulq_f32 (m, m); + float32x4_t p_02 = vfmaq_f32 (m, m2, p_12); + float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56); + float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]); + + float32x4_t m4 = vmulq_f32 (m2, m2); + float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36); + return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79)); } -static inline float -handle_special (float x) +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { - uint32_t ix = asuint (x); - uint32_t ia = ix & AbsMask; - if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000) - { - /* x == -Inf => log1pf(x) = NaN. - x < -1.0 => log1pf(x) = NaN. - x == +/-NaN => log1pf(x) = NaN. */ -#if WANT_SIMD_EXCEPT - return __math_invalidf (asfloat (ia)); -#else - return NAN; -#endif - } - if (ix == 0xbf800000) - { - /* x == -1.0 => log1pf(x) = -Inf. */ -#if WANT_SIMD_EXCEPT - return __math_divzerof (ix); -#else - return -INFINITY; -#endif - } - /* |x| < TinyBound => log1p(x) = x. */ - return x; + return v_call_f32 (log1pf, x, y, special); } -/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is - the same as for the scalar algorithm, i.e. worst-case error when using Estrin +/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is roughly 2.02 ULP: log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ -VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) +VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8); - v_u32_t special_cases - = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound)) - | v_cond_u32 (ix >= MinusOne); - v_f32_t special_arg = x; + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t special_cases + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh), + vcgeq_u32 (ix, d->minus_one)); + float32x4_t special_arg = x; #if WANT_SIMD_EXCEPT if (unlikely (v_any_u32 (special_cases))) /* Side-step special lanes so fenv exceptions are not triggered inadvertently. */ - x = v_sel_f32 (special_cases, v_f32 (1), x); + x = v_zerofy_f32 (x, special_cases); #endif /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m is in [-0.25, 0.5]): log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). We approximate log1p(m) with a polynomial, then scale by k*log(2). Instead of doing this directly, we use an intermediate scale factor s = 4*k*log(2) to ensure the scale is representable as a normalised fp32 number. 
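A scalar model of that scaling (illustrative only; the helper name is made up, and x >= -0.25 is assumed so unsigned bit arithmetic suffices). k is kept in exponent-field form, i.e. as the bit pattern of 2^k, so subtracting it from a float's bits multiplies by 2^-k; s = 4 * 2^-k stays a normal number even when 2^-k alone would not.

    #include <stdint.h>
    #include <string.h>

    static float
    log1pf_reduce_sketch (float x)
    {
      float m = x + 1.0f;
      uint32_t mi;
      memcpy (&mi, &m, sizeof mi);
      uint32_t k = (mi - 0x3f400000u) & 0xff800000u;  /* bits of 2^k */
      uint32_t xi;
      memcpy (&xi, &x, sizeof xi);
      float s, m_scale;
      uint32_t si = 0x40800000u - k;     /* s = 4 * 2^-k */
      memcpy (&s, &si, sizeof s);
      uint32_t msi = xi - k;             /* x * 2^-k */
      memcpy (&m_scale, &msi, sizeof m_scale);
      m_scale += 0.25f * s - 1.0f;       /* (1 + x) * 2^-k - 1 */
      /* (float) ((int32_t) k >> 23) would recover the integer exponent
         for the final k*log(2) step.  */
      return m_scale;                    /* in [-0.25, 0.5] */
    }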
*/ - v_f32_t m = x + v_f32 (1.0f); + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); /* Choose k to scale x to the range [-1/4, 1/2]. */ - v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000); + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); /* Scale x by exponent manipulation. */ - v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k)); + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); /* Scale up to ensure that the scale factor is representable as normalised fp32 number, and scale m down accordingly. */ - v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); - m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); /* Evaluate polynomial on the reduced interval. */ - v_f32_t p = eval_poly (m_scale); + float32x4_t p = eval_poly (m_scale, d->poly); /* The scale factor to be applied back at the end - by multiplying float(k) by 2^-23 we get the unbiased exponent of k. */ - v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f); + float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23)); /* Apply the scaling back. */ - v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p); + float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2); if (unlikely (v_any_u32 (special_cases))) - return v_call_f32 (handle_special, special_arg, y, special_cases); + return special_case (special_arg, y, special_cases); return y; } -VPCS_ALIAS PL_SIG (V, F, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (V_NAME (log1pf), 1.53) -PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000) -PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000) -PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000) -#endif +PL_TEST_ULP (V_NAME_F1 (log1p), 1.53) +PL_TEST_EXPECT_FENV (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h index e3048e667c26..c654c6bad08f 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h @@ -1,55 +1,67 @@ /* * Helper for single-precision routines which calculate log(1 + x) and do not * need special-case handling * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef PL_MATH_V_LOG1PF_INLINE_H #define PL_MATH_V_LOG1PF_INLINE_H #include "v_math.h" -#include "math_config.h" +#include "poly_advsimd_f32.h" -#define Four 0x40800000 -#define Ln2 v_f32 (0x1.62e43p-1f) - -#define C(i) v_f32 (__log1pf_data.coeffs[i]) - -static inline v_f32_t -eval_poly (v_f32_t m) +struct v_log1pf_data { - /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
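For reference, the Estrin idea in isolation: instead of one serial Horner chain, coefficients are paired and the pairs combined with m^2 and m^4, trading a couple of extra multiplies for a much shorter dependency chain. A degree-7 sketch (illustrative; names made up):

    static float
    estrin_7_sketch (float m, const float c[8])
    {
      float m2 = m * m, m4 = m2 * m2;
      float p01 = c[0] + m * c[1];   /* independent pairs ... */
      float p23 = c[2] + m * c[3];
      float p45 = c[4] + m * c[5];
      float p67 = c[6] + m * c[7];
      float p03 = p01 + m2 * p23;    /* ... then combine by powers */
      float p47 = p45 + m2 * p67;
      return p03 + m4 * p47;
    }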
*/ - v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); - v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); - v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); - v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + float32x4_t poly[8], ln2; + uint32x4_t four; + int32x4_t three_quarters; +}; - v_f32_t m2 = m * m; - v_f32_t p_02 = v_fma_f32 (m2, p_12, m); - v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); - v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); +/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more efficiently. */ +#define V_LOG1PF_CONSTANTS_TABLE \ + { \ + .poly \ + = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \ + V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \ + V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \ + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ + .three_quarters = V4 (0x3f400000) \ + } - v_f32_t m4 = m2 * m2; - v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); - - return v_fma_f32 (m4, m4 * p_79, p_06); +static inline float32x4_t +eval_poly (float32x4_t m, const float32x4_t *c) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine + uses split Estrin, but this way reduces register pressure in the calling + routine). */ + float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]); + float32x4_t m2 = vmulq_f32 (m, m); + q = vfmaq_f32 (m, m2, q); + float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1); + p = vmulq_f32 (m2, p); + return vfmaq_f32 (q, m2, p); } -static inline v_f32_t -log1pf_inline (v_f32_t x) +static inline float32x4_t +log1pf_inline (float32x4_t x, const struct v_log1pf_data d) { /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no special-case handling. See that file for details of the algorithm. */ - v_f32_t m = x + 1.0f; - v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; - v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); - v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) - + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); - v_f32_t p = eval_poly (m_scale); - v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; - return v_fma_f32 (scale_back, Ln2, p); + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku)); + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); + float32x4_t p = eval_poly (m_scale, d.poly); + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); + return vfmaq_f32 (p, scale_back, d.ln2); } #endif // PL_MATH_V_LOG1PF_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_3u.c b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c index fac73f60c600..2dd2c34b7c97 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log2_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c @@ -1,100 +1,109 @@ /* * Double-precision vector log2 function. * * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "include/mathlib.h" #include "pl_sig.h" #include "pl_test.h" +#include "poly_advsimd_f64.h" -#if V_SUPPORTED - -#define InvLn2 v_f64 (0x1.71547652b82fep0) #define N (1 << V_LOG2_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) -#define P(i) v_f64 (__v_log2_data.poly[i]) + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t invln2; + uint64x2_t sign_exp_mask; +} data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ + .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), + V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), + V2 (-0x1.ec738d616fe26p-3) }, + .invln2 = V2 (0x1.71547652b82fep0), + .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ + .sign_exp_mask = V2 (0xfff0000000000000), +}; + +#define Off v_u64 (0x3fe6900900000000) +#define IndexMask (N - 1) struct entry { - v_f64_t invc; - v_f64_t log2c; + float64x2_t invc; + float64x2_t log2c; }; static inline struct entry -lookup (v_u64_t i) +lookup (uint64x2_t i) { struct entry e; -#ifdef SCALAR - e.invc = __v_log2_data.tab[i].invc; - e.log2c = __v_log2_data.tab[i].log2c; -#else - e.invc[0] = __v_log2_data.tab[i[0]].invc; - e.log2c[0] = __v_log2_data.tab[i[0]].log2c; - e.invc[1] = __v_log2_data.tab[i[1]].invc; - e.log2c[1] = __v_log2_data.tab[i[1]].log2c; -#endif + uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log2c = vuzp2q_f64 (e0, e1); return e; } -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, + uint32x2_t special) { - return v_call_f64 (log2, x, y, cmp); + return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); } -/* Double-precision vector log2 routine. Implements the same algorithm as vector - log10, with coefficients and table entries scaled in extended precision. - The maximum observed error is 2.58 ULP: - __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 - want 0x1.fffb34198d9ddp-5. */ -VPCS_ATTR -v_f64_t V_NAME (log2) (v_f64_t x) +/* Double-precision vector log2 routine. Implements the same algorithm as + vector log10, with coefficients and table entries scaled in extended + precision. The maximum observed error is 2.58 ULP: + _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t top = ix >> 48; - v_u64_t special - = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. 
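A scalar model of this split (illustrative only; V_LOG2_TABLE_BITS is assumed to be 7 here, consistent with the 128-entry tables used by the related log routines, and x is assumed positive and normal):

    #include <stdint.h>
    #include <string.h>

    #define TABLE_BITS 7       /* assumed value of V_LOG2_TABLE_BITS */

    static void
    log2_reduce_sketch (double x, int64_t *k, double *z, uint64_t *i)
    {
      uint64_t ix;
      memcpy (&ix, &x, sizeof ix);
      uint64_t tmp = ix - 0x3fe6900900000000ULL;   /* Off */
      *i = (tmp >> (52 - TABLE_BITS)) & ((1 << TABLE_BITS) - 1);
      *k = (int64_t) tmp >> 52;                    /* arithmetic shift */
      /* Clear k out of the exponent so z = x * 2^-k is exact.  */
      uint64_t iz = ix - (tmp & 0xfff0000000000000ULL);
      memcpy (z, &iz, sizeof *z);
    }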
*/ - v_u64_t tmp = ix - OFF; - v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; - v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ - v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); - v_f64_t z = v_as_f64_u64 (iz); - struct entry e = lookup (i); + uint64x2_t tmp = vsubq_u64 (ix, Off); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (tmp); /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ - v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - v_f64_t kd = v_to_f64_s64 (k); - v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); - v_f64_t r2 = r * r; - v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); - v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); - v_f64_t y = v_fma_f64 (P (4), r2, p_23); - y = v_fma_f64 (r2, y, p_01); - y = v_fma_f64 (r2, y, kd + w); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); + w = vaddq_f64 (kd, w); - if (unlikely (v_any_u64 (special))) - return specialcase (x, y, special); - return y; + if (unlikely (v_any_u32h (special))) + return special_case (x, y, w, r2, special); + return vfmaq_f64 (w, r2, y); } -VPCS_ALIAS PL_SIG (V, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2), 2.09) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2)) -PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000) -#endif +PL_TEST_ULP (V_NAME_D1 (log2), 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log2)) +PL_TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_data.c b/contrib/arm-optimized-routines/pl/math/v_log2_data.c index 2a1da6823fbc..50697daff925 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log2_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_log2_data.c @@ -1,155 +1,153 @@ /* * Coefficients and table entries for vector log2 * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << V_LOG2_TABLE_BITS) -// clang-format off - const struct v_log2_data __v_log2_data = { -/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6. - Each coefficient was scaled by log2(e) in extended precision and rounded back to - double. */ -.poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, - 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. 
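The scaling step can be reproduced in a few lines. The program below is illustrative rather than the actual generator: it multiplies the first log(1+r) coefficient from v_log_data.c by log2(e) in long double and should print a value matching the first coefficient stored here (assuming long double is wider than double, as on AArch64 and x86_64).

    #include <stdio.h>

    int
    main (void)
    {
      long double log2e = 0x1.71547652b82fe178p+0L;  /* log2(e) */
      double log_c0 = -0x1.ffffffffffff7p-2;         /* from v_log_data.c */
      printf ("%a\n", (double) (log_c0 * log2e));    /* ~ -0x1.71547652b83p-1 */
      return 0;
    }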
*/ + .poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, + 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, + + .invln2 = 0x1.71547652b82fep0, -/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was - calculated by scaling log10(c) by log2(10) in extended precision and rounding - back. */ -.tab = { -{ 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, -{ 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, -{ 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, -{ 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, -{ 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, -{ 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, -{ 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, -{ 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, -{ 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, -{ 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, -{ 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, -{ 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, -{ 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, -{ 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, -{ 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, -{ 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, -{ 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, -{ 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, -{ 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, -{ 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, -{ 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, -{ 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, -{ 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, -{ 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, -{ 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, -{ 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, -{ 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, -{ 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, -{ 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, -{ 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, -{ 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, -{ 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, -{ 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, -{ 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, -{ 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, -{ 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, -{ 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, -{ 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, -{ 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, -{ 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 }, -{ 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, -{ 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, -{ 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, -{ 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, -{ 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, -{ 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, -{ 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, -{ 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, -{ 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, -{ 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, -{ 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, -{ 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, -{ 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, -{ 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, -{ 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, -{ 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, -{ 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, -{ 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, -{ 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, -{ 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, -{ 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, -{ 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, -{ 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, -{ 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, -{ 
0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, -{ 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, -{ 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, -{ 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, -{ 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, -{ 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, -{ 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, -{ 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, -{ 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, -{ 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, -{ 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, -{ 1.0, 0.0 }, -{ 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, -{ 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, -{ 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, -{ 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, -{ 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, -{ 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, -{ 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, -{ 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, -{ 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, -{ 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, -{ 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, -{ 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, -{ 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, -{ 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, -{ 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, -{ 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, -{ 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, -{ 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, -{ 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, -{ 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, -{ 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, -{ 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, -{ 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, -{ 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, -{ 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, -{ 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, -{ 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, -{ 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, -{ 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, -{ 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, -{ 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, -{ 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, -{ 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, -{ 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, -{ 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, -{ 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, -{ 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, -{ 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, -{ 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, -{ 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, -{ 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, -{ 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, -{ 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, -{ 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, -{ 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 }, -{ 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, -{ 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, -{ 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, -{ 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, -{ 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, -{ 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, -{ 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 }} + /* Derived from tables in v_log_data.c in a similar way as v_log10_data.c. + This means invc is unchanged and log2c was calculated by scaling log(c) by + log2(e) in extended precision and rounding back to double precision. 
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, + { 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, + { 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, + { 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, + { 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, + { 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, + { 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, + { 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, + { 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, + { 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, + { 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, + { 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, + { 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, + { 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, + { 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, + { 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, + { 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, + { 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, + { 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, + { 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, + { 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, + { 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, + { 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, + { 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, + { 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, + { 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, + { 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, + { 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 }, + { 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, + { 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, + { 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, + { 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, + { 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, + { 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, + { 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, + { 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, + { 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, + { 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, + { 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, + { 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, + { 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, + { 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, + { 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, + { 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, + { 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, + { 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, + { 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, + { 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, + { 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, + { 
0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, + { 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, + { 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, + { 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, + { 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, + { 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, + { 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, + { 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, + { 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, + { 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, + { 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, + { 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, + { 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, + { 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, + { 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, + { 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, + { 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, + { 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, + { 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, + { 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, + { 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, + { 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, + { 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, + { 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, + { 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, + { 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, + { 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, + { 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, + { 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, + { 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, + { 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, + { 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, + { 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, + { 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, + { 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, + { 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, + { 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, + { 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 } } }; -// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c index 8f9241bed8e6..c64d88742136 100644 --- a/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c @@ -1,68 +1,77 @@ /* * Single-precision vector log2 function. * * Copyright (c) 2022-2023, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pairwise_hornerf.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED -#define C(i) v_f32 (__v_log2f_data.poly[i]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + uint32x4_t off, mantissa_mask; + float32x4_t poly[9]; +} data = { + /* Coefficients generated using Remez algorithm approximate + log2(1+r)/r for r in [ -1/3, 1/3 ]. + rel error: 0x1.c4c4b0cp-26. */ + .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), + V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), + V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; -VPCS_ATTR -NOINLINE static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, + uint16x4_t cmp) { /* Fall back to scalar code. */ - return v_call_f32 (log2f, x, y, cmp); + return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); } -/* Fast implementation for single precision log2, - relies on same argument reduction as Neon logf. +/* Fast implementation for single precision AdvSIMD log2, + relies on same argument reduction as AdvSIMD logf. Maximum error: 2.48 ULPs - __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 - want 0x1.a9be8p-2. */ -VPCS_ATTR -v_f32_t V_NAME (log2f) (v_f32_t x) + _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x) { - v_u32_t u = v_as_u32_f32 (x); - v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min); + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u -= Off; - v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ - u &= Mask; - u += Off; - v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f); + u = vsubq_u32 (u, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log2(1+r) + n. 
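The full evaluation in scalar form, for reference (illustrative only; plain Horner rather than the pairwise scheme, and the helper name is made up). Because the stored c[0] is 1/ln2, the polynomial already yields log2(1+r)/r, so the result is simply p*r + n:

    static float
    log2f_eval_sketch (float r, float n, const float c[9])
    {
      float p = c[8];
      for (int i = 7; i >= 0; i--)
        p = p * r + c[i];      /* P(r) ~ log2(1+r)/r */
      return p * r + n;        /* y = log2(1+r) + n */
    }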
*/ - v_f32_t r2 = r * r; - v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C); - v_f32_t y = v_fma_f32 (p, r, n); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; + if (unlikely (v_any_u16h (special))) + return special_case (x, n, p, r, special); + return vfmaq_f32 (n, p, r); } -VPCS_ALIAS PL_SIG (V, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME (log2f), 1.99) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f)) -PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) -#endif +PL_TEST_ULP (V_NAME_F1 (log2), 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log2)) +PL_TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_data.c b/contrib/arm-optimized-routines/pl/math/v_log2f_data.c deleted file mode 100644 index b144e8f4992d..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_log2f_data.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Coefficients for vector log2f - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* See tools/v_log2f.sollya for the algorithm used to generate these - coefficients. */ -const struct v_log2f_data __v_log2f_data - = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */ - -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f, - -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log_data.c b/contrib/arm-optimized-routines/pl/math/v_log_data.c new file mode 100644 index 000000000000..a26e8a051d97 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log_data.c @@ -0,0 +1,161 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct v_log_data __v_log_data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 }, + .ln2 = 0x1.62e42fefa39efp-1, + /* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from two lookup + tables: + + table[i].invc = 1/c + table[i].logc = (double)log(c) + + where c is near the center of the subinterval and is chosen by trying + several floating point invc candidates around 1/center and selecting one + for which the error in (double)log(c) is minimized (< 0x1p-74), except the + subinterval that contains 1 and the previous one got tweaked to avoid + cancellation. 
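How a table entry is consumed downstream (an illustrative sketch mirroring the vector log routine; names are made up). Because c is near the centre of z's subinterval, r = z*invc - 1 stays within roughly +/-0x1.fp-9, which is what makes the short five-coefficient polynomial above sufficient:

    static double
    log_from_entry_sketch (double z, double k, double invc, double logc,
                           const double poly[5], double ln2)
    {
      double r = z * invc - 1.0;   /* tiny by construction */
      double r2 = r * r;
      double p = poly[4];
      for (int i = 3; i >= 0; i--)
        p = p * r + poly[i];
      /* log(x) = k*ln2 + log(c) + log1p(r), log1p(r) ~ r + r^2*P(r).  */
      return k * ln2 + logc + r + r2 * p;
    }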
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 
0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log_inline.h b/contrib/arm-optimized-routines/pl/math/v_log_inline.h new file mode 100644 index 000000000000..2df00cf4ddf4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log_inline.h @@ -0,0 +1,104 @@ +/* + * Double-precision vector log(x) function - inline version + * + * Copyright (c) 2019-2023, Arm Limited. 
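+ *
+ * Usage sketch (editorial note; the consumer shown is hypothetical): the
+ * including routine picks the polynomial order before inclusion and keeps
+ * the constants in its own data, e.g.
+ *   #define V_LOG_INLINE_POLY_ORDER 5
+ *   #include "v_log_inline.h"
+ *   static const struct v_log_inline_data data = V_LOG_CONSTANTS;
+ *   ...
+ *   float64x2_t y = v_log_inline (x, &data);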
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "math_config.h" + +#ifndef V_LOG_INLINE_POLY_ORDER +# error Cannot use inline log helper without specifying poly order (options are 4 or 5) +#endif + +#if V_LOG_INLINE_POLY_ORDER == 4 +# define POLY \ + { \ + V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \ + V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \ + } +#elif V_LOG_INLINE_POLY_ORDER == 5 +# define POLY \ + { \ + V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \ + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \ + V2 (-0x1.554e550bd501ep-3) \ + } +#else +# error Can only choose order 4 or 5 for log poly +#endif + +struct v_log_inline_data +{ + float64x2_t poly[V_LOG_INLINE_POLY_ORDER]; + float64x2_t ln2; + uint64x2_t off, sign_exp_mask; +}; + +#define V_LOG_CONSTANTS \ + { \ + .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \ + .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \ + } + +#define A(i) d->poly[i] +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +log_lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static inline float64x2_t +v_log_inline (float64x2_t x, const struct v_log_inline_data *d) +{ + float64x2_t z, r, r2, p, y, kd, hi; + uint64x2_t ix, iz, tmp; + int64x2_t k; + struct entry e; + + ix = vreinterpretq_u64_f64 (x); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = vsubq_u64 (ix, d->off); + k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + z = vreinterpretq_f64_u64 (iz); + e = log_lookup (tmp); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (A (2), A (3), r); + p = vfmaq_f64 (A (0), A (1), r); +#if V_LOG_INLINE_POLY_ORDER == 5 + y = vfmaq_f64 (y, A (4), r2); +#endif + y = vfmaq_f64 (p, y, r2); + + return vfmaq_f64 (hi, y, r2); +} diff --git a/contrib/arm-optimized-routines/pl/math/v_logf_inline.h b/contrib/arm-optimized-routines/pl/math/v_logf_inline.h new file mode 100644 index 000000000000..c00fe0909afc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_logf_inline.h @@ -0,0 +1,59 @@ +/* + * Single-precision vector log function - inline version + * + * Copyright (c) 2019-2023, Arm Limited.
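+ *
+ * Usage sketch (editorial note; the consumer shown is hypothetical): as
+ * with v_log_inline.h above, the including routine owns the constants,
+ * e.g.
+ *   static const struct v_logf_data data = V_LOGF_CONSTANTS;
+ *   float32x4_t y = v_logf_inline (x, &data);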
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +struct v_logf_data +{ + float32x4_t poly[7]; + float32x4_t ln2; + uint32x4_t off, mantissa_mask; +}; + +#define V_LOGF_CONSTANTS \ + { \ + .poly \ + = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), \ + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), \ + V4 (-0x1.ffffc8p-2f) }, \ + .ln2 = V4 (0x1.62e43p-1f), .off = V4 (0x3f2aaaab), \ + .mantissa_mask = V4 (0x007fffff) \ + } + +#define P(i) d->poly[7 - i] + +static inline float32x4_t +v_logf_inline (float32x4_t x, const struct v_logf_data *d) +{ + float32x4_t n, p, q, r, r2, y; + uint32x4_t u; + + u = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = vsubq_u32 (u, d->off); + n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vandq_u32 (u, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + p = vfmaq_f32 (P (5), P (6), r); + q = vfmaq_f32 (P (3), P (4), r); + y = vfmaq_f32 (P (1), P (2), r); + p = vfmaq_f32 (p, P (7), r2); + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + return vfmaq_f32 (p, y, r2); +} + +#undef P diff --git a/contrib/arm-optimized-routines/pl/math/v_math.h b/contrib/arm-optimized-routines/pl/math/v_math.h index a8fa091a7cbf..1b10929faccc 100644 --- a/contrib/arm-optimized-routines/pl/math/v_math.h +++ b/contrib/arm-optimized-routines/pl/math/v_math.h @@ -1,855 +1,175 @@ /* * Vector math abstractions. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _V_MATH_H #define _V_MATH_H #ifndef WANT_VMATH /* Enable the build of vector math code. */ # define WANT_VMATH 1 #endif -#if WANT_VMATH - -/* The goal of this header is to allow vector (only Neon for now) - and scalar build of the same algorithm. */ -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif - -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif +#if WANT_VMATH -#include -#include "math_config.h" +# if __aarch64__ +# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +# else +# error "Cannot build without AArch64" +# endif -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; +# include +# include "math_config.h" +# if __aarch64__ -/* reinterpret as type1 from type2. 
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} +# include -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; +/* Shorthand helpers for declaring constants. */ +# define V2(X) { X, X } +# define V4(X) { X, X, X, X } +# define V8(X) { X, X, X, X, X, X, X, X } static inline int -v_lanes32 (void) -{ - return 1; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return x; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return x; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return x; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) +v_any_u16h (uint16x4_t x) { - return x; + return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; } -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) +static inline float32x4_t +v_f32 (float x) { - *x = v; + return (float32x4_t) V4 (x); } -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) +static inline uint32x4_t +v_u32 (uint32_t x) { - *x = v; + return (uint32x4_t) V4 (x); } -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) +static inline int32x4_t +v_s32 (int32_t x) { - *x = v; + return (int32x4_t) V4 (x); } -/* true if any elements of a v_cond result is non-zero. */ +/* true if any elements of a vector compare result is non-zero. */ static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_u32_t -v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) -{ - return (y & ~m) | (x & m); -} -static inline v_u32_t -v_cagt_f32 (v_f32_t x, v_f32_t y) -{ - return fabsf (x) > fabsf (y); -} -/* to wrap |x| >= |y|. */ -static inline v_u32_t -v_cage_f32 (v_f32_t x, v_f32_t y) -{ - return fabsf (x) >= fabsf (y); -} -static inline v_u32_t -v_calt_f32 (v_f32_t x, v_f32_t y) -{ - return fabsf (x) < fabsf (y); -} -static inline v_f32_t -v_div_f32 (v_f32_t x, v_f32_t y) -{ - return x / y; -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return p ? x : y; -} -static inline v_u32_t -v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) -{ - return p ? 
x : y; -} -static inline v_f32_t -v_sqrt_f32 (v_f32_t x) -{ - return __builtin_sqrtf (x); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_s32_t -v_to_s32_f32 (v_f32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_s32_t -v_as_s32_f32 (v_f32_t x) -{ - union - { - v_f32_t f; - v_s32_t u; - } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* true if all elements of a v_cond result is non-zero. */ -static inline int -v_all_u64 (v_u64_t x) -{ - return x; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_u64_t -v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) -{ - return (y & ~m) | (x & m); -} -static inline v_u64_t -v_cagt_f64 (v_f64_t x, v_f64_t y) -{ - return fabs (x) > fabs (y); -} -static inline v_f64_t -v_div_f64 (v_f64_t x, v_f64_t y) -{ - return x / y; -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_min_f64(v_f64_t x, v_f64_t y) { - return x < y ? x : y; -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return p ? x : y; -} -static inline v_f64_t -v_sqrt_f64 (v_f64_t x) -{ - return __builtin_sqrt (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -static inline v_u64_t -v_trunc_u64 (v_f64_t x) -{ - return __builtin_trunc (x); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} - -static inline v_s64_t -v_to_s64_f64 (v_f64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} -static inline v_f64_t -v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, - v_u64_t p) -{ - return f (x1, x2); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) +v_any_u32 (uint32x4_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; } -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return vabsq_f32 (x); -} -static inline v_u32_t -v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) -{ - return vbslq_u32 (m, x, y); -} -static inline v_u32_t -v_cagt_f32 (v_f32_t x, v_f32_t y) -{ - return vcagtq_f32 (x, y); -} -/* to wrap |x| >= |y|. */ -static inline v_u32_t -v_cage_f32 (v_f32_t x, v_f32_t y) -{ - return vcageq_f32 (x, y); -} -static inline v_u32_t -v_calt_f32 (v_f32_t x, v_f32_t y) -{ - return vcaltq_f32 (x, y); -} -static inline v_f32_t -v_div_f32 (v_f32_t x, v_f32_t y) -{ - return vdivq_f32 (x, y); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -static inline v_f32_t -v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) -{ - return vbslq_f32 (p, x, y); -} -static inline v_u32_t -v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) -{ - return vbslq_u32 (p, x, y); -} -static inline v_f32_t -v_sqrt_f32 (v_f32_t x) -{ - return vsqrtq_f32 (x); -} -/* convert to type1 from type2. 
*/ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_s32_t -v_to_s32_f32 (v_f32_t x) -{ - return vcvtq_s32_f32 (x); -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_s32_t -v_as_s32_f32 (v_f32_t x) -{ - union - { - v_f32_t f; - v_s32_t u; - } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) +static inline int +v_any_u32h (uint32x2_t x) { - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; } -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) { - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; + return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; } -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) { - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; + return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; } -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) { - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; + return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] }; } -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) { - return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; + return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3] }; } -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) +static inline float32x4_t +v_zerofy_f32 (float32x4_t x, uint32x4_t mask) { - return ( - v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; + return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask)); } -static inline int -v_lanes64 (void) +static inline float64x2_t +v_f64 (double x) { - return 2; + return (float64x2_t) V2 (x); } -static inline v_f64_t -v_f64 (f64_t x) +static inline uint64x2_t +v_u64 (uint64_t x) { - return (v_f64_t){x, x}; + return (uint64x2_t) V2 (x); } -static inline v_u64_t -v_u64 (u64_t x) +static inline int64x2_t +v_s64 (int64_t x) { - return (v_u64_t){x, x}; + return (int64x2_t) V2 (x); } -static inline v_s64_t -v_s64 (s64_t x) -{ - return (v_s64_t){x, x}; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x[i]; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - (*x)[i] = v; -} -/* true if any elements of a v_cond result is non-zero. */ + +/* true if any elements of a vector compare result is non-zero. 
*/ static inline int -v_any_u64 (v_u64_t x) +v_any_u64 (uint64x2_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (x) != 0; } -/* true if all elements of a v_cond result is 1. */ +/* true if all elements of a vector compare result is 1. */ static inline int -v_all_u64 (v_u64_t x) +v_all_u64 (uint64x2_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; } -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return vabsq_f64 (x); -} -static inline v_u64_t -v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) -{ - return vbslq_u64 (m, x, y); -} -static inline v_u64_t -v_cagt_f64 (v_f64_t x, v_f64_t y) +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) { - return vcagtq_f64 (x, y); + return (float64x2_t){ tab[idx[0]], tab[idx[1]] }; } -static inline v_f64_t -v_div_f64 (v_f64_t x, v_f64_t y) +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) { - return vdivq_f64 (x, y); + return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; } -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return vfmaq_f64 (z, x, y); -} -static inline v_f64_t -v_min_f64(v_f64_t x, v_f64_t y) { - return vminq_f64(x, y); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return vrndaq_f64 (x); -} -static inline v_f64_t -v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) -{ - return vbslq_f64 (p, x, y); -} -static inline v_f64_t -v_sqrt_f64 (v_f64_t x) -{ - return vsqrtq_f64 (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return vcvtaq_s64_f64 (x); -} -static inline v_u64_t -v_trunc_u64 (v_f64_t x) -{ - return vcvtq_u64_f64 (x); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_s64_t -v_to_s64_f64 (v_f64_t x) -{ - return vcvtq_s64_f64 (x); -} -/* reinterpret as type1 from type2. */ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return (v_f64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) + +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) { - return (v_u64_t){tab[idx[0]], tab[idx[1]]}; + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; } -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) + +static inline float64x2_t +v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, + float64x2_t y, uint64x2_t p) { - return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? 
f (x[1]) : y[1]}; + double p1 = p[1]; + double x1h = x1[1]; + double x2h = x2[1]; + if (likely (p[0])) + y[0] = f (x1[0], x2[0]); + if (likely (p1)) + y[1] = f (x1h, x2h); + return y; } -static inline v_f64_t -v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, - v_u64_t p) +static inline float64x2_t +v_zerofy_f64 (float64x2_t x, uint64x2_t mask) { - return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0], - p[1] ? f (x1[1], x2[1]) : y[1]}; + return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); } -#endif +# endif #endif + #endif diff --git a/contrib/arm-optimized-routines/pl/math/v_pow_1u5.c b/contrib/arm-optimized-routines/pl/math/v_pow_1u5.c new file mode 100644 index 000000000000..9053347d4e35 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_pow_1u5.c @@ -0,0 +1,259 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Defines parameters of the approximation and scalar fallback. */ +#include "finite_pow.h" + +#define VecSmallExp v_u64 (SmallExp) +#define VecThresExp v_u64 (ThresExp) + +#define VecSmallPowX v_u64 (SmallPowX) +#define VecThresPowX v_u64 (ThresPowX) +#define VecSmallPowY v_u64 (SmallPowY) +#define VecThresPowY v_u64 (ThresPowY) + +static const struct data +{ + float64x2_t log_poly[7]; + float64x2_t exp_poly[3]; + float64x2_t ln2_hi, ln2_lo; + float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n; +} data = { + /* Coefficients copied from v_pow_log_data.c + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. */ + .log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2), + V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4), + V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8), + V2 (-0x1.0002b8b263fc3p-3 * -8) }, + .ln2_hi = V2 (0x1.62e42fefa3800p-1), + .ln2_lo = V2 (0x1.ef35793c76730p-45), + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 + (0.550 without fma) if |x| < ln2/512. */ + .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), + V2 (0x1.5555576a5adcep-5) }, + .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */ + .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ + .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ + .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), +}; + +#define A(i) data.log_poly[i] +#define C(i) data.exp_poly[i] + +/* This version implements an algorithm close to AOR scalar pow but + - does not implement the trick in the exp's specialcase subroutine to avoid + double-rounding, + - does not use a tail in the exponential core computation, + - and pow's exp polynomial order and table bits might differ. + + Maximum measured error is 1.04 ULPs: + _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13) + got 0x1.f71162f473251p-1 + want 0x1.f71162f473252p-1. */ + +static inline float64x2_t +v_masked_lookup_f64 (const double *table, uint64x2_t i) +{ + return (float64x2_t){ + table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)], + table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)] + }; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. 
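+
+   Editorial note: the caller below arranges this normalization by scaling
+   subnormal x by 0x1p52 and rewriting the exponent field, roughly
+     vix_norm = asuint64 (x * 0x1p52) & 0x7fffffffffffffff;
+     vix_norm -= 52ULL << 52;
+   so the biased exponent becomes negative and the usual reduction applies
+   unchanged.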
*/ +static inline float64x2_t +v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) +{ + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off)); + int64x2_t k + = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52))); + float64x2_t z = vreinterpretq_f64_u64 (iz); + float64x2_t kd = vcvtq_f64_s64 (k); + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp); + float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp); + float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp); + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); + /* k*Ln2 + log(c) + r. */ + float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi); + float64x2_t t2 = vaddq_f64 (t1, r); + float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo); + float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); + /* Evaluation is optimized assuming superscalar pipelined execution. */ + float64x2_t ar = vmulq_f64 (A (0), r); + float64x2_t ar2 = vmulq_f64 (r, ar); + float64x2_t ar3 = vmulq_f64 (r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + float64x2_t hi = vaddq_f64 (t2, ar2); + float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); + float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ + float64x2_t a56 = vfmaq_f64 (A (5), r, A (6)); + float64x2_t a34 = vfmaq_f64 (A (3), r, A (4)); + float64x2_t a12 = vfmaq_f64 (A (1), r, A (2)); + float64x2_t p = vfmaq_f64 (a34, ar2, a56); + p = vfmaq_f64 (a12, ar2, p); + p = vmulq_f64 (ar3, p); + float64x2_t lo + = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p); + float64x2_t y = vaddq_f64 (hi, lo); + *tail = vaddq_f64 (vsubq_f64 (hi, y), lo); + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ +static inline float64x2_t +v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) +{ + /* Fallback to scalar exp_inline for all lanes if any lane + contains value of x s.t. |x| <= 2^-54 or >= 512. */ + uint64x2_t abstop + = vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff)); + uint64x2_t uoflowx + = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp); + if (unlikely (v_any_u64 (uoflowx))) + return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1)); + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + float64x2_t z = vmulq_f64 (d->inv_ln2_n, x); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + float64x2_t kd = vaddq_f64 (z, d->shift); + uint64x2_t ki = vreinterpretq_u64_f64 (kd); + kd = vsubq_f64 (kd, d->shift); + float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n); + r = vfmsq_f64 (r, kd, d->ln2_lo_n); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = vaddq_f64 (r, xtail); + /* 2^(k/N) ~= scale. */ + uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); + uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. 
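+
+   Editorial note: per the table comment in v_pow_exp_data.c,
+   sbits[k % N] = asuint64 (H[k % N]) - ((k % N) << 52) / N, so adding
+   top = ki << (52 - V_POW_EXP_TABLE_BITS) restores those bits and adds
+   the integer part of k/N directly to the biased exponent field; outside
+   the stated range of k that field would over- or underflow.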
*/ + uint64x2_t sbits = v_lookup_u64 (SBits, idx); + sbits = vaddq_u64 (sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); + tmp = vfmaq_f64 (C (0), r, tmp); + tmp = vfmaq_f64 (r, r2, tmp); + float64x2_t scale = vreinterpretq_f64_u64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return vfmaq_f64 (scale, scale, tmp); +} + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + /* Case of x <= 0 is too complicated to be vectorised efficiently here, + fallback to scalar pow for all lanes if any x < 0 detected. */ + if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x)))) + return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1)); + + uint64x2_t vix = vreinterpretq_u64_f64 (x); + uint64x2_t viy = vreinterpretq_u64_f64 (y); + uint64x2_t vtopx = vshrq_n_u64 (vix, 52); + uint64x2_t vtopy = vshrq_n_u64 (viy, 52); + uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff)); + uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff)); + + /* Special cases of x or y. */ +#if WANT_SIMD_EXCEPT + /* Small or large. */ + uint64x2_t specialx + = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); + uint64x2_t specialy + = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); +#else + /* Inf or nan. */ + uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff)); + uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff)); + /* The case y==0 does not trigger a special case, since in this case it is + necessary to fix the result only if x is a signalling nan, which already + triggers a special case. We test y==0 directly in the scalar fallback. */ +#endif + uint64x2_t special = vorrq_u64 (specialx, specialy); + /* Fallback to scalar on all lanes if any lane is inf or nan. */ + if (unlikely (v_any_u64 (special))) + return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1)); + + /* Small cases of x: |x| < 0x1p-126. */ + uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX); + if (unlikely (v_any_u64 (smallx))) + { + /* Update ix if top 12 bits of x are 0. */ + uint64x2_t sub_x = vceqzq_u64 (vtopx); + if (unlikely (v_any_u64 (sub_x))) + { + /* Normalize subnormal x so exponent becomes negative. */ + uint64x2_t vix_norm + = vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52))); + vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff)); + vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); + vix = vbslq_u64 (sub_x, vix_norm, vix); + } + } + + /* Vector Log(ix, &lo). */ + float64x2_t vlo; + float64x2_t vhi = v_log_inline (vix, &vlo, d); + + /* Vector Exp(y_loghi, y_loglo). */ + float64x2_t vehi = vmulq_f64 (y, vhi); + float64x2_t velo = vmulq_f64 (y, vlo); + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); + velo = vsubq_f64 (velo, vemi); + return v_exp_inline (vehi, velo, d); +} + +PL_SIG (V, D, 2, pow) +PL_TEST_ULP (V_NAME_D2 (pow), 0.55) +PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT) +/* Wide intervals spanning the whole domain but shared between x and y. 
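+   Each V_POW_INTERVAL2 invocation below expands to four PL_TEST_INTERVAL2
+   entries, one per sign combination of the x and y ranges (editorial
+   note).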
*/ +#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +/* Around the argmaxes of the ULP error. */ +V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative and y is an odd integer, an even integer, or a non-integer real. */ +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* 1.0^y. */ +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/contrib/arm-optimized-routines/math/v_exp_data.c b/contrib/arm-optimized-routines/pl/math/v_pow_exp_data.c similarity index 64% rename from contrib/arm-optimized-routines/math/v_exp_data.c rename to contrib/arm-optimized-routines/pl/math/v_pow_exp_data.c index 30421da81429..5d921ef648a4 100644 --- a/contrib/arm-optimized-routines/math/v_exp_data.c +++ b/contrib/arm-optimized-routines/pl/math/v_pow_exp_data.c @@ -1,403 +1,289 @@ /* - * Lookup table for double-precision e^x vector function. + * Shared data between exp, exp2 and pow. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2018-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_exp.h" -#if WANT_VMATH +#include "math_config.h" -#define N (1 << V_EXP_TABLE_BITS) +#define N (1 << V_POW_EXP_TABLE_BITS) -/* 2^(j/N), j=0..N.
*/ -const u64_t __v_exp_data[] = { -#if N == 128 -0x3ff0000000000000, -0x3feff63da9fb3335, -0x3fefec9a3e778061, -0x3fefe315e86e7f85, -0x3fefd9b0d3158574, -0x3fefd06b29ddf6de, -0x3fefc74518759bc8, -0x3fefbe3ecac6f383, -0x3fefb5586cf9890f, -0x3fefac922b7247f7, -0x3fefa3ec32d3d1a2, -0x3fef9b66affed31b, -0x3fef9301d0125b51, -0x3fef8abdc06c31cc, -0x3fef829aaea92de0, -0x3fef7a98c8a58e51, -0x3fef72b83c7d517b, -0x3fef6af9388c8dea, -0x3fef635beb6fcb75, -0x3fef5be084045cd4, -0x3fef54873168b9aa, -0x3fef4d5022fcd91d, -0x3fef463b88628cd6, -0x3fef3f49917ddc96, -0x3fef387a6e756238, -0x3fef31ce4fb2a63f, -0x3fef2b4565e27cdd, -0x3fef24dfe1f56381, -0x3fef1e9df51fdee1, -0x3fef187fd0dad990, -0x3fef1285a6e4030b, -0x3fef0cafa93e2f56, -0x3fef06fe0a31b715, -0x3fef0170fc4cd831, -0x3feefc08b26416ff, -0x3feef6c55f929ff1, -0x3feef1a7373aa9cb, -0x3feeecae6d05d866, -0x3feee7db34e59ff7, -0x3feee32dc313a8e5, -0x3feedea64c123422, -0x3feeda4504ac801c, -0x3feed60a21f72e2a, -0x3feed1f5d950a897, -0x3feece086061892d, -0x3feeca41ed1d0057, -0x3feec6a2b5c13cd0, -0x3feec32af0d7d3de, -0x3feebfdad5362a27, -0x3feebcb299fddd0d, -0x3feeb9b2769d2ca7, -0x3feeb6daa2cf6642, -0x3feeb42b569d4f82, -0x3feeb1a4ca5d920f, -0x3feeaf4736b527da, -0x3feead12d497c7fd, -0x3feeab07dd485429, -0x3feea9268a5946b7, -0x3feea76f15ad2148, -0x3feea5e1b976dc09, -0x3feea47eb03a5585, -0x3feea34634ccc320, -0x3feea23882552225, -0x3feea155d44ca973, -0x3feea09e667f3bcd, -0x3feea012750bdabf, -0x3fee9fb23c651a2f, -0x3fee9f7df9519484, -0x3fee9f75e8ec5f74, -0x3fee9f9a48a58174, -0x3fee9feb564267c9, -0x3feea0694fde5d3f, -0x3feea11473eb0187, -0x3feea1ed0130c132, -0x3feea2f336cf4e62, -0x3feea427543e1a12, -0x3feea589994cce13, -0x3feea71a4623c7ad, -0x3feea8d99b4492ed, -0x3feeaac7d98a6699, -0x3feeace5422aa0db, -0x3feeaf3216b5448c, -0x3feeb1ae99157736, -0x3feeb45b0b91ffc6, -0x3feeb737b0cdc5e5, -0x3feeba44cbc8520f, -0x3feebd829fde4e50, -0x3feec0f170ca07ba, -0x3feec49182a3f090, -0x3feec86319e32323, -0x3feecc667b5de565, -0x3feed09bec4a2d33, -0x3feed503b23e255d, -0x3feed99e1330b358, -0x3feede6b5579fdbf, -0x3feee36bbfd3f37a, -0x3feee89f995ad3ad, -0x3feeee07298db666, -0x3feef3a2b84f15fb, -0x3feef9728de5593a, -0x3feeff76f2fb5e47, -0x3fef05b030a1064a, -0x3fef0c1e904bc1d2, -0x3fef12c25bd71e09, -0x3fef199bdd85529c, -0x3fef20ab5fffd07a, -0x3fef27f12e57d14b, -0x3fef2f6d9406e7b5, -0x3fef3720dcef9069, -0x3fef3f0b555dc3fa, -0x3fef472d4a07897c, -0x3fef4f87080d89f2, -0x3fef5818dcfba487, -0x3fef60e316c98398, -0x3fef69e603db3285, -0x3fef7321f301b460, -0x3fef7c97337b9b5f, -0x3fef864614f5a129, -0x3fef902ee78b3ff6, -0x3fef9a51fbc74c83, -0x3fefa4afa2a490da, -0x3fefaf482d8e67f1, -0x3fefba1bee615a27, -0x3fefc52b376bba97, -0x3fefd0765b6e4540, -0x3fefdbfdad9cbe14, -0x3fefe7c1819e90d8, -0x3feff3c22b8f71f1, -#elif N == 256 +const struct v_pow_exp_data __v_pow_exp_data = { +// exp polynomial coefficients. +.poly = { +// abs error: 1.43*2^-58 +// ulp error: 0.549 (0.550 without fma) +// if |x| < ln2/512 +0x1.fffffffffffd4p-2, +0x1.5555571d6ef9p-3, +0x1.5555576a5adcep-5, +}, +// N/ln2 +.n_over_ln2 = 0x1.71547652b82fep0 * N, +// ln2/N +.ln2_over_n_hi = 0x1.62e42fefc0000p-9, +.ln2_over_n_lo = -0x1.c610ca86c3899p-45, +// Used for rounding to nearest integer without using intrinsics. 
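+// Editorial sketch of the rounding trick used with this constant: for
+// |z| well below 2^51, kd = (z + 0x1.8p52) - 0x1.8p52 rounds z to the
+// nearest integer, because adding 1.5*2^52 moves the value into a binade
+// where the unit in the last place is exactly 1; the integer is then read
+// straight from the low significand bits of the sum (ki in v_exp_inline).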
+.shift = 0x1.8p52, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// sbits[k] = asuint64(H[k]) - (k << 52)/N +.sbits = { 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, 
0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, 0x3feff9d96b2a23d9, -#endif +}, }; -#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_pow_log_data.c b/contrib/arm-optimized-routines/pl/math/v_pow_log_data.c new file mode 100644 index 000000000000..036faa5c97c1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_pow_log_data.c @@ -0,0 +1,174 @@ +/* + * Data for the log part of pow. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_POW_LOG_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + + where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals + and z falls into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = round(0x1p43*log(c))/0x1p43 + tab[i].logctail = (double)(log(c) - logc) + + where c is chosen near the center of the subinterval such that 1/c has only + a few precision bits so z/c - 1 is exactly representible as double: + + 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2 + + Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < + 0x1p-97, the last few bits of logc are rounded away so k*ln2hi + logc has no + rounding error and the interval for z is selected such that near x == 1, + where log(x) + is tiny, large cancellation error is avoided in logc + poly(z/c - 1). */ +const struct v_pow_log_data __v_pow_log_data = { + /* relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. 
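+
+   Editorial note: "scaled" refers to constant factors folded into
+   A1..A6; e.g. -0x1.555555555556p-1 here equals
+   0x1.555555555556p-2 * -2 in the copy embedded in v_pow_1u5.c. The
+   -2, 4 and -8 factors cancel the ar = -r/2, ar2 = -r^2/2 and
+   ar3 = -r^3/2 multipliers applied to successive coefficient pairs
+   during evaluation there.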
*/ + .poly = { -0x1p-1, -0x1.555555555556p-1, 0x1.0000000000006p-1, + 0x1.999999959554ep-1, -0x1.555555529a47ap-1, -0x1.2495b9b4845e9p0, + 0x1.0002b8b263fc3p0, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .invc = { 0x1.6a00000000000p+0, 0x1.6800000000000p+0, 0x1.6600000000000p+0, + 0x1.6400000000000p+0, 0x1.6200000000000p+0, 0x1.6000000000000p+0, + 0x1.5e00000000000p+0, 0x1.5c00000000000p+0, 0x1.5a00000000000p+0, + 0x1.5800000000000p+0, 0x1.5600000000000p+0, 0x1.5600000000000p+0, + 0x1.5400000000000p+0, 0x1.5200000000000p+0, 0x1.5000000000000p+0, + 0x1.4e00000000000p+0, 0x1.4c00000000000p+0, 0x1.4a00000000000p+0, + 0x1.4a00000000000p+0, 0x1.4800000000000p+0, 0x1.4600000000000p+0, + 0x1.4400000000000p+0, 0x1.4200000000000p+0, 0x1.4000000000000p+0, + 0x1.4000000000000p+0, 0x1.3e00000000000p+0, 0x1.3c00000000000p+0, + 0x1.3a00000000000p+0, 0x1.3a00000000000p+0, 0x1.3800000000000p+0, + 0x1.3600000000000p+0, 0x1.3400000000000p+0, 0x1.3400000000000p+0, + 0x1.3200000000000p+0, 0x1.3000000000000p+0, 0x1.3000000000000p+0, + 0x1.2e00000000000p+0, 0x1.2c00000000000p+0, 0x1.2c00000000000p+0, + 0x1.2a00000000000p+0, 0x1.2800000000000p+0, 0x1.2600000000000p+0, + 0x1.2600000000000p+0, 0x1.2400000000000p+0, 0x1.2400000000000p+0, + 0x1.2200000000000p+0, 0x1.2000000000000p+0, 0x1.2000000000000p+0, + 0x1.1e00000000000p+0, 0x1.1c00000000000p+0, 0x1.1c00000000000p+0, + 0x1.1a00000000000p+0, 0x1.1a00000000000p+0, 0x1.1800000000000p+0, + 0x1.1600000000000p+0, 0x1.1600000000000p+0, 0x1.1400000000000p+0, + 0x1.1400000000000p+0, 0x1.1200000000000p+0, 0x1.1000000000000p+0, + 0x1.1000000000000p+0, 0x1.0e00000000000p+0, 0x1.0e00000000000p+0, + 0x1.0c00000000000p+0, 0x1.0c00000000000p+0, 0x1.0a00000000000p+0, + 0x1.0a00000000000p+0, 0x1.0800000000000p+0, 0x1.0800000000000p+0, + 0x1.0600000000000p+0, 0x1.0400000000000p+0, 0x1.0400000000000p+0, + 0x1.0200000000000p+0, 0x1.0200000000000p+0, 0x1.0000000000000p+0, + 0x1.0000000000000p+0, 0x1.fc00000000000p-1, 0x1.f800000000000p-1, + 0x1.f400000000000p-1, 0x1.f000000000000p-1, 0x1.ec00000000000p-1, + 0x1.e800000000000p-1, 0x1.e400000000000p-1, 0x1.e200000000000p-1, + 0x1.de00000000000p-1, 0x1.da00000000000p-1, 0x1.d600000000000p-1, + 0x1.d400000000000p-1, 0x1.d000000000000p-1, 0x1.cc00000000000p-1, + 0x1.ca00000000000p-1, 0x1.c600000000000p-1, 0x1.c400000000000p-1, + 0x1.c000000000000p-1, 0x1.be00000000000p-1, 0x1.ba00000000000p-1, + 0x1.b800000000000p-1, 0x1.b400000000000p-1, 0x1.b200000000000p-1, + 0x1.ae00000000000p-1, 0x1.ac00000000000p-1, 0x1.aa00000000000p-1, + 0x1.a600000000000p-1, 0x1.a400000000000p-1, 0x1.a000000000000p-1, + 0x1.9e00000000000p-1, 0x1.9c00000000000p-1, 0x1.9a00000000000p-1, + 0x1.9600000000000p-1, 0x1.9400000000000p-1, 0x1.9200000000000p-1, + 0x1.9000000000000p-1, 0x1.8c00000000000p-1, 0x1.8a00000000000p-1, + 0x1.8800000000000p-1, 0x1.8600000000000p-1, 0x1.8400000000000p-1, + 0x1.8200000000000p-1, 0x1.7e00000000000p-1, 0x1.7c00000000000p-1, + 0x1.7a00000000000p-1, 0x1.7800000000000p-1, 0x1.7600000000000p-1, + 0x1.7400000000000p-1, 0x1.7200000000000p-1, 0x1.7000000000000p-1, + 0x1.6e00000000000p-1, 0x1.6c00000000000p-1, }, + .logc + = { -0x1.62c82f2b9c800p-2, -0x1.5d1bdbf580800p-2, -0x1.5767717455800p-2, + -0x1.51aad872df800p-2, -0x1.4be5f95777800p-2, -0x1.4618bc21c6000p-2, + -0x1.404308686a800p-2, -0x1.3a64c55694800p-2, -0x1.347dd9a988000p-2, + -0x1.2e8e2bae12000p-2, -0x1.2895a13de8800p-2, -0x1.2895a13de8800p-2, + -0x1.22941fbcf7800p-2, -0x1.1c898c1699800p-2, -0x1.1675cababa800p-2, + -0x1.1058bf9ae4800p-2, 
-0x1.0a324e2739000p-2, -0x1.0402594b4d000p-2, + -0x1.0402594b4d000p-2, -0x1.fb9186d5e4000p-3, -0x1.ef0adcbdc6000p-3, + -0x1.e27076e2af000p-3, -0x1.d5c216b4fc000p-3, -0x1.c8ff7c79aa000p-3, + -0x1.c8ff7c79aa000p-3, -0x1.bc286742d9000p-3, -0x1.af3c94e80c000p-3, + -0x1.a23bc1fe2b000p-3, -0x1.a23bc1fe2b000p-3, -0x1.9525a9cf45000p-3, + -0x1.87fa06520d000p-3, -0x1.7ab890210e000p-3, -0x1.7ab890210e000p-3, + -0x1.6d60fe719d000p-3, -0x1.5ff3070a79000p-3, -0x1.5ff3070a79000p-3, + -0x1.526e5e3a1b000p-3, -0x1.44d2b6ccb8000p-3, -0x1.44d2b6ccb8000p-3, + -0x1.371fc201e9000p-3, -0x1.29552f81ff000p-3, -0x1.1b72ad52f6000p-3, + -0x1.1b72ad52f6000p-3, -0x1.0d77e7cd09000p-3, -0x1.0d77e7cd09000p-3, + -0x1.fec9131dbe000p-4, -0x1.e27076e2b0000p-4, -0x1.e27076e2b0000p-4, + -0x1.c5e548f5bc000p-4, -0x1.a926d3a4ae000p-4, -0x1.a926d3a4ae000p-4, + -0x1.8c345d631a000p-4, -0x1.8c345d631a000p-4, -0x1.6f0d28ae56000p-4, + -0x1.51b073f062000p-4, -0x1.51b073f062000p-4, -0x1.341d7961be000p-4, + -0x1.341d7961be000p-4, -0x1.16536eea38000p-4, -0x1.f0a30c0118000p-5, + -0x1.f0a30c0118000p-5, -0x1.b42dd71198000p-5, -0x1.b42dd71198000p-5, + -0x1.77458f632c000p-5, -0x1.77458f632c000p-5, -0x1.39e87b9fec000p-5, + -0x1.39e87b9fec000p-5, -0x1.f829b0e780000p-6, -0x1.f829b0e780000p-6, + -0x1.7b91b07d58000p-6, -0x1.fc0a8b0fc0000p-7, -0x1.fc0a8b0fc0000p-7, + -0x1.fe02a6b100000p-8, -0x1.fe02a6b100000p-8, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, 0x1.0101575890000p-7, 0x1.0205658938000p-6, + 0x1.8492528c90000p-6, 0x1.0415d89e74000p-5, 0x1.466aed42e0000p-5, + 0x1.894aa149fc000p-5, 0x1.ccb73cdddc000p-5, 0x1.eea31c006c000p-5, + 0x1.1973bd1466000p-4, 0x1.3bdf5a7d1e000p-4, 0x1.5e95a4d97a000p-4, + 0x1.700d30aeac000p-4, 0x1.9335e5d594000p-4, 0x1.b6ac88dad6000p-4, + 0x1.c885801bc4000p-4, 0x1.ec739830a2000p-4, 0x1.fe89139dbe000p-4, + 0x1.1178e8227e000p-3, 0x1.1aa2b7e23f000p-3, 0x1.2d1610c868000p-3, + 0x1.365fcb0159000p-3, 0x1.4913d8333b000p-3, 0x1.527e5e4a1b000p-3, + 0x1.6574ebe8c1000p-3, 0x1.6f0128b757000p-3, 0x1.7898d85445000p-3, + 0x1.8beafeb390000p-3, 0x1.95a5adcf70000p-3, 0x1.a93ed3c8ae000p-3, + 0x1.b31d8575bd000p-3, 0x1.bd087383be000p-3, 0x1.c6ffbc6f01000p-3, + 0x1.db13db0d49000p-3, 0x1.e530effe71000p-3, 0x1.ef5ade4dd0000p-3, + 0x1.f991c6cb3b000p-3, 0x1.07138604d5800p-2, 0x1.0c42d67616000p-2, + 0x1.1178e8227e800p-2, 0x1.16b5ccbacf800p-2, 0x1.1bf99635a6800p-2, + 0x1.214456d0eb800p-2, 0x1.2bef07cdc9000p-2, 0x1.314f1e1d36000p-2, + 0x1.36b6776be1000p-2, 0x1.3c25277333000p-2, 0x1.419b423d5e800p-2, + 0x1.4718dc271c800p-2, 0x1.4c9e09e173000p-2, 0x1.522ae0738a000p-2, + 0x1.57bf753c8d000p-2, 0x1.5d5bddf596000p-2, }, + .logctail + = { 0x1.ab42428375680p-48, -0x1.ca508d8e0f720p-46, -0x1.362a4d5b6506dp-45, + -0x1.684e49eb067d5p-49, -0x1.41b6993293ee0p-47, 0x1.3d82f484c84ccp-46, + 0x1.c42f3ed820b3ap-50, 0x1.0b1c686519460p-45, 0x1.5594dd4c58092p-45, + 0x1.67b1e99b72bd8p-45, 0x1.5ca14b6cfb03fp-46, 0x1.5ca14b6cfb03fp-46, + -0x1.65a242853da76p-46, -0x1.fafbc68e75404p-46, 0x1.f1fc63382a8f0p-46, + -0x1.6a8c4fd055a66p-45, -0x1.c6bee7ef4030ep-47, -0x1.036b89ef42d7fp-48, + -0x1.036b89ef42d7fp-48, 0x1.d572aab993c87p-47, 0x1.b26b79c86af24p-45, + -0x1.72f4f543fff10p-46, 0x1.1ba91bbca681bp-45, 0x1.7794f689f8434p-45, + 0x1.7794f689f8434p-45, 0x1.94eb0318bb78fp-46, 0x1.a4e633fcd9066p-52, + -0x1.58c64dc46c1eap-45, -0x1.58c64dc46c1eap-45, -0x1.ad1d904c1d4e3p-45, + 0x1.bbdbf7fdbfa09p-45, 0x1.bdb9072534a58p-45, 0x1.bdb9072534a58p-45, + -0x1.0e46aa3b2e266p-46, -0x1.e9e439f105039p-46, -0x1.e9e439f105039p-46, + -0x1.0de8b90075b8fp-45, 0x1.70cc16135783cp-46, 
0x1.70cc16135783cp-46, + 0x1.178864d27543ap-48, -0x1.48d301771c408p-45, -0x1.e80a41811a396p-45, + -0x1.e80a41811a396p-45, 0x1.a699688e85bf4p-47, 0x1.a699688e85bf4p-47, + -0x1.575545ca333f2p-45, 0x1.a342c2af0003cp-45, 0x1.a342c2af0003cp-45, + -0x1.d0c57585fbe06p-46, 0x1.53935e85baac8p-45, 0x1.53935e85baac8p-45, + 0x1.37c294d2f5668p-46, 0x1.37c294d2f5668p-46, -0x1.69737c93373dap-45, + 0x1.f025b61c65e57p-46, 0x1.f025b61c65e57p-46, 0x1.c5edaccf913dfp-45, + 0x1.c5edaccf913dfp-45, 0x1.47c5e768fa309p-46, 0x1.d599e83368e91p-45, + 0x1.d599e83368e91p-45, 0x1.c827ae5d6704cp-46, 0x1.c827ae5d6704cp-46, + -0x1.cfc4634f2a1eep-45, -0x1.cfc4634f2a1eep-45, 0x1.502b7f526feaap-48, + 0x1.502b7f526feaap-48, -0x1.980267c7e09e4p-45, -0x1.980267c7e09e4p-45, + -0x1.88d5493faa639p-45, -0x1.f1e7cf6d3a69cp-50, -0x1.f1e7cf6d3a69cp-50, + -0x1.9e23f0dda40e4p-46, -0x1.9e23f0dda40e4p-46, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, -0x1.0c76b999d2be8p-46, -0x1.3dc5b06e2f7d2p-45, + -0x1.aa0ba325a0c34p-45, 0x1.111c05cf1d753p-47, -0x1.c167375bdfd28p-45, + -0x1.97995d05a267dp-46, -0x1.a68f247d82807p-46, -0x1.e113e4fc93b7bp-47, + -0x1.5325d560d9e9bp-45, 0x1.cc85ea5db4ed7p-45, -0x1.c69063c5d1d1ep-45, + 0x1.c1e8da99ded32p-49, 0x1.3115c3abd47dap-45, -0x1.390802bf768e5p-46, + 0x1.646d1c65aacd3p-45, -0x1.dc068afe645e0p-45, -0x1.534d64fa10afdp-45, + 0x1.1ef78ce2d07f2p-45, 0x1.ca78e44389934p-45, 0x1.39d6ccb81b4a1p-47, + 0x1.62fa8234b7289p-51, 0x1.5837954fdb678p-45, 0x1.633e8e5697dc7p-45, + 0x1.9cf8b2c3c2e78p-46, -0x1.5118de59c21e1p-45, -0x1.c661070914305p-46, + -0x1.73d54aae92cd1p-47, 0x1.7f22858a0ff6fp-47, -0x1.8724350562169p-45, + -0x1.c358d4eace1aap-47, -0x1.d4bc4595412b6p-45, -0x1.1ec72c5962bd2p-48, + -0x1.aff2af715b035p-45, 0x1.212276041f430p-51, -0x1.a211565bb8e11p-51, + 0x1.bcbecca0cdf30p-46, 0x1.89cdb16ed4e91p-48, 0x1.7188b163ceae9p-45, + -0x1.c210e63a5f01cp-45, 0x1.b9acdf7a51681p-45, 0x1.ca6ed5147bdb7p-45, + 0x1.a87deba46baeap-47, 0x1.a9cfa4a5004f4p-45, -0x1.8e27ad3213cb8p-45, + 0x1.16ecdb0f177c8p-46, 0x1.83b54b606bd5cp-46, 0x1.8e436ec90e09dp-47, + -0x1.f27ce0967d675p-45, -0x1.e20891b0ad8a4p-45, 0x1.ebe708164c759p-45, + 0x1.fadedee5d40efp-46, -0x1.a0b2a08a465dcp-47, }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_powf_data.c b/contrib/arm-optimized-routines/pl/math/v_powf_data.c new file mode 100644 index 000000000000..ded211924b80 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_powf_data.c @@ -0,0 +1,89 @@ +/* + * Coefficients for single-precision SVE pow(x) function. + * + * Copyright (c) 2023, Arm Limited. 
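
For background, the invc/logc/logctail triple above is the classic table-driven log core: each row stores invc[i] ~ 1/c for one slice of the significand range, plus log(c) split into a rounded head (logc) and a correction tail (logctail) so that their sum carries more than double precision. A minimal scalar sketch of how such a table is consumed follows; the offset constant and the 7-bit index are illustrative assumptions, not values taken from this patch:

#include <stdint.h>
#include <string.h>

static inline uint64_t
asuint64 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

static inline double
asdouble (uint64_t u)
{
  double x;
  memcpy (&x, &u, sizeof x);
  return x;
}

/* log(x) = k*ln2 + log(c) + log1p(r), where c is the table entry nearest
   x's significand and r = x*invc[i] - 1 is tiny, so a short polynomial
   suffices.  Only the first two polynomial terms are shown.  */
double
sketch_log (double x, const double invc[128], const double logc[128],
            const double logctail[128])
{
  uint64_t tmp = asuint64 (x) - 0x3fe6955500000000ULL; /* illustrative.  */
  int i = (tmp >> (52 - 7)) % 128;  /* 7 index bits -> 128 rows.  */
  int64_t k = (int64_t) tmp >> 52;  /* exponent of x.  */
  double z = asdouble (asuint64 (x) - (tmp & (0xfffULL << 52)));
  double r = z * invc[i] - 1.0;     /* the real code uses fma here.  */
  double log1p_r = r - 0.5 * r * r;
  double hi = k * 0x1.62e42fefa3800p-1 + logc[i]; /* ln2_hi from above.  */
  double lo = k * 0x1.ef35793c76730p-45 + logctail[i];
  return hi + (log1p_r + lo);
}
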
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct v_powf_data __v_powf_data = {
+  .invc = { 0x1.6489890582816p+0,
+            0x1.5cf19b35e3472p+0,
+            0x1.55aac0e956d65p+0,
+            0x1.4eb0022977e01p+0,
+            0x1.47fcccda1dd1fp+0,
+            0x1.418ceabab68c1p+0,
+            0x1.3b5c788f1edb3p+0,
+            0x1.3567de48e9c9ap+0,
+            0x1.2fabc80fd19bap+0,
+            0x1.2a25200ce536bp+0,
+            0x1.24d108e0152e3p+0,
+            0x1.1facd8ab2fbe1p+0,
+            0x1.1ab614a03efdfp+0,
+            0x1.15ea6d03af9ffp+0,
+            0x1.1147b994bb776p+0,
+            0x1.0ccbf650593aap+0,
+            0x1.0875408477302p+0,
+            0x1.0441d42a93328p+0,
+            0x1p+0,
+            0x1.f1d006c855e86p-1,
+            0x1.e28c3341aa301p-1,
+            0x1.d4bdf9aa64747p-1,
+            0x1.c7b45a24e5803p-1,
+            0x1.bb5f5eb2ed60ap-1,
+            0x1.afb0bff8fe6b4p-1,
+            0x1.a49badf7ab1f5p-1,
+            0x1.9a14a111fc4c9p-1,
+            0x1.901131f5b2fdcp-1,
+            0x1.8687f73f6d865p-1,
+            0x1.7d7067eb77986p-1,
+            0x1.74c2c1cf97b65p-1,
+            0x1.6c77f37cff2a1p-1
+  },
+  .logc = { -0x1.e960f97b22702p+3,
+            -0x1.c993406cd4db6p+3,
+            -0x1.aa711d9a7d0f3p+3,
+            -0x1.8bf37bacdce9bp+3,
+            -0x1.6e13b3519946ep+3,
+            -0x1.50cb8281e4089p+3,
+            -0x1.341504a237e2bp+3,
+            -0x1.17eaab624ffbbp+3,
+            -0x1.f88e708f8c853p+2,
+            -0x1.c24b6da113914p+2,
+            -0x1.8d02ee397cb1dp+2,
+            -0x1.58ac1223408b3p+2,
+            -0x1.253e6fd190e89p+2,
+            -0x1.e5641882c12ffp+1,
+            -0x1.81fea712926f7p+1,
+            -0x1.203e240de64a3p+1,
+            -0x1.8029b86a78281p0,
+            -0x1.85d713190fb9p-1,
+            0x0p+0,
+            0x1.4c1cc07312997p0,
+            0x1.5e1848ccec948p+1,
+            0x1.04cfcb7f1196fp+2,
+            0x1.582813d463c21p+2,
+            0x1.a936fa68760ccp+2,
+            0x1.f81bc31d6cc4ep+2,
+            0x1.2279a09fae6b1p+3,
+            0x1.47ec0b6df5526p+3,
+            0x1.6c71762280f1p+3,
+            0x1.90155070798dap+3,
+            0x1.b2e23b1d3068cp+3,
+            0x1.d4e21b0daa86ap+3,
+            0x1.f61e2a2f67f3fp+3
+  },
+  .scale = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+             0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+             0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+             0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+             0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+             0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+             0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+             0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+             0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+             0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+             0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+};
diff --git a/contrib/arm-optimized-routines/pl/math/v_sincos_3u5.c b/contrib/arm-optimized-routines/pl/math/v_sincos_3u5.c
new file mode 100644
index 000000000000..6fc014c120b8
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_sincos_3u5.c
@@ -0,0 +1,57 @@
+/*
+ * Double-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include the sincos declaration. If building
+   pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+   be linked against the scalar sincos from math/.  */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "v_math.h"
+#include "pl_test.h"
+#include "v_sincos_common.h"
+
+static void VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, double *out_sin,
+              double *out_cos)
+{
+  if (special[0])
+    sincos (x[0], out_sin, out_cos);
+  if (special[1])
+    sincos (x[1], out_sin + 1, out_cos + 1);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+   one function call, using shared argument reduction and separate polynomials.
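
The .scale member above holds raw uint64 bit patterns rather than doubles. Assuming it follows the usual vector-exp lookup trick (each entry is asuint64(2^(j/32)) with the index-dependent exponent bits pre-subtracted), reconstructing 2^(k/32) for a scaled integer k is one table load and one integer add. This is a sketch of the consuming code, which is not part of this hunk:

#include <stdint.h>
#include <string.h>

static inline double
asdouble (uint64_t u)
{
  double x;
  memcpy (&x, &u, sizeof x);
  return x;
}

/* 2^(k/32): the low 5 bits of k select the fractional part from the
   table, and the remaining bits land in the exponent field after the
   shift by 52 - 5 = 47.  Valid while the result is a normal double;
   the real routine handles the rest as special cases.  */
double
sketch_exp2_lookup (const uint64_t scale[32], int64_t k)
{
  return asdouble (scale[k & 31] + ((uint64_t) k << (52 - 5)));
}
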
+ Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +VPCS_ATTR void +_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); + + if (unlikely (v_any_u64 (special))) + special_case (x, special, out_sin, out_cos); +} + +PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73) +PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73) +#define V_SINCOS_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \ + PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n) +V_SINCOS_INTERVAL (0, 0x1p23, 500000) +V_SINCOS_INTERVAL (-0, -0x1p23, 500000) +V_SINCOS_INTERVAL (0x1p23, inf, 10000) +V_SINCOS_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_sincos_common.h b/contrib/arm-optimized-routines/pl/math/v_sincos_common.h new file mode 100644 index 000000000000..ee7937e0785a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sincos_common.h @@ -0,0 +1,86 @@ +/* + * Core approximation for double-precision vector sincos + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" + +static const struct v_sincos_data +{ + float64x2_t sin_poly[7], cos_poly[6], pio2[3]; + float64x2_t inv_pio2, shift, range_val; +} v_sincos_data = { + .inv_pio2 = V2 (0x1.45f306dc9c882p-1), + .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26), + V2 (0x1.1a62633145c07p-54) }, + .shift = V2 (0x1.8p52), + .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */ + V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */ + V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10), + V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22), + V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) }, + .range_val = V2 (0x1p23), }; + +static inline uint64x2_t +check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d) +{ + return vcagtq_f64 (x, d->range_val); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +static inline float64x2x2_t +v_sincos_inline (float64x2_t x, const struct v_sincos_data *d) +{ + /* q = nearest integer to 2 * x / pi. */ + float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift); + int64x2_t n = vcvtq_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + float64x2_t r = x; + r = vfmsq_f64 (r, q, d->pio2[0]); + r = vfmsq_f64 (r, q, d->pio2[1]); + r = vfmsq_f64 (r, q, d->pio2[2]); + + float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2; + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly); + s = vfmaq_f64 (r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). 
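
The routine above returns both results through pointers, which is reflected in the vector-ABI symbol name: roughly, 'n' selects AdvSIMD, 'N2' means two unmasked lanes, 'v' a vector argument and 'l8' a pointer argument advancing 8 bytes per lane. A hypothetical caller, assuming the symbol is declared with the AArch64 vector PCS attribute as in mathlib.h:

#include <arm_neon.h>

__attribute__ ((aarch64_vector_pcs)) void
_ZGVnN2vl8l8_sincos (float64x2_t, double *, double *);

/* Compute sin and cos of two doubles with one call; writes s[0..1]
   and c[0..1].  */
void
sincos_pair (const double in[2], double s[2], double c[2])
{
  float64x2_t x = vld1q_f64 (in);
  _ZGVnN2vl8l8_sincos (x, s, c);
}
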
*/
+  float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
+  c = vfmaq_f64 (v_f64 (-0.5), r2, c);
+  c = vfmaq_f64 (v_f64 (1), r2, c);
+
+  /* If odd quadrant, swap cos and sin.  */
+  uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
+  float64x2_t ss = vbslq_f64 (swap, c, s);
+  float64x2_t cc = vbslq_f64 (swap, s, c);
+
+  /* Fix signs according to quadrant.
+     ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+     cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)).  */
+  uint64x2_t sin_sign
+    = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
+  uint64x2_t cos_sign = vshlq_n_u64 (
+      vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
+      62);
+  ss = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
+  cc = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
+
+  return (float64x2x2_t){ ss, cc };
+}
diff --git a/contrib/arm-optimized-routines/pl/math/v_sincosf_1u8.c b/contrib/arm-optimized-routines/pl/math/v_sincosf_1u8.c
new file mode 100644
index 000000000000..bf77afaa14db
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_sincosf_1u8.c
@@ -0,0 +1,58 @@
+/*
+ * Single-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include the sincosf declaration. If building
+   pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+   be linked against the scalar sincosf from math/.  */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "pl_test.h"
+
+static void VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float *out_sin,
+              float *out_cos)
+{
+  for (int i = 0; i < 4; i++)
+    if (special[i])
+      sincosf (x[i], out_sin + i, out_cos + i);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+   one function call, using shared argument reduction and separate low-order
+   polynomials.
+   Worst-case error for sin is 1.67 ULP:
+   v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+   Worst-case error for cos is 1.81 ULP:
+   v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6.  */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
+{
+  const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+  uint32x4_t special = check_ge_rangeval (x, d);
+
+  float32x4x2_t sc = v_sincosf_inline (x, d);
+
+  vst1q_f32 (out_sin, sc.val[0]);
+  vst1q_f32 (out_cos, sc.val[1]);
+
+  if (unlikely (v_any_u32 (special)))
+    special_case (x, special, out_sin, out_cos);
+}
+
+PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
+PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
+#define V_SINCOSF_INTERVAL(lo, hi, n)                                         \
+  PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n)                          \
+  PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
V_SINCOSF_INTERVAL (0, 0x1p20, 500000)
V_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_sincosf_common.h b/contrib/arm-optimized-routines/pl/math/v_sincosf_common.h
new file mode 100644
index 000000000000..8239bd9f0176
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_sincosf_common.h
@@ -0,0 +1,84 @@
+/*
+ * Core approximation for single-precision vector sincos
+ *
+ * Copyright (c) 2023, Arm Limited.
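
The sign fix-up in the quadrant reconstruction above is compact enough to miss: bit 1 of the quadrant index n (and of n + 1 for cos) is shifted into the sign-bit position and XORed in, so no floating-point select or negate is needed. A scalar model of the whole reduce/swap/flip scheme, where round() stands in for the 0x1.8p52 shift-rounding trick and libm sin/cos stand in for the polynomials:

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
flip_sign (double v, int64_t n) /* negate v iff bit 1 of n is set.  */
{
  uint64_t u;
  memcpy (&u, &v, sizeof u);
  u ^= (uint64_t) (n & 2) << 62;
  memcpy (&v, &u, sizeof v);
  return v;
}

void
sincos_model (double x, double *s, double *c)
{
  double q = round (x * 0x1.45f306dc9c882p-1); /* nearest int to 2x/pi.  */
  int64_t n = (int64_t) q;
  double r = x - q * 0x1.921fb54442d18p0; /* single-word pi/2 reduction.  */
  double sr = sin (r), cr = cos (r);
  /* Odd quadrant swaps the roles of sin(r) and cos(r).  */
  *s = flip_sign ((n & 1) ? cr : sr, n);
  *c = flip_sign ((n & 1) ? sr : cr, n + 1);
}
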
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+const static struct v_sincosf_data
+{
+  float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
+} v_sincosf_data = {
+  .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4].  */
+                V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) },
+  .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4].  */
+                V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) },
+  .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) },
+  .inv_pio2 = V4 (0x1.45f306p-1f),
+  .shift = V4 (0x1.8p23),
+  .range_val = V4 (0x1p20),
+};
+
+static inline uint32x4_t
+check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d)
+{
+  return vcagtq_f32 (x, d->range_val);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+   one function call, using shared argument reduction and separate low-order
+   polynomials.
+   Worst-case error for sin is 1.67 ULP:
+   v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+   Worst-case error for cos is 1.81 ULP:
+   v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6.  */
+static inline float32x4x2_t
+v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d)
+{
+  /* n = rint ( x / (pi/2) ).  */
+  float32x4_t shift = d->shift;
+  float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2);
+  q = vsubq_f32 (q, shift);
+  int32x4_t n = vcvtq_s32_f32 (q);
+
+  /* Reduce x such that r is in [ -pi/4, pi/4 ].  */
+  float32x4_t r = x;
+  r = vfmsq_f32 (r, q, d->pio2[0]);
+  r = vfmsq_f32 (r, q, d->pio2[1]);
+  r = vfmsq_f32 (r, q, d->pio2[2]);
+
+  /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
+  float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2);
+  float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]);
+  s = vfmaq_f32 (d->poly_sin[0], r2, s);
+  s = vfmaq_f32 (r, r3, s);
+
+  /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
+  float32x4_t r4 = vmulq_f32 (r2, r2);
+  float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]);
+  float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]);
+  c = vfmaq_f32 (c, r4, p);
+  c = vfmaq_f32 (v_f32 (1), c, r2);
+
+  /* If odd quadrant, swap cos and sin.  */
+  uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1));
+  float32x4_t ss = vbslq_f32 (swap, c, s);
+  float32x4_t cc = vbslq_f32 (swap, s, c);
+
+  /* Fix signs according to quadrant.
+     ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+     cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)).  */
+  uint32x4_t sin_sign
+    = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30);
+  uint32x4_t cos_sign = vshlq_n_u32 (
+      vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)),
+      30);
+  ss = vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign));
+  cc = vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign));
+
+  return (float32x4x2_t){ ss, cc };
+}
diff --git a/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c
index 57ec66ecc282..a644f54b4a0f 100644
--- a/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c
+++ b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c
@@ -1,94 +1,118 @@
 /*
  * Double-precision vector sinh(x) function.
  *
  * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#define AbsMask 0x7fffffffffffffff -#define Half 0x3fe0000000000000 -#define BigBound \ - 0x4080000000000000 /* 2^9. expm1 helper overflows for large input. */ -#define TinyBound \ - 0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x. */ -#define InvLn2 v_f64 (0x1.71547652b82fep0) -#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) -#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) -#define Shift v_f64 (0x1.8p52) -#define One 0x3ff0000000000000 -#define C(i) v_f64 (__expm1_poly[i]) +static const struct data +{ + float64x2_t poly[11]; + float64x2_t inv_ln2, m_ln2, shift; + uint64x2_t halff; + int64x2_t onef; +#if WANT_SIMD_EXCEPT + uint64x2_t tiny_bound, thresh; +#else + uint64x2_t large_bound; +#endif +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, -#if V_SUPPORTED + .inv_ln2 = V2 (0x1.71547652b82fep0), + .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, + .shift = V2 (0x1.8p52), -static inline v_f64_t -expm1_inline (v_f64_t x) + .halff = V2 (0x3fe0000000000000), + .onef = V2 (0x3ff0000000000000), +#if WANT_SIMD_EXCEPT + /* 2^-26, below which sinh(x) rounds to x. */ + .tiny_bound = V2 (0x3e50000000000000), + /* asuint(large_bound) - asuint(tiny_bound). */ + .thresh = V2 (0x0230000000000000), +#else +/* 2^9. expm1 helper overflows for large input. */ + .large_bound = V2 (0x4080000000000000), +#endif +}; + +static inline float64x2_t +expm1_inline (float64x2_t x) { + const struct data *d = ptr_barrier (&data); + /* Reduce argument: exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where i = round(x / ln2) and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ - v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; - v_s64_t i = v_to_s64_f64 (j); - v_f64_t f = v_fma_f64 (j, MLn2hi, x); - f = v_fma_f64 (j, MLn2lo, f); + float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); + int64x2_t i = vcvtq_s64_f64 (j); + float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0); + f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1); /* Approximate expm1(f) using polynomial. */ - v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4; - v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f); + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t f8 = vmulq_f64 (f4, f4); + float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); /* t = 2^i. */ - v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + float64x2_t t = vreinterpretq_f64_u64 ( + vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef))); /* expm1(x) ~= p * t + (t - 1). */ - return v_fma_f64 (p, t, t - 1); + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); } -static NOINLINE VPCS_ATTR v_f64_t -special_case (v_f64_t x) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x) { return v_call_f64 (sinh, x, x, v_u64 (-1)); } /* Approximation for vector double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. The greatest observed error is 2.57 ULP: - sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 - want 0x1.ab34e59d678d9p-2. 
*/ -VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) + _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t iax = ix & AbsMask; - v_f64_t ax = v_as_f64_u64 (iax); - v_u64_t sign = ix & ~AbsMask; - v_f64_t halfsign = v_as_f64_u64 (sign | Half); + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t sign + = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax)); + float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff)); #if WANT_SIMD_EXCEPT - v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound)); + uint64x2_t special = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); #else - v_u64_t special = v_cond_u64 (iax >= BigBound); + uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound); #endif /* Fall back to scalar variant for all lanes if any of them are special. */ if (unlikely (v_any_u64 (special))) return special_case (x); /* Up to the point that expm1 overflows, we can use it to calculate sinh using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. */ - v_f64_t t = expm1_inline (ax); - return (t + t / (t + 1)) * halfsign; + float64x2_t t = expm1_inline (ax); + t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); + return vmulq_f64 (t, halfsign); } -VPCS_ALIAS PL_SIG (V, D, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (sinh), 2.08) -PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000) -PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000) -PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000) -PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_D1 (sinh), 2.08) +PL_TEST_EXPECT_FENV (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c index 49cf078d0651..cd8c0f08f784 100644 --- a/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c +++ b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c @@ -1,69 +1,84 @@ /* * Single-precision vector sinh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - #include "v_expm1f_inline.h" -#define AbsMask 0x7fffffff -#define Half 0x3f000000 -#define BigBound \ - 0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows. */ -#define TinyBound \ - 0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows. */ +static const struct data +{ + struct v_expm1f_data expm1f_consts; + uint32x4_t halff; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound, thresh; +#else + uint32x4_t oflow_bound; +#endif +} data = { + .expm1f_consts = V_EXPM1F_DATA, + .halff = V4 (0x3f000000), +#if WANT_SIMD_EXCEPT + /* 0x1.6a09e8p-32, below which expm1f underflows. */ + .tiny_bound = V4 (0x2fb504f4), + /* asuint(oflow_bound) - asuint(tiny_bound). 
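
The expression t + t/(t + 1) above is exact algebra, not an approximation: with t = expm1(x), e^x - e^-x = (t + 1) - 1/(t + 1) = t(t + 2)/(t + 1) = t + t/(t + 1), and every term keeps the sign of t, so nothing cancels for small x. A scalar model of the identity:

#include <math.h>

/* sinh(x) = (t + t/(t + 1)) / 2 with t = expm1(|x|); the sign is
   restored at the end (copysign also preserves -0).  */
double
sinh_model (double x)
{
  double t = expm1 (fabs (x));
  return copysign (0.5 * (t + t / (t + 1.0)), x);
}
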
*/
+  .thresh = V4 (0x12fbbbb3),
+#else
+  /* 0x1.61814ep+6, above which expm1f helper overflows.  */
+  .oflow_bound = V4 (0x42b0c0a7),
+#endif
+};
 
-static NOINLINE VPCS_ATTR v_f32_t
-special_case (v_f32_t x)
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
 {
-  return v_call_f32 (sinhf, x, x, v_u32 (-1));
+  return v_call_f32 (sinhf, x, y, special);
 }
 
 /* Approximation for vector single-precision sinh(x) using expm1.
    sinh(x) = (exp(x) - exp(-x)) / 2.
    The maximum error is 2.26 ULP:
-   __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4.  */
-VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x)
+   _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+				  want 0x1.e469e4p-4.  */
+float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
 {
-  v_u32_t ix = v_as_u32_f32 (x);
-  v_u32_t iax = ix & AbsMask;
-  v_f32_t ax = v_as_f32_u32 (iax);
-  v_u32_t sign = ix & ~AbsMask;
-  v_f32_t halfsign = v_as_f32_u32 (sign | Half);
+  const struct data *d = ptr_barrier (&data);
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t sign = veorq_u32 (ix, iax);
+  float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
 
 #if WANT_SIMD_EXCEPT
-  v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound));
+  uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+  ax = v_zerofy_f32 (ax, special);
 #else
-  v_u32_t special = v_cond_u32 (iax >= BigBound);
+  uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
 #endif
 
-  /* Fall back to the scalar variant for all lanes if any of them should trigger
-     an exception.  */
+  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+     using a slight rearrangement of the definition of sinh. This allows us
+     to retain acceptable accuracy for very small inputs.  */
+  float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+  /* Fall back to the scalar variant for any lanes that should trigger an
+     exception.  */
   if (unlikely (v_any_u32 (special)))
-    return special_case (x);
+    return special_case (x, vmulq_f32 (t, halfsign), special);
 
-  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
-     using a slight rearrangement of the definition of asinh. This allows us to
-     retain acceptable accuracy for very small inputs.
*/ - v_f32_t t = expm1f_inline (ax); - return (t + t / (t + 1)) * halfsign; + return vmulq_f32 (t, halfsign); } -VPCS_ALIAS PL_SIG (V, F, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (sinhf), 1.76) -PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000) -PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000) -PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000) -PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_F1 (sinh), 1.76) +PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_sinpi_3u1.c b/contrib/arm-optimized-routines/pl/math/v_sinpi_3u1.c new file mode 100644 index 000000000000..8d2917ff8ecd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sinpi_3u1.c @@ -0,0 +1,86 @@ +/* + * Double-precision vector sinpi function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "poly_advsimd_f64.h" +#include "pl_sig.h" +#include "pl_test.h" + +static const struct data +{ + float64x2_t poly[10]; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */ +/* asuint64(0x1p64) - TinyBound. */ +# define Thresh v_u64 (0x07f0000000000000) + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sinpi, x, y, cmp); +} +#endif + +/* Approximation for vector double-precision sinpi(x). + Maximum Error 3.05 ULP: + _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1 + want 0x1.fb295878301cap-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 + to avoid them under/overflowing and throwing exceptions. */ + float64x2_t r = v_zerofy_f64 (x, cmp); +#else + float64x2_t r = x; +#endif + + /* If r is odd, the sign of the result should be inverted. */ + uint64x2_t odd + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); + + /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ + r = vsubq_f64 (r, vrndaq_f64 (r)); + + /* y = sin(r). 
*/
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+  float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+#endif
+
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
diff --git a/contrib/arm-optimized-routines/pl/math/v_sinpif_3u.c b/contrib/arm-optimized-routines/pl/math/v_sinpif_3u.c
new file mode 100644
index 000000000000..3d6eeff333f7
--- /dev/null
+++ b/contrib/arm-optimized-routines/pl/math/v_sinpif_3u.c
@@ -0,0 +1,81 @@
+/*
+ * Single-precision vector sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+  float32x4_t poly[6];
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+            V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f).  */
+# define Thresh v_u32 (0x1f000000)    /* asuint32(0x1p31f) - TinyBound.  */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  /* Fall back to scalar code.  */
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (sinpif, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector single-precision sinpi(x)
+   Maximum Error 3.03 ULP:
+   _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
+				  want 0x1.f7cd5p-1.  */
+float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+  uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+     to avoid them under/overflowing and throwing exceptions.  */
+  float32x4_t r = v_zerofy_f32 (x, cmp);
+#else
+  float32x4_t r = x;
+#endif
+
+  /* If r is odd, the sign of the result should be inverted.  */
+  uint32x4_t odd
+    = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+  /* r = x - rint(x). Range reduction to -1/2 .. 1/2.  */
+  r = vsubq_f32 (r, vrndaq_f32 (r));
+
+  /* Pairwise Horner approximation for y = sin(r * pi).
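
Both sinpi variants lean on the same exact reduction: n = rint(x) is computed without rounding error, r = x - n lies in [-1/2, 1/2], and sin(pi * x) = (-1)^n * sin(pi * r), so only the parity of n is needed. A scalar model, with libm sin standing in for the polynomial and valid while n fits in an int64:

#include <math.h>
#include <stdint.h>

double
sinpi_model (double x)
{
  double n = round (x); /* the vector code also rounds ties away.  */
  double r = x - n;     /* exact; r in [-1/2, 1/2].  */
  double y = sin (0x1.921fb54442d18p1 * r); /* pi * r.  */
  return ((int64_t) n & 1) ? -y : y;
}
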
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +PL_SIG (V, F, 1, sinpi, -0.9, 0.9) +PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54) +PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c index f87baccc4fd7..c431c8c4889e 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c @@ -1,102 +1,120 @@ /* * Double-precision vector tan(x) function. * * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float64x2_t poly[9]; + float64x2_t half_pi, two_over_pi, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. */ + .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3), + V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6), + V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9), + V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11), + V2 (0x1.4e4fd14147622p-12) }, + .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 }, + .two_over_pi = V2 (0x1.45f306dc9c883p-1), + .shift = V2 (0x1.8p52), +#if !WANT_SIMD_EXCEPT + .range_val = V2 (0x1p23), +#endif +}; -#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi) -#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo) -#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1) -#define Shift v_f64 (0x1.8p52) -#define AbsMask 0x7fffffffffffffff -#define RangeVal 0x4160000000000000 /* asuint64(2^23). */ +#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */ #define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ -#define C(i) v_f64 (__v_tan_data.poly[i]) +#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */ /* Special cases (fall back to scalar calls). */ -VPCS_ATTR -NOINLINE static v_f64_t -specialcase (v_f64_t x) +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x) { return v_call_f64 (tan, x, x, v_u64 (-1)); } /* Vector approximation for double-precision tan. Maximum measured error is 3.48 ULP: - __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 - want -0x1.f6ccd8ecf7deap+37. */ -VPCS_ATTR -v_f64_t V_NAME (tan) (v_f64_t x) + _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) { - v_u64_t iax = v_as_u64_f64 (x) & AbsMask; - - /* Our argument reduction cannot calculate q with sufficient accuracy for very - large inputs. Fall back to scalar routine for all lanes if any are too - large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny - input to avoid underflow. Note pl does not supply a scalar double-precision - tan, so the fallback will be statically linked from the system libm. 
*/ + const struct data *dat = ptr_barrier (&data); + /* Our argument reduction cannot calculate q with sufficient accuracy for + very large inputs. Fall back to scalar routine for all lanes if any are + too large, or Inf/NaN. If fenv exceptions are expected, also fall back for + tiny input to avoid underflow. */ #if WANT_SIMD_EXCEPT - if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound))) -#else - if (unlikely (v_any_u64 (iax > RangeVal))) + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + /* iax - tiny_bound > range_val - tiny_bound. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh)); + if (unlikely (v_any_u64 (special))) + return special_case (x); #endif - return specialcase (x); /* q = nearest integer to 2 * x / pi. */ - v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift; - v_s64_t qi = v_to_s64_f64 (q); + float64x2_t q + = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift); + int64x2_t qi = vcvtq_s64_f64 (q); /* Use q to reduce x to r in [-pi/4, pi/4], by: r = x - q * pi/2, in extended precision. */ - v_f64_t r = x; - r = v_fma_f64 (q, MHalfPiHi, r); - r = v_fma_f64 (q, MHalfPiLo, r); + float64x2_t r = x; + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0); + r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1); /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle formula. */ - r = r * 0.5; + r = vmulq_n_f64 (r, 0.5); /* Approximate tan(r) using order 8 polynomial. tan(x) is odd, so polynomial has the form: tan(x) ~= x + C0 * x^3 + C1 * x^5 + C3 * x^7 + ... Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... Then compute the approximation by: tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ - v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4; - /* Use offset version of Estrin wrapper to evaluate from C1 onwards. */ - v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1); - p = v_fma_f64 (p, r2, C (0)); - p = v_fma_f64 (r2, p * r, r); + float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2), + r8 = vmulq_f64 (r4, r4); + /* Offset coefficients to evaluate from C1 onwards. */ + float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1); + p = vfmaq_f64 (dat->poly[0], p, r2); + p = vfmaq_f64 (r, r2, vmulq_f64 (p, r)); /* Recombination uses double-angle formula: tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) and reciprocity around pi/2: tan(x) = 1 / (tan(pi/2 - x)) to assemble result using change-of-sign and conditional selection of - numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */ - v_f64_t n = v_fma_f64 (p, p, v_f64 (-1)); - v_f64_t d = p * 2; + numerator/denominator, dependent on odd/even-ness of q (hence quadrant). 
+ */ + float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p); + float64x2_t d = vaddq_f64 (p, p); - v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0); + uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1)); - return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d); +#if !WANT_SIMD_EXCEPT + uint64x2_t special = vcageq_f64 (x, dat->range_val); + if (unlikely (v_any_u64 (special))) + return special_case (x); +#endif + + return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)), + vbslq_f64 (no_recip, d, n)); } -VPCS_ALIAS PL_SIG (V, D, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME (tan), 2.99) -PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000) -PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000) -PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000) -PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000) -PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000) -PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000) -#endif +PL_TEST_ULP (V_NAME_D1 (tan), 2.99) +PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_data.c b/contrib/arm-optimized-routines/pl/math/v_tan_data.c deleted file mode 100644 index 04e25169bd88..000000000000 --- a/contrib/arm-optimized-routines/pl/math/v_tan_data.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Coefficients and helpers for double-precision vector tan(x) function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "math_config.h" - -const struct v_tan_data __v_tan_data - = {.neg_half_pi_hi = -0x1.921fb54442d18p0, - .neg_half_pi_lo = -0x1.1a62633145c07p-54, - .poly - = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, - 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, - 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c index 828466b03182..98948b0a9ecf 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c +++ b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c @@ -1,131 +1,127 @@ /* * Single-precision vector tan(x) function. * * Copyright (c) 2021-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "estrinf.h" +#include "poly_advsimd_f32.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - -/* Constants. */ -#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f)) -#define NegPio2_2 (v_f32 (0x1.777a5cp-25f)) -#define NegPio2_3 (v_f32 (0x1.ee59dap-50f)) -#define InvPio2 (v_f32 (0x1.45f306p-1f)) -#define RangeVal (0x47000000) /* asuint32(0x1p15f). */ -#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */ -#define Shift (v_f32 (0x1.8p+23f)) -#define AbsMask (v_u32 (0x7fffffff)) +static const struct data +{ + float32x4_t poly[6]; + float32x4_t pi_consts; + float32x4_t shift; +#if !WANT_SIMD_EXCEPT + float32x4_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. */ + .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f), + V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) }, + /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. 
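
The recombination at the end of tan is the step worth spelling out: with p ~ tan(r/2) for the reduced r in [-pi/4, pi/4], the double-angle formula gives tan(r) = 2p/(1 - p^2), while odd quadrants need -1/tan(r) = (p^2 - 1)/(2p); both are quotients of the same two quantities n = p*p - 1 and d = 2*p, so one select per operand feeds a single division. A scalar model using libm tan in place of the polynomial:

#include <math.h>
#include <stdint.h>

double
tan_model (double x)
{
  double q = round (x * 0x1.45f306dc9c883p-1); /* nearest int to 2x/pi.  */
  int64_t qi = (int64_t) q;
  double r = x - q * 0x1.921fb54442d18p0; /* single-word pi/2 reduction.  */
  double p = tan (0.5 * r);               /* stands in for the poly.  */
  double n = p * p - 1.0, d = 2.0 * p;
  /* Even quadrant: tan(r) = -d/n.  Odd quadrant: -1/tan(r) = n/d.  */
  return (qi & 1) ? n / d : -d / n;
}
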
*/
+  .pi_consts
+      = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
+  .shift = V4 (0x1.8p+23f),
+#if !WANT_SIMD_EXCEPT
+  .range_val = V4 (0x1p15f),
+#endif
+};
 
-#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i])
+#define RangeVal v_u32 (0x47000000)  /* asuint32(0x1p15f).  */
+#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f).  */
+#define Thresh v_u32 (0x16000000)    /* asuint32(RangeVal) - TinyBound.  */
 
 /* Special cases (fall back to scalar calls).  */
-VPCS_ATTR
-NOINLINE static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
 {
   return v_call_f32 (tanf, x, y, cmp);
 }
 
 /* Use a full Estrin scheme to evaluate polynomial.  */
-static inline v_f32_t
-eval_poly (v_f32_t z)
+static inline float32x4_t
+eval_poly (float32x4_t z, const struct data *d)
 {
-  v_f32_t z2 = z * z;
+  float32x4_t z2 = vmulq_f32 (z, z);
 #if WANT_SIMD_EXCEPT
-  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
-     are to be triggered correctly, sidestep this by fixing such lanes to 0. */
-  v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound);
+  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+     If fp exceptions are to be triggered correctly,
+     sidestep this by fixing such lanes to 0.  */
+  uint32x4_t will_uflow
+      = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
   if (unlikely (v_any_u32 (will_uflow)))
-    z2 = v_sel_f32 (will_uflow, v_f32 (0), z2);
+    z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
 #endif
-  v_f32_t z4 = z2 * z2;
-  return ESTRIN_5 (z, z2, z4, poly);
+  float32x4_t z4 = vmulq_f32 (z2, z2);
+  return v_estrin_5_f32 (z, z2, z4, d->poly);
 }
 
-/* Fast implementation of Neon tanf.
+/* Fast implementation of AdvSIMD tanf.
    Maximum error is 3.45 ULP:
    __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
                              want 0x1.ff9850p-1.  */
-VPCS_ATTR
-v_f32_t V_NAME (tanf) (v_f32_t x)
+float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
 {
-  v_f32_t special_arg = x;
-  v_u32_t ix = v_as_u32_f32 (x);
-  v_u32_t iax = ix & AbsMask;
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t special_arg = x;
 
   /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
      regression.  */
 #if WANT_SIMD_EXCEPT
+  uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
   /* If fp exceptions are to be triggered correctly, also special-case tiny
      input, as this will lead to overflow later. Fix any special lanes to 1 to
     prevent any exceptions being triggered.  */
-  v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound);
+  uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
   if (unlikely (v_any_u32 (special)))
-    x = v_sel_f32 (special, v_f32 (1.0f), x);
+    x = vbslq_f32 (special, v_f32 (1.0f), x);
 #else
   /* Otherwise, special-case large and special values.  */
-  v_u32_t special = v_cond_u32 (iax >= RangeVal);
+  uint32x4_t special = vcageq_f32 (x, d->range_val);
 #endif
 
   /* n = rint(x/(pi/2)).  */
-  v_f32_t q = v_fma_f32 (InvPio2, x, Shift);
-  v_f32_t n = q - Shift;
-  /* n is representable as a signed integer, simply convert it.  */
-  v_s32_t in = v_round_s32 (n);
+  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
+  float32x4_t n = vsubq_f32 (q, d->shift);
 
   /* Determine if x lives in an interval, where |tan(x)| grows to infinity.  */
-  v_s32_t alt = in & 1;
-  v_u32_t pred_alt = (alt != 0);
+  uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
 
   /* r = x - n * (pi/2) (range reduction into -pi/4 ..
pi/4). */ - v_f32_t r; - r = v_fma_f32 (NegPio2_1, n, x); - r = v_fma_f32 (NegPio2_2, n, r); - r = v_fma_f32 (NegPio2_3, n, r); + float32x4_t r; + r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0); + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1); + r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2); /* If x lives in an interval, where |tan(x)| - is finite, then use a polynomial approximation of the form tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). - grows to infinity then use symmetries of tangent and the identity tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use the same polynomial approximation of tan as above. */ - /* Perform additional reduction if required. */ - v_f32_t z = v_sel_f32 (pred_alt, -r, r); + /* Invert sign of r if odd quadrant. */ + float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1))); /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ - v_f32_t z2 = r * r; - v_f32_t p = eval_poly (z2); - v_f32_t y = v_fma_f32 (z * z2, p, z); + float32x4_t z2 = vmulq_f32 (r, r); + float32x4_t p = eval_poly (z2, d); + float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p); /* Compute reciprocal and apply if required. */ - v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y); - y = v_sel_f32 (pred_alt, inv_y, y); - - /* Fast reduction does not handle the x = -0.0 case well, - therefore it is fixed here. */ - y = v_sel_f32 (x == v_f32 (-0.0), x, y); + float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y); if (unlikely (v_any_u32 (special))) - return specialcase (special_arg, y, special); - return y; + return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special); + return vbslq_f32 (pred_alt, inv_y, y); } -VPCS_ALIAS PL_SIG (V, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME (tanf), 2.96) -PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000) -PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000) -#endif +PL_TEST_ULP (V_NAME_F1 (tan), 2.96) +PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c index c8b6c251d453..5de85c68da2c 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c +++ b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c @@ -1,94 +1,106 @@ /* * Double-precision vector tanh(x) function. * Copyright (c) 2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "estrin.h" +#include "poly_advsimd_f64.h" #include "mathlib.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED +static const struct data +{ + float64x2_t poly[11]; + float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64x2_t onef; + uint64x2_t thresh, tiny_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. 
*/ + .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), + V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), + V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), + V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), + V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, -#define AbsMask v_u64 (0x7fffffffffffffff) -#define InvLn2 v_f64 (0x1.71547652b82fep0) -#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) -#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) -#define Shift v_f64 (0x1.8p52) -#define C(i) v_f64 (__expm1_poly[i]) + .inv_ln2 = V2 (0x1.71547652b82fep0), + .ln2_hi = V2 (-0x1.62e42fefa39efp-1), + .ln2_lo = V2 (-0x1.abc9e3b39803fp-56), + .shift = V2 (0x1.8p52), -#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ -#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ -#define One v_u64 (0x3ff0000000000000) + .onef = V2 (0x3ff0000000000000), + .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ + .thresh = V2 (0x01f241bf835f9d5f), +}; -static inline v_f64_t -expm1_inline (v_f64_t x) +static inline float64x2_t +expm1_inline (float64x2_t x, const struct data *d) { /* Helper routine for calculating exp(x) - 1. Vector port of the helper from the scalar variant of tanh. */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; - v_s64_t i = v_to_s64_f64 (j); - v_f64_t f = v_fma_f64 (j, MLn2hi, x); - f = v_fma_f64 (j, MLn2lo, f); + float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); + int64x2_t i = vcvtq_s64_f64 (j); + float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi); + f = vfmaq_f64 (f, j, d->ln2_lo); /* Approximate expm1(f) using polynomial. */ - v_f64_t f2 = f * f; - v_f64_t f4 = f2 * f2; - v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t p = vfmaq_f64 ( + f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly)); /* t = 2 ^ i. */ - v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + float64x2_t t = vreinterpretq_f64_u64 ( + vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef)); /* expm1(x) = p * t + (t - 1). */ - return v_fma_f64 (p, t, t - 1); + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t); } -static NOINLINE v_f64_t -special_case (v_f64_t x, v_f64_t y, v_u64_t special) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) { return v_call_f64 (tanh, x, y, special); } /* Vector approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.75 ULP: - __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 - want -0x1.ba31ba4691ab4p-3. */ -VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x) + version of expm1. The greatest observed error is 2.77 ULP: + _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) { - v_u64_t ix = v_as_u64_f64 (x); - v_u64_t ia = ix & AbsMask; + const struct data *d = ptr_barrier (&data); - /* Trigger special-cases for tiny, boring and infinity/NaN. */ - v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound)); - v_f64_t u; + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + float64x2_t u = x; + + /* Trigger special-cases for tiny, boring and infinity/NaN. 
*/ + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); +#if WANT_SIMD_EXCEPT /* To trigger fp exceptions correctly, set special lanes to a neutral value. They will be fixed up later by the special-case handler. */ if (unlikely (v_any_u64 (special))) - u = v_sel_f64 (special, v_f64 (1), x) * 2; - else - u = x * 2; + u = v_zerofy_f64 (u, special); +#endif + + u = vaddq_f64 (u, u); /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - v_f64_t q = expm1_inline (u); - v_f64_t y = q / (q + 2); + float64x2_t q = expm1_inline (u, d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2)); if (unlikely (v_any_u64 (special))) - return special_case (x, y, special); - return y; + return special_case (x, vdivq_f64 (q, qp2), special); + return vdivq_f64 (q, qp2); } -VPCS_ALIAS PL_SIG (V, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (tanh), 2.26) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh)) -PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000) -PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000) -PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000) -PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000) -PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000) -#endif +PL_TEST_ULP (V_NAME_D1 (tanh), 2.27) +PL_TEST_EXPECT_FENV (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c index 36166118c0f0..d1cb9fb6eeb3 100644 --- a/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c +++ b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c @@ -1,69 +1,73 @@ /* * Single-precision vector tanh(x) function. * * Copyright (c) 2022-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #include "pl_sig.h" #include "pl_test.h" -#if V_SUPPORTED - #include "v_expm1f_inline.h" -#define BoringBound \ - 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ - negative). */ -#define AbsMask 0x7fffffff +static const struct data +{ + struct v_expm1f_data expm1f_consts; + uint32x4_t boring_bound, large_bound, onef; +} data = { + .expm1f_consts = V_EXPM1F_DATA, + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ + .boring_bound = V4 (0x41102cb3), + .large_bound = V4 (0x7f800000), + .onef = V4 (0x3f800000), +}; -static NOINLINE v_f32_t -special_case (v_f32_t x, v_f32_t y, v_u32_t special) +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { return v_call_f32 (tanhf, x, y, special); } -/* Approximation for single-precision vector tanh(x), using a simplified version - of expm1f. The maximum error is 2.58 ULP: - __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 - want 0x1.f9ba08p-5. */ -VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) +/* Approximation for single-precision vector tanh(x), using a simplified + version of expm1f. The maximum error is 2.58 ULP: + _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. 
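
The identity shared by both tanh variants: writing q = expm1(2x), tanh(x) = (e^2x - 1)/(e^2x + 1) = q/(q + 2), so one expm1-style evaluation and one division produce the result. A scalar model (the vector code additionally handles large and special |x| separately, since expm1 of the doubled argument overflows well before the "boring" bound where tanh rounds to +-1):

#include <math.h>

double
tanh_model (double x)
{
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}
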
*/ +float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x) { - v_u32_t ix = v_as_u32_f32 (x); - v_u32_t iax = ix & AbsMask; - v_u32_t sign = ix & ~AbsMask; - v_u32_t is_boring = v_cond_u32 (iax > BoringBound); - v_f32_t boring = v_as_f32_u32 (sign | One); + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t sign = veorq_u32 (ix, iax); + uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); + float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef)); #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered properly, set all special and boring - lanes to 1, which will trigger no exceptions, and fix them up later. */ - v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000)); - ix = v_sel_u32 (is_boring, v_u32 (One), ix); + lanes to 0, which will trigger no exceptions, and fix them up later. */ + uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound), + vcltq_u32 (iax, v_u32 (0x34000000))); + x = v_zerofy_f32 (x, is_boring); if (unlikely (v_any_u32 (special))) - ix = v_sel_u32 (special, v_u32 (One), ix); + x = v_zerofy_f32 (x, special); #else - v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0)); + uint32x4_t special = vcgtq_u32 (iax, d->large_bound); #endif /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix)); - v_f32_t y = q / (q + 2); - y = v_sel_f32 (is_boring, boring, y); + float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); + float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); if (unlikely (v_any_u32 (special))) - return special_case (x, y, special); - return y; + return special_case (vreinterpretq_f32_u32 (ix), + vbslq_f32 (is_boring, boring, y), special); + return vbslq_f32 (is_boring, boring, y); } -VPCS_ALIAS PL_SIG (V, F, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (V_NAME (tanhf), 2.09) -PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000) -PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000) -PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100) -PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100) -#endif +PL_TEST_ULP (V_NAME_F1 (tanh), 2.09) +PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c deleted file mode 100644 index 649735b140f3..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_acosh. - * - * Copyright (c) 2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh) -#include "v_acosh_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c deleted file mode 100644 index 8c5f106992a7..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_acoshf. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf) -#include "v_acoshf_3u1.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c deleted file mode 100644 index 0d2373b5e4b2..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_asinh. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) -#include "v_asinh_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c deleted file mode 100644 index 6c8927f0875b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_asinhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf) -#include "v_asinhf_2u7.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c deleted file mode 100644 index 925b5b4ef324..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan2. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2) -#include "v_atan2_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c deleted file mode 100644 index 51d33d50f6ef..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan2f. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f) -#include "v_atan2f_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c deleted file mode 100644 index ccebce2dc2ed..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atan. 
- * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan) -#include "v_atan_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c deleted file mode 100644 index b8797276d981..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atanf. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf) -#include "v_atanf_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c deleted file mode 100644 index 19429b209b3a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atanh. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh) -#include "v_atanh_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c deleted file mode 100644 index 7de226dda054..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_atanhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf) -#include "v_atanhf_3u1.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c deleted file mode 100644 index 4cb0dc8cefb5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cbrt. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt) -#include "v_cbrt_2u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c deleted file mode 100644 index 40a72d8c301e..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cbrtf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf) -#include "v_cbrtf_1u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c deleted file mode 100644 index 9bf7f026447a..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cosh. 
- * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh) -#include "v_cosh_2u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c deleted file mode 100644 index b149cb34df61..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_coshf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf) -#include "v_coshf_2u4.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c b/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c deleted file mode 100644 index 95bd141554e4..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erf. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf) -#include "v_erf_2u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c deleted file mode 100644 index 1cf6546ce715..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfc. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc) -#include "v_erfc_4u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c deleted file mode 100644 index ef5a21d6336c..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfcf. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf) -#include "v_erfcf_1u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c deleted file mode 100644 index ee8848ee24ed..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erff. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff) -#include "v_erff_1u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c b/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c deleted file mode 100644 index 52a57feefbff..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_erfc. - * - * Copyright (c) 2019-2023, Arm Limited. 
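An aside on the long run of deletions here: every removed vn_*.c file was a three-line wrapper that defined VPCS, aliased the internal __vn_ symbol to the corresponding _ZGVnN name via PL_ALIAS, and re-included the matching v_*.c implementation. The _ZGVnN2v_/_ZGVnN4v_ prefixes are AArch64 vector function ABI mangling (n: AdvSIMD, N: unmasked, 2 or 4: lanes, v: one vector argument). The reworked routines appear to emit those ABI names directly through V_NAME_D1/V_NAME_F1 (the updated error comments cite _ZGVnN4v_tanhf, for instance), which is what makes the wrappers redundant. The exported symbol each wrapper used to provide can be declared roughly as follows (an illustrative declaration, not one taken from mathlib.h):

#include <arm_neon.h>

/* Two-lane double-precision tanh using the vector PCS.  */
__attribute__ ((aarch64_vector_pcs))
float64x2_t _ZGVnN2v_tanh (float64x2_t);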
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_exp_tail.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expf.c b/contrib/arm-optimized-routines/pl/math/vn_expf.c deleted file mode 100644 index 83e7f0a2070b..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_expf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) -#include "v_expf.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c deleted file mode 100644 index 35111e2fc221..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expm1. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1) -#include "v_expm1_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c deleted file mode 100644 index bea491f4898e..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expm1f. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f) -#include "v_expm1f_1u6.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c deleted file mode 100644 index 5f32c33e059f..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log10. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10) -#include "v_log10_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c deleted file mode 100644 index 2673ef515df7..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log10f. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f) -#include "v_log10f_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c deleted file mode 100644 index 3f4f8d1bd297..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log1p. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p) -#include "v_log1p_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c deleted file mode 100644 index a319bc98f491..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log1pf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf) -#include "v_log1pf_2u1.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c deleted file mode 100644 index a87039204439..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log2. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2) -#include "v_log2_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c deleted file mode 100644 index b4a9cb708bae..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log2f. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) -#include "v_log2f_2u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c deleted file mode 100644 index 7c881de21688..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinh. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh) -#include "v_sinh_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c deleted file mode 100644 index 251e73232d01..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf) -#include "v_sinhf_2u3.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c deleted file mode 100644 index a4efb065bc08..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tan. - * - * Copyright (c) 2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan) -#include "v_tan_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c deleted file mode 100644 index a88cb4077b3d..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanf. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) -#include "v_tanf_3u5.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c deleted file mode 100644 index cb2746cf22a5..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanh. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh) -#include "v_tanh_3u.c" -#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c deleted file mode 100644 index 47f0a7f57d05..000000000000 --- a/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_tanhf. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#include "include/mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf) -#include "v_tanhf_2u6.c" -#endif diff --git a/contrib/arm-optimized-routines/string/aarch64/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h index 069b146f4a69..131b95e1fea9 100644 --- a/contrib/arm-optimized-routines/string/aarch64/asmdefs.h +++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h @@ -1,92 +1,106 @@ /* * Macros for asm code. AArch64 version. * * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _ASMDEFS_H #define _ASMDEFS_H /* Branch Target Identification support. */ #define BTI_C hint 34 #define BTI_J hint 36 /* Return address signing support (pac-ret). */ #define PACIASP hint 25; .cfi_window_save #define AUTIASP hint 29; .cfi_window_save /* GNU_PROPERTY_AARCH64_* macros from elf.h. */ #define FEATURE_1_AND 0xc0000000 #define FEATURE_1_BTI 1 #define FEATURE_1_PAC 2 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#ifdef __ILP32__ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 2; \ + .word 4; \ + .word 12; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .text +#else #define GNU_PROPERTY(type, value) \ .section .note.gnu.property, "a"; \ .p2align 3; \ .word 4; \ .word 16; \ .word 5; \ .asciz "GNU"; \ .word type; \ .word 4; \ .word value; \ .word 0; \ .text +#endif /* If set then the GNU Property Note section will be added to mark objects to support BTI and PAC-RET. */ #ifndef WANT_GNU_PROPERTY #define WANT_GNU_PROPERTY 1 #endif #if WANT_GNU_PROPERTY /* Add property note with supported features to all asm files.
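The two GNU_PROPERTY bodies differ only in note padding: an ELF note is a name-size word (4, for "GNU" plus NUL), a descriptor-size word, a type word (5, NT_GNU_PROPERTY_TYPE_0), the name, then the descriptor, and the descriptor is padded to the ELF word size -- 4 bytes under ILP32, 8 otherwise. Hence the new ILP32 branch uses .p2align 2 with a 12-byte descriptor, while the LP64 branch keeps .p2align 3 with a 16-byte descriptor whose final zero word is padding. The LP64 layout corresponds to roughly this C struct (field names illustrative):

#include <stdint.h>

/* NT_GNU_PROPERTY_TYPE_0 note as emitted by GNU_PROPERTY on LP64.  */
struct gnu_property_note
{
  uint32_t n_namesz;  /* 4: "GNU" plus NUL */
  uint32_t n_descsz;  /* 16: descriptor including trailing pad */
  uint32_t n_type;    /* 5: NT_GNU_PROPERTY_TYPE_0 */
  char name[4];       /* "GNU\0" */
  uint32_t pr_type;   /* FEATURE_1_AND */
  uint32_t pr_datasz; /* 4 */
  uint32_t pr_data;   /* e.g. FEATURE_1_BTI | FEATURE_1_PAC */
  uint32_t pad;       /* zero word, 8-byte descriptor padding */
};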
*/ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #endif #define ENTRY_ALIGN(name, alignment) \ .global name; \ .type name,%function; \ .align alignment; \ name: \ .cfi_startproc; \ BTI_C; #define ENTRY(name) ENTRY_ALIGN(name, 6) #define ENTRY_ALIAS(name) \ .global name; \ .type name,%function; \ name: #define END(name) \ .cfi_endproc; \ .size name, .-name; #define L(l) .L ## l #ifdef __ILP32__ /* Sanitize padding bits of pointer arguments as per aapcs64 */ #define PTR_ARG(n) mov w##n, w##n #else #define PTR_ARG(n) #endif #ifdef __ILP32__ /* Sanitize padding bits of size arguments as per aapcs64 */ #define SIZE_ARG(n) mov w##n, w##n #else #define SIZE_ARG(n) #endif /* Compiler supports SVE instructions */ #ifndef HAVE_SVE # if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) # define HAVE_SVE 1 # else # define HAVE_SVE 0 # endif #endif #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S index e6527d0dac2c..9d3027d4d3cd 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S @@ -1,206 +1,212 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ #include "asmdefs.h" #define dstin x0 #define src x1 #define count x2 #define dst x3 #define srcend x4 #define dstend x5 #define A_l x6 #define A_lw w6 #define A_h x7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_lw w10 #define tmp1 x14 #define A_q q0 #define B_q q1 #define C_q q2 #define D_q q3 #define E_q q4 #define F_q q5 #define G_q q6 #define H_q q7 /* This implementation handles overlaps and supports both memcpy and memmove from a single entry point. It uses unaligned accesses and branchless sequences to keep the code small, simple and improve performance. Copies are split into 3 main cases: small copies of up to 32 bytes, medium copies of up to 128 bytes, and large copies. The overhead of the overlap check is negligible since it is only required for large copies. Large copies use a software pipelined loop processing 64 bytes per iteration. The source pointer is 16-byte aligned to minimize unaligned accesses. The loop tail is handled by always copying 64 bytes from the end. */ ENTRY_ALIAS (__memmove_aarch64_simd) ENTRY (__memcpy_aarch64_simd) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) add srcend, src, count - add dstend, dstin, count cmp count, 128 b.hi L(copy_long) + add dstend, dstin, count cmp count, 32 b.hi L(copy32_128) + nop /* Small copies: 0..32 bytes. */ cmp count, 16 b.lo L(copy16) ldr A_q, [src] ldr B_q, [srcend, -16] str A_q, [dstin] str B_q, [dstend, -16] ret + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 /* Copy 8-15 bytes. */ L(copy16): tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret - .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) ldr A_lw, [src] ldr B_lw, [srcend, -4] str A_lw, [dstin] str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes using a branchless sequence. 
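This 0..3-byte path is moved further down the file by the patch (deleted here, re-added unchanged after the medium-copy code) rather than altered. It covers lengths 1-3 with three unconditional byte moves: the first byte, the last byte, and the byte at count/2, whose stores overlap for the shorter lengths. In C the trick looks roughly like this (a sketch, not code from the file):

#include <stddef.h>

/* Branchless 0..3 byte copy: for n == 1 all three stores hit d[0];
   for n == 2 the midpoint and last stores both hit d[1].  Loads are
   done before stores, so it is memmove-safe like the asm above.  */
static void
copy_0_3 (unsigned char *d, const unsigned char *s, size_t n)
{
  if (n == 0)
    return;                   /* the lone branch, L(copy0) */
  size_t mid = n >> 1;        /* 0, 1, 1 for n = 1, 2, 3 */
  unsigned char a = s[0], b = s[mid], c = s[n - 1];
  d[0] = a;
  d[mid] = b;
  d[n - 1] = c;
}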
*/ -L(copy4): - cbz count, L(copy0) - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb C_lw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] -L(copy0): - ret - - .p2align 4 - /* Medium copies: 33..128 bytes. */ -L(copy32_128): - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - .p2align 4 /* Copy 65..128 bytes. */ L(copy128): ldp E_q, F_q, [src, 32] cmp count, 96 b.ls L(copy96) ldp G_q, H_q, [srcend, -64] stp G_q, H_q, [dstend, -64] L(copy96): stp A_q, B_q, [dstin] stp E_q, F_q, [dstin, 32] stp C_q, D_q, [dstend, -32] ret + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 3 /* Copy more than 128 bytes. */ L(copy_long): + add dstend, dstin, count + /* Use backwards copy if there is an overlap. */ sub tmp1, dstin, src cmp tmp1, count b.lo L(copy_long_backwards) /* Copy 16 bytes and then align src to 16-byte alignment. */ ldr D_q, [src] and tmp1, src, 15 bic src, src, 15 sub dst, dstin, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_q, B_q, [src, 16] str D_q, [dstin] ldp C_q, D_q, [src, 48] subs count, count, 128 + 16 /* Test and readjust count. */ b.ls L(copy64_from_end) L(loop64): stp A_q, B_q, [dst, 16] ldp A_q, B_q, [src, 80] stp C_q, D_q, [dst, 48] ldp C_q, D_q, [src, 112] add src, src, 64 add dst, dst, 64 subs count, count, 64 b.hi L(loop64) /* Write the last iteration and copy 64 bytes from the end. */ L(copy64_from_end): ldp E_q, F_q, [srcend, -64] stp A_q, B_q, [dst, 16] ldp A_q, B_q, [srcend, -32] stp C_q, D_q, [dst, 48] stp E_q, F_q, [dstend, -64] stp A_q, B_q, [dstend, -32] ret + .p2align 4 + nop + /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): cbz tmp1, L(copy0) ldr D_q, [srcend, -16] and tmp1, srcend, 15 bic srcend, srcend, 15 sub count, count, tmp1 ldp A_q, B_q, [srcend, -32] str D_q, [dstend, -16] ldp C_q, D_q, [srcend, -64] sub dstend, dstend, tmp1 subs count, count, 128 b.ls L(copy64_from_start) L(loop64_backwards): str B_q, [dstend, -16] str A_q, [dstend, -32] ldp A_q, B_q, [srcend, -96] str D_q, [dstend, -48] str C_q, [dstend, -64]! ldp C_q, D_q, [srcend, -128] sub srcend, srcend, 64 subs count, count, 64 b.hi L(loop64_backwards) /* Write the last iteration and copy 64 bytes from the start. */ L(copy64_from_start): ldp E_q, F_q, [src, 32] stp A_q, B_q, [dstend, -32] ldp A_q, B_q, [src] stp C_q, D_q, [dstend, -64] stp E_q, F_q, [dstin, 32] stp A_q, B_q, [dstin] ret END (__memcpy_aarch64_simd) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S new file mode 100644 index 000000000000..b45c31418717 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S @@ -0,0 +1,21 @@ +/* + * memcpy using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memcpy_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ + .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ + .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! 
*/ + ret + +END (__memcpy_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S new file mode 100644 index 000000000000..6c73017bb16f --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S @@ -0,0 +1,21 @@ +/* + * memmove using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memmove_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ + .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ + .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ + ret + +END (__memmove_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-mops.S b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S new file mode 100644 index 000000000000..ec791493bae9 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S @@ -0,0 +1,20 @@ +/* + * memset using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memset_aarch64_mops) + PTR_ARG (0) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ + .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ + .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ + ret + +END (__memset_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/bench/memcpy.c b/contrib/arm-optimized-routines/string/bench/memcpy.c index 1468663e51cd..b628f9b60d96 100644 --- a/contrib/arm-optimized-routines/string/bench/memcpy.c +++ b/contrib/arm-optimized-routines/string/bench/memcpy.c @@ -1,339 +1,342 @@ /* * memcpy benchmark. * - * Copyright (c) 2020-2022, Arm Limited. + * Copyright (c) 2020-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE #include #include #include #include #include "stringlib.h" #include "benchlib.h" #define ITERS 5000 #define ITERS2 20000000 #define ITERS3 200000 #define NUM_TESTS 16384 #define MIN_SIZE 32768 #define MAX_SIZE (1024 * 1024) static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); #define F(x) {#x, x}, static const struct fun { const char *name; void *(*fun)(void *, const void *, size_t); } funtab[] = { #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve) # endif +# if WANT_MOPS + F(__memcpy_aarch64_mops) +# endif #elif __arm__ F(__memcpy_arm) #endif F(memcpy) #undef F {0, 0} }; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; #define SIZE_NUM 65536 #define SIZE_MASK (SIZE_NUM-1) static uint8_t size_arr[SIZE_NUM]; /* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. 
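All three MOPS routines follow the same architected pattern: a prologue/main/epilogue instruction triple (cpyfp/cpyfm/cpyfe for memcpy, cpyp/cpym/cpye for memmove, setp/setm/sete for memset) operating on the x3/x1/x2 register group, with the destination first copied to x3 so that x0 survives untouched as the return value. The raw .inst encodings presumably keep the files assembling with toolchains that do not yet know FEAT_MOPS. Nothing in this patch selects the routines at run time; a hedged sketch of how a caller on Linux might do so, assuming getauxval and an HWCAP2_MOPS definition are available from the system headers:

#include <string.h>
#include <sys/auxv.h>   /* getauxval, AT_HWCAP2 (Linux) */

typedef void *(*memcpy_fn) (void *, const void *, size_t);

void *__memcpy_aarch64_mops (void *, const void *, size_t);

/* Use the MOPS copy only when the kernel advertises FEAT_MOPS.
   HWCAP2_MOPS is assumed to come from the libc/kernel headers; the
   guard keeps the sketch compiling where it does not exist.  */
static memcpy_fn
choose_memcpy (void)
{
#ifdef HWCAP2_MOPS
  if (getauxval (AT_HWCAP2) & HWCAP2_MOPS)
    return __memcpy_aarch64_mops;
#endif
  return memcpy;
}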
*/ static freq_data_t size_freq[] = { {32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035}, { 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721}, {120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460}, { 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303}, { 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185}, {192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96}, {104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68}, { 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47}, { 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35}, { 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22}, { 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15}, { 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11}, { 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9}, {136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8}, {273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6}, {504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3}, {512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2}, { 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2}, { 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1}, {248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1}, { 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1}, { 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1}, { 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1}, {2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1}, { 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1}, {122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1}, {984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1}, {116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1}, {100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1}, {488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1}, { 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0} }; #define ALIGN_NUM 1024 #define ALIGN_MASK (ALIGN_NUM-1) static uint8_t src_align_arr[ALIGN_NUM]; static uint8_t dst_align_arr[ALIGN_NUM]; /* Source alignment frequency for memcpy based on SPEC2017. */ static align_data_t src_align_freq[] = { {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0} }; static align_data_t dst_align_freq[] = { {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0} }; typedef struct { uint64_t src : 24; uint64_t dst : 24; uint64_t len : 16; } copy_t; static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); static void init_copy_distribution (void) { int i, j, freq, size, n; for (n = i = 0; (freq = size_freq[i].freq) != 0; i++) for (j = 0, size = size_freq[i].size; j < freq; j++) size_arr[n++] = size; assert (n == SIZE_NUM); for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++) for (j = 0, size = src_align_freq[i].align; j < freq; j++) src_align_arr[n++] = size - 1; assert (n == ALIGN_NUM); for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++) for (j = 0, size = dst_align_freq[i].align; j < freq; j++) dst_align_arr[n++] = size - 1; assert (n == ALIGN_NUM); } static size_t init_copies (size_t max_size) { size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. 
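Each entry draws destination and source offsets uniformly below max_size, clears low address bits according to the SPEC2017-derived alignment tables, and takes its length from the size table, so the generated mix follows the measured size and alignment distributions.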
*/ for (int i = 0; i < NUM_TESTS; i++) { test_arr[i].dst = (rand32 (0) & (max_size - 1)); test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; test_arr[i].src = (rand32 (0) & (max_size - 1)); test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; total += test_arr[i].len; } return total; } int main (void) { init_copy_distribution (); memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); printf("Random memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; printf ("%22s ", funtab[f].name); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); } printf( "avg %.2f\n", (double)total / tsum); } size_t total = 0; uint64_t tsum = 0; printf ("%22s ", "memcpy_call"); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); } printf( "avg %.2f\n", (double)total / tsum); printf ("\nAligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } printf ("%22s ", "memcpy_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) memcpy (b, a, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); printf ("\nUnaligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b + 3, a + 1, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } printf ("%22s ", "memcpy_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) memcpy (b + 3, a + 1, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); printf ("\nLarge memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("%22s ", "memcpy_call"); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); 
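/* Timed region: ITERS3 back-to-back copies at each size.  Every
   benchmark in this file has the same shape -- an optional warm-up
   pass, clock_get_ns () around the hot loop, and throughput printed
   as bytes copied per nanosecond.  */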
for (int i = 0; i < ITERS3; i++) memcpy (b, a, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); printf ("\nUnaligned forwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("\nUnaligned backwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("\n"); return 0; } diff --git a/contrib/arm-optimized-routines/string/include/stringlib.h b/contrib/arm-optimized-routines/string/include/stringlib.h index f41a46446888..01da7ebfc18d 100644 --- a/contrib/arm-optimized-routines/string/include/stringlib.h +++ b/contrib/arm-optimized-routines/string/include/stringlib.h @@ -1,67 +1,72 @@ /* * Public API. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include /* restrict is not needed, but kept for documenting the interface contract. */ #ifndef __restrict # define __restrict #endif #if __aarch64__ void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64 (void *, const void *, size_t); void *__memset_aarch64 (void *, int, size_t); void *__memchr_aarch64 (const void *, int, size_t); void *__memrchr_aarch64 (const void *, int, size_t); int __memcmp_aarch64 (const void *, const void *, size_t); char *__strcpy_aarch64 (char *__restrict, const char *__restrict); char *__stpcpy_aarch64 (char *__restrict, const char *__restrict); int __strcmp_aarch64 (const char *, const char *); char *__strchr_aarch64 (const char *, int); char *__strrchr_aarch64 (const char *, int); char *__strchrnul_aarch64 (const char *, int ); size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); char *__strrchr_aarch64_sve (const char *, int); char *__strchrnul_aarch64_sve (const char *, int ); int __strcmp_aarch64_sve (const char *, const char *); char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict); char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict); size_t __strlen_aarch64_sve (const char *); size_t 
__strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if WANT_MOPS +void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_mops (void *, int, size_t); +# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); # endif #elif __arm__ void *__memcpy_arm (void *__restrict, const void *__restrict, size_t); void *__memset_arm (void *, int, size_t); void *__memchr_arm (const void *, int, size_t); char *__strcpy_arm (char *__restrict, const char *__restrict); int __strcmp_arm (const char *, const char *); int __strcmp_armv6m (const char *, const char *); size_t __strlen_armv6t2 (const char *); #endif diff --git a/contrib/arm-optimized-routines/string/test/memcpy.c b/contrib/arm-optimized-routines/string/test/memcpy.c index fa15a95b2bda..dc95844bd45a 100644 --- a/contrib/arm-optimized-routines/string/test/memcpy.c +++ b/contrib/arm-optimized-routines/string/test/memcpy.c @@ -1,123 +1,126 @@ /* * memcpy test. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (void *, const void *, size_t); int test_mte; } funtab[] = { // clang-format off F(memcpy, 0) #if __aarch64__ F(__memcpy_aarch64, 1) # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve, 1) # endif +# if WANT_MOPS + F(__memcpy_aarch64_mops, 1) +# endif #elif __arm__ F(__memcpy_arm, 0) #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *dbuf; static unsigned char *sbuf; static unsigned char wbuf[LEN + 2 * A]; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int dalign, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *dst = alignup (dbuf); unsigned char *want = wbuf; unsigned char *s = src + salign; unsigned char *d = dst + dalign; unsigned char *w = want + dalign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= A || salign >= A) abort (); for (i = 0; i < len + A; i++) { src[i] = '?'; want[i] = dst[i] = '*'; } for (i = 0; i < len; i++) s[i] = w[i] = 'a' + i % 23; s = tag_buffer (s, len, fun->test_mte); d = tag_buffer (d, len, fun->test_mte); p = fun->fun (d, s, len); untag_buffer (s, len, fun->test_mte); untag_buffer (d, len, fun->test_mte); if (p != d) ERR ("%s(%p,..) returned %p\n", fun->name, d, p); for (i = 0; i < len + A; i++) { if (dst[i] != want[i]) { ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + A, i); quoteat ("want", want, len + A, i); break; } } } int main () { dbuf = mte_mmap (LEN + 2 * A); sbuf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; for (n = 0; n < 100; n++) test (funtab + i, d, s, n); for (; n < LEN; n *= 2) test (funtab + i, d, s, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? 
"FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memmove.c b/contrib/arm-optimized-routines/string/test/memmove.c index 5d509c03affa..b85dd1e864ef 100644 --- a/contrib/arm-optimized-routines/string/test/memmove.c +++ b/contrib/arm-optimized-routines/string/test/memmove.c @@ -1,167 +1,170 @@ /* * memmove test. * - * Copyright (c) 2019-2022, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (void *, const void *, size_t); int test_mte; } funtab[] = { // clang-format off F(memmove, 0) #if __aarch64__ F(__memmove_aarch64, 1) # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif # if __ARM_FEATURE_SVE F(__memmove_aarch64_sve, 1) # endif +# if WANT_MOPS + F(__memmove_aarch64_mops, 1) +# endif #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *dbuf; static unsigned char *sbuf; static unsigned char wbuf[LEN + 2 * A]; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int dalign, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *dst = alignup (dbuf); unsigned char *want = wbuf; unsigned char *s = src + salign; unsigned char *d = dst + dalign; unsigned char *w = want + dalign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= A || salign >= A) abort (); for (i = 0; i < len + A; i++) { src[i] = '?'; want[i] = dst[i] = '*'; } for (i = 0; i < len; i++) s[i] = w[i] = 'a' + i % 23; p = fun->fun (d, s, len); if (p != d) ERR ("%s(%p,..) returned %p\n", fun->name, d, p); for (i = 0; i < len + A; i++) { if (dst[i] != want[i]) { ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + A, i); quoteat ("want", want, len + A, i); break; } } } static void test_overlap (const struct fun *fun, int dalign, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *dst = src; unsigned char *want = wbuf; unsigned char *s = src + salign; unsigned char *d = dst + dalign; unsigned char *w = wbuf + dalign; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= A || salign >= A) abort (); for (int i = 0; i < len + A; i++) src[i] = want[i] = '?'; for (int i = 0; i < len; i++) s[i] = want[salign + i] = 'a' + i % 23; for (int i = 0; i < len; i++) w[i] = s[i]; s = tag_buffer (s, len, fun->test_mte); d = tag_buffer (d, len, fun->test_mte); p = fun->fun (d, s, len); untag_buffer (s, len, fun->test_mte); untag_buffer (d, len, fun->test_mte); if (p != d) ERR ("%s(%p,..) 
returned %p\n", fun->name, d, p); for (int i = 0; i < len + A; i++) { if (dst[i] != want[i]) { ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + A, i); quoteat ("want", want, len + A, i); break; } } } int main () { dbuf = mte_mmap (LEN + 2 * A); sbuf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; for (n = 0; n < 100; n++) { test (funtab + i, d, s, n); test_overlap (funtab + i, d, s, n); } for (; n < LEN; n *= 2) { test (funtab + i, d, s, n); test_overlap (funtab + i, d, s, n); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memset.c b/contrib/arm-optimized-routines/string/test/memset.c index 5543f44bb026..7d09c267ffec 100644 --- a/contrib/arm-optimized-routines/string/test/memset.c +++ b/contrib/arm-optimized-routines/string/test/memset.c @@ -1,129 +1,132 @@ /* * memset test. * - * Copyright (c) 2019-2020, Arm Limited. + * Copyright (c) 2019-2023, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (void *s, int c, size_t n); int test_mte; } funtab[] = { // clang-format off F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if WANT_MOPS + F(__memset_aarch64_mops, 1) +# endif #elif __arm__ F(__memset_arm, 0) #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int salign, int c, int len) { unsigned char *src = alignup (sbuf); unsigned char *s = src + salign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || salign >= A) abort (); for (i = 0; i < len + A; i++) src[i] = '?'; for (i = 0; i < len; i++) s[i] = 'a' + i % 23; s = tag_buffer (s, len, fun->test_mte); p = fun->fun (s, c, len); untag_buffer (s, len, fun->test_mte); if (p != s) ERR ("%s(%p,..) returned %p\n", fun->name, s, p); for (i = 0; i < salign; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); quoteat ("got", src, len + A, i); return; } } for (; i < salign + len; i++) { if (src[i] != (unsigned char) c) { ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); quoteat ("got", src, len + A, i); return; } } for (; i < len + A; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); quoteat ("got", src, len + A, i); return; } } } int main () { sbuf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int s = 0; s < A; s++) { int n; for (n = 0; n < 100; n++) { test (funtab + i, s, 0, n); test (funtab + i, s, 0x25, n); test (funtab + i, s, 0xaa25, n); } for (; n < LEN; n *= 2) { test (funtab + i, s, 0, n); test (funtab + i, s, 0x25, n); test (funtab + i, s, 0xaa25, n); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; }