diff --git a/contrib/arm-optimized-routines/LICENSE b/contrib/arm-optimized-routines/LICENSE index 2543b82ed92d..20a4b7717cf5 100644 --- a/contrib/arm-optimized-routines/LICENSE +++ b/contrib/arm-optimized-routines/LICENSE @@ -1,21 +1,249 @@ +MIT OR Apache-2.0 WITH LLVM-exception +===================================== + + MIT License +----------- -Copyright (c) 1999-2019, Arm Limited. +Copyright (c) 1999-2022, Arm Limited. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Apache-2.0 WITH LLVM-exception +------------------------------ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
+ +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. diff --git a/contrib/arm-optimized-routines/MAINTAINERS b/contrib/arm-optimized-routines/MAINTAINERS new file mode 100644 index 000000000000..6c5823a8dbce --- /dev/null +++ b/contrib/arm-optimized-routines/MAINTAINERS @@ -0,0 +1,12 @@ +/ + Szabolcs Nagy +math/ + Szabolcs Nagy +networking/ + Szabolcs Nagy +pl/ + Pierre Blanchard + Joe Ramsay +string/ + Szabolcs Nagy + Wilco Dijkstra diff --git a/contrib/arm-optimized-routines/Makefile b/contrib/arm-optimized-routines/Makefile index 169f89e2c9d6..c487896728c2 100644 --- a/contrib/arm-optimized-routines/Makefile +++ b/contrib/arm-optimized-routines/Makefile @@ -1,89 +1,92 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception srcdir = . prefix = /usr bindir = $(prefix)/bin libdir = $(prefix)/lib includedir = $(prefix)/include # Configure these in config.mk, do not make changes in this file. 
SUBS = math string networking +PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = HOST_LDLIBS = EMULATOR = CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) +CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar RANLIB = $(CROSS_COMPILE)ranlib INSTALL = install all: -include config.mk $(foreach sub,$(SUBS),$(eval include $(srcdir)/$(sub)/Dir.mk)) # Required targets of subproject foo: # all-foo # check-foo # clean-foo # install-foo # Required make variables of subproject foo: # foo-files: Built files (all in build/). # Make variables used by subproject foo: # foo-...: Variables defined in foo/Dir.mk or by config.mk. all: $(SUBS:%=all-%) ALL_FILES = $(foreach sub,$(SUBS),$($(sub)-files)) DIRS = $(sort $(patsubst %/,%,$(dir $(ALL_FILES)))) $(ALL_FILES): | $(DIRS) $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) +$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< build/%.o: $(srcdir)/%.c $(CC) $(CFLAGS_ALL) -c -o $@ $< build/%.os: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< build/%.os: $(srcdir)/%.c $(CC) $(CFLAGS_ALL) -c -o $@ $< clean: $(SUBS:%=clean-%) rm -rf build distclean: clean rm -f config.mk $(DESTDIR)$(bindir)/%: build/bin/% $(INSTALL) -D $< $@ $(DESTDIR)$(libdir)/%.so: build/lib/%.so $(INSTALL) -D $< $@ $(DESTDIR)$(libdir)/%: build/lib/% $(INSTALL) -m 644 -D $< $@ $(DESTDIR)$(includedir)/%: build/include/% $(INSTALL) -m 644 -D $< $@ install: $(SUBS:%=install-%) check: $(SUBS:%=check-%) .PHONY: all clean distclean install check diff --git a/contrib/arm-optimized-routines/README b/contrib/arm-optimized-routines/README index 9e1a34fdc65d..a2143a28488a 100644 --- a/contrib/arm-optimized-routines/README +++ b/contrib/arm-optimized-routines/README @@ -1,56 +1,60 @@ Arm Optimized Routines ---------------------- This repository contains 
implementations of library functions -provided by Arm under MIT License (See LICENSE). Contributions -to this project are accepted, but Contributors have to sign an -Assignment Agreement, please follow the instructions in +provided by Arm. The outbound license is available under a dual +license, at the user’s election, as reflected in the LICENSE file. +Contributions to this project are accepted, but Contributors have +to sign an Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code -to projects that require copyright assignment is possible. +to projects that require copyright assignment is possible. Further +contribution requirements are documented in README.contributors of +the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest -release is v21.02. +release is v23.01. Source code layout: build/ - build directory (created by make). math/ - math subproject sources. math/include/ - math library public headers. math/test/ - math test and benchmark related sources. math/tools/ - tools used for designing the algorithms. networking/ - networking subproject sources. networking/include/ - networking library public headers. networking/test/ - networking test and benchmark related sources. string/ - string routines subproject sources. string/include/ - string library public headers. string/test/ - string test and benchmark related sources. +pl/... - separately maintained performance library code. The steps to build the target libraries and run the tests: cp config.mk.dist config.mk # edit config.mk if necessary ... make make check Or building outside of the source directory: ln -s path/to/src/Makefile Makefile cp path/to/src/config.mk.dist config.mk echo 'srcdir = path/to/src' >> config.mk # further edits to config.mk make make check Or building and testing the math subproject only: make all-math make check-math The test system requires libmpfr and libmpc. 
For example on debian linux they can be installed as: sudo apt-get install libmpfr-dev libmpc-dev For cross build, CROSS_COMPILE should be set in config.mk and EMULATOR should be set for cross testing (e.g. using qemu-user or remote access to a target machine), see the examples in config.mk.dist. diff --git a/contrib/arm-optimized-routines/README.contributors b/contrib/arm-optimized-routines/README.contributors new file mode 100644 index 000000000000..f8fcdde432e1 --- /dev/null +++ b/contrib/arm-optimized-routines/README.contributors @@ -0,0 +1,44 @@ +GENERIC CONTRIBUTION GUIDELINES +=============================== + +1. Sub-projects are maintained independently and thus have independent + contribution rules. If there exists a README.contributors in the + sub-directory to which the contribution is made, it must be followed. + +2. Legal: + - Contributors who are not employed by Arm must sign an Assignment Agreement. + See contributor-agreement.pdf. + - All code must be copyright owned by Arm Limited and the appropriate + copyright notice and license identifier must be present in every source + file. + +3. Build: + - Build should only depend on GNU make and posix utilities (shell, awk, sed, + etc) and on a C toolchain. + - Build should pass with the default configuration (see config.mk.dist) + and other supported configurations, with both gcc and clang based + toolchains. (The build should not depend on a recent toolchain, the use + of a new feature should be possible to disable.) + - Currently there is no automated configuration, target specific configuration + should be done via make variables in config.mk. This is the user interface + to the build system, so it should be documented in sufficient detail and + kept reasonably stable. + +4. Testing: + - On aarch64 the tests must pass. If the code may behave differently under + some supported configurations (e.g. CFLAGS) those should be tested. 
+ - New symbols are expected to have new associated test code and ideally + benchmark code too. + +4. Commits: + - Commit message should be descriptive and should not refer to Arm internal + information (such as Jira tickets, or internal discussions). Non-obvious + decisions should be recorded or explained in the commit message if they are + not explained in source comments. + - Ideally tools and scripts used to write the code should be added to the + repository or at least mentioned in the commit. + - Logically independent changes should not be mixed into the same commit. + +5. Style: + - Unless otherwise required differently by the sub-project, follow the + clang-format tool using the style from the gcc contrib/ directory. diff --git a/contrib/arm-optimized-routines/config.mk.dist b/contrib/arm-optimized-routines/config.mk.dist index 177e1ac4f53a..7a8497507a81 100644 --- a/contrib/arm-optimized-routines/config.mk.dist +++ b/contrib/arm-optimized-routines/config.mk.dist @@ -1,73 +1,93 @@ # Example config.mk # -# Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build SUBS = math string networking +# Subsubprojects to build if subproject pl is built +PLSUBS = math + # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 # Use for cross compilation with gcc. #CROSS_COMPILE = aarch64-none-linux-gnu- # Compiler for the target CC = $(CROSS_COMPILE)gcc CFLAGS = -std=c99 -pipe -O3 CFLAGS += -Wall -Wno-missing-braces CFLAGS += -Werror=implicit-function-declaration # Used for test case generator that is executed on the host HOST_CC = gcc HOST_CFLAGS = -std=c99 -O2 HOST_CFLAGS += -Wall -Wno-unused-function # Enable debug info. HOST_CFLAGS += -g CFLAGS += -g # Optimize the shared libraries on aarch64 assuming they fit in 1M. #CFLAGS_SHARED = -fPIC -mcmodel=tiny # Enable MTE support. 
#CFLAGS += -march=armv8.5-a+memtag -DWANT_MTE_TEST=1 # Use with cross testing. #EMULATOR = qemu-aarch64-static #EMULATOR = sh -c 'scp $$1 user@host:/dir && ssh user@host /dir/"$$@"' -- # Additional flags for subprojects. math-cflags = math-ldlibs = math-ulpflags = math-testflags = string-cflags = networking-cflags = # Use if mpfr is available on the target for ulp error checking. #math-ldlibs += -lmpfr -lgmp #math-cflags += -DUSE_MPFR # Use with gcc. math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast # Disable vector math code #math-cflags += -DWANT_VMATH=0 +# Disable/enable SVE vector math code and tests +WANT_SVE_MATH = 0 +ifeq ($(WANT_SVE_MATH), 1) + math-cflags += -march=armv8.2-a+sve +endif +math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) + +# If defined to 1, set errno in math functions according to ISO C. Many math +# libraries do not set errno, so this is 0 by default. It may need to be +# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. +WANT_ERRNO = 0 +math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) + +# If set to 1, set fenv in vector math routines. +WANT_SIMD_EXCEPT = 0 +math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT) + # Disable fenv checks #math-ulpflags = -q -f #math-testflags = -nostatus # Remove GNU Property Notes from asm files. #string-cflags += -DWANT_GNU_PROPERTY=0 # Enable assertion checks. #networking-cflags += -DWANT_ASSERT # Avoid auto-vectorization of scalar code and unroll loops networking-cflags += -O2 -fno-tree-vectorize -funroll-loops diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk index 3b841ab71955..2a9cad10d96a 100644 --- a/contrib/arm-optimized-routines/math/Dir.mk +++ b/contrib/arm-optimized-routines/math/Dir.mk @@ -1,110 +1,115 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019, Arm Limited. 
-# SPDX-License-Identifier: MIT +# Copyright (c) 2019-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/math B := build/math math-lib-srcs := $(wildcard $(S)/*.[cS]) math-test-srcs := \ $(S)/test/mathtest.c \ $(S)/test/mathbench.c \ $(S)/test/ulp.c \ math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS]) math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) +math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h)) math-libs := \ build/lib/libmathlib.so \ build/lib/libmathlib.a \ math-tools := \ build/bin/mathtest \ build/bin/mathbench \ build/bin/mathbench_libc \ build/bin/runulp.sh \ build/bin/ulp \ math-host-tools := \ build/bin/rtest \ math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs))) math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs))) math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) math-target-objs := $(math-lib-objs) $(math-test-objs) math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) math-files := \ $(math-objs) \ $(math-libs) \ $(math-tools) \ $(math-host-tools) \ $(math-includes) \ + $(math-test-includes) \ -all-math: $(math-libs) $(math-tools) $(math-includes) +all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) -$(math-objs): $(math-includes) +$(math-objs): $(math-includes) $(math-test-includes) $(math-objs): CFLAGS_ALL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) $(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS) $(B)/test/ulp.o: $(S)/test/ulp.h build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ build/lib/libmathlib.a: $(math-lib-objs) rm -f $@ $(AR) rc $@ $^ $(RANLIB) $@ $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc $(math-tools): LDLIBS += $(math-ldlibs) -lm build/bin/rtest: $(math-host-objs) $(HOST_CC) $(HOST_CFLAGS) 
$(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) # This is not ideal, but allows custom symbols in mathbench to get resolved. build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/include/%.h: $(S)/include/%.h cp $< $@ +build/include/test/%.h: $(S)/test/%.h + cp $< $@ + build/bin/%.sh: $(S)/test/%.sh cp $< $@ math-tests := $(wildcard $(S)/test/testcases/directed/*.tst) math-rtests := $(wildcard $(S)/test/testcases/random/*.tst) check-math-test: $(math-tools) cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR) + ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp install-math: \ $(math-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ $(math-includes:build/include/%=$(DESTDIR)$(includedir)/%) clean-math: rm -f $(math-files) .PHONY: all-math check-math-test check-math-rtest check-math-ulp check-math install-math clean-math diff --git a/contrib/arm-optimized-routines/math/README.contributors b/contrib/arm-optimized-routines/math/README.contributors new file mode 100644 index 000000000000..33e7ba376e41 --- /dev/null +++ b/contrib/arm-optimized-routines/math/README.contributors @@ -0,0 +1,78 @@ +STYLE REQUIREMENTS +================== + +1. 
Most code in this sub-directory is expected to be upstreamed into glibc so + the GNU Coding Standard and glibc specific conventions should be followed + to ease upstreaming. + +2. ABI and symbols: the code should be written so it is suitable for inclusion + into a libc with minimal changes. This e.g. means that internal symbols + should be hidden and in the implementation reserved namespace according to + ISO C and POSIX rules. If possible the built shared libraries and static + library archives should be usable to override libc symbols at link time (or + at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI + (other than symbol versioning), this cannot be done reliably for static + linking so this is a best effort requirement. + +3. API: include headers should be suitable for benchmarking and testing code + and should not conflict with libc headers. + + +CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY +============================================== + +1. Math functions have quality and performance requirements. + +2. Quality: + - Worst-case ULP error should be small in the entire input domain (for most + common double precision scalar functions the target is < 0.66 ULP error, + and < 1 ULP for single precision, even performance optimized function + variant should not have > 5 ULP error if the goal is to be a drop in + replacement for a standard math function), this should be tested + statistically (or on all inputs if possible in reasonable amount of time). + The ulp tool is for this and runulp.sh should be updated for new functions. + + - All standard rounding modes need to be supported but in non-default rounding + modes the quality requirement can be relaxed. (Non-nearest rounded + computation can be slow and inaccurate but has to be correct for conformance + reasons.) 
+ + - Special cases and error handling need to follow ISO C Annex F requirements, + POSIX requirements, IEEE 754-2008 requirements and Glibc requiremnts: + https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions + this should be tested by direct tests (glibc test system may be used for it). + + - Error handling code should be decoupled from the approximation code as much + as possible. (There are helper functions, these take care of errno as well + as exception raising.) + + - Vector math code does not need to work in non-nearest rounding mode and error + handling side effects need not happen (fenv exceptions and errno), but the + result should be correct (within quality requirements, which are lower for + vector code than for scalar code). + + - Error bounds of the approximation should be clearly documented. + + - The code should build and pass tests on arm, aarch64 and x86_64 GNU linux + systems. (Routines and features can be disabled on specific targets, but + the build must complete). On aarch64, both little- and big-endian targets + are supported as well as valid combinations of architecture extensions. + The configurations that should be tested depend on the contribution. + +3. Performance: + - Common math code should be benchmarked on modern aarch64 microarchitectures + over typical inputs. + + - Performance improvements should be documented (relative numbers can be + published; it is enough to use the mathbench microbenchmark tool which should + be updated for new functions). + + - Attention should be paid to the compilation flags: for aarch64 fma + contraction should be on and math errno turned off so some builtins can be + inlined. + + - The code should be reasonably performant on x86_64 too, e.g. some rounding + instructions and fma may not be available on x86_64, such builtins turn into + libc calls with slow code. 
Such slowdown is not acceptable, a faster fallback + should be present: glibc and bionic use the same code on all targets. (This + does not apply to vector math code). diff --git a/contrib/arm-optimized-routines/math/cosf.c b/contrib/arm-optimized-routines/math/cosf.c index 67a3798b573e..6293ce8f1b7d 100644 --- a/contrib/arm-optimized-routines/math/cosf.c +++ b/contrib/arm-optimized-routines/math/cosf.c @@ -1,63 +1,63 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" #include "sincosf.h" /* Fast cosf implementation. Worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23. A single-step range reduction is used for small values. Large inputs have their range reduced using fast integer arithmetic. */ float cosf (float y) { double x = y; double s; int n; const sincos_t *p = &__sincosf_table[0]; if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; if (unlikely (abstop12 (y) < abstop12 (0x1p-12f))) return 1.0f; return sinf_poly (x, x2, p, 1); } else if (likely (abstop12 (y) < abstop12 (120.0f))) { x = reduce_fast (x, p, &n); /* Setup the signs for sin and cos. */ s = p->sign[n & 3]; if (n & 2) p = &__sincosf_table[1]; return sinf_poly (x * s, x * x, p, n ^ 1); } else if (abstop12 (y) < abstop12 (INFINITY)) { uint32_t xi = asuint (y); int sign = xi >> 31; x = reduce_large (xi, &n); /* Setup signs for sin and cos - include original sign. 
*/ s = p->sign[(n + sign) & 3]; if ((n + sign) & 2) p = &__sincosf_table[1]; return sinf_poly (x * s, x * x, p, n ^ 1); } else return __math_invalidf (y); } diff --git a/contrib/arm-optimized-routines/math/erf.c b/contrib/arm-optimized-routines/math/erf.c index 12d7e5160df7..5f9f40dda264 100644 --- a/contrib/arm-optimized-routines/math/erf.c +++ b/contrib/arm-optimized-routines/math/erf.c @@ -1,244 +1,244 @@ /* * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include #include #define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 #define C 0x1.b0ac16p-1 #define PA __erf_data.erf_poly_A #define NA __erf_data.erf_ratio_N_A #define DA __erf_data.erf_ratio_D_A #define NB __erf_data.erf_ratio_N_B #define DB __erf_data.erf_ratio_D_B #define PC __erf_data.erfc_poly_C #define PD __erf_data.erfc_poly_D #define PE __erf_data.erfc_poly_E #define PF __erf_data.erfc_poly_F /* Top 32 bits of a double. */ static inline uint32_t top32 (double x) { return asuint64 (x) >> 32; } /* Fast erf implementation using a mix of rational and polynomial approximations. Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */ double erf (double x) { /* Get top word and sign. */ uint32_t ix = top32 (x); uint32_t ia = ix & 0x7fffffff; uint32_t sign = ix >> 31; /* Normalized and subnormal cases */ if (ia < 0x3feb0000) { /* a = |x| < 0.84375. */ if (ia < 0x3e300000) { /* a < 2^(-28). */ if (ia < 0x00800000) { /* a < 2^(-1015). */ double y = fma (TwoOverSqrtPiMinusOne, x, x); return check_uflow (y); } return x + TwoOverSqrtPiMinusOne * x; } double x2 = x * x; if (ia < 0x3fe00000) { /* a < 0.5 - Use polynomial approximation. 
*/ double r1 = fma (x2, PA[1], PA[0]); double r2 = fma (x2, PA[3], PA[2]); double r3 = fma (x2, PA[5], PA[4]); double r4 = fma (x2, PA[7], PA[6]); double r5 = fma (x2, PA[9], PA[8]); double x4 = x2 * x2; double r = r5; r = fma (x4, r, r4); r = fma (x4, r, r3); r = fma (x4, r, r2); r = fma (x4, r, r1); return fma (r, x, x); /* This fma is crucial for accuracy. */ } else { /* 0.5 <= a < 0.84375 - Use rational approximation. */ double x4, x8, r1n, r2n, r1d, r2d, r3d; r1n = fma (x2, NA[1], NA[0]); x4 = x2 * x2; r2n = fma (x2, NA[3], NA[2]); x8 = x4 * x4; r1d = fma (x2, DA[0], 1.0); r2d = fma (x2, DA[2], DA[1]); r3d = fma (x2, DA[4], DA[3]); double P = r1n + x4 * r2n + x8 * NA[4]; double Q = r1d + x4 * r2d + x8 * r3d; return fma (P / Q, x, x); } } else if (ia < 0x3ff40000) { /* 0.84375 <= |x| < 1.25. */ double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d; double a = fabs (x) - 1.0; r1n = fma (a, NB[1], NB[0]); a2 = a * a; r1d = fma (a, DB[0], 1.0); a4 = a2 * a2; r2n = fma (a, NB[3], NB[2]); a6 = a4 * a2; r2d = fma (a, DB[2], DB[1]); r3n = fma (a, NB[5], NB[4]); r3d = fma (a, DB[4], DB[3]); r4n = NB[6]; r4d = DB[5]; double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n; double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d; if (sign) return -C - P / Q; else return C + P / Q; } else if (ia < 0x40000000) { /* 1.25 <= |x| < 2.0. */ double a = fabs (x); a = a - 1.25; double r1 = fma (a, PC[1], PC[0]); double r2 = fma (a, PC[3], PC[2]); double r3 = fma (a, PC[5], PC[4]); double r4 = fma (a, PC[7], PC[6]); double r5 = fma (a, PC[9], PC[8]); double r6 = fma (a, PC[11], PC[10]); double r7 = fma (a, PC[13], PC[12]); double r8 = fma (a, PC[15], PC[14]); double a2 = a * a; double r = r8; r = fma (a2, r, r7); r = fma (a2, r, r6); r = fma (a2, r, r5); r = fma (a2, r, r4); r = fma (a2, r, r3); r = fma (a2, r, r2); r = fma (a2, r, r1); if (sign) return -1.0 + r; else return 1.0 - r; } else if (ia < 0x400a0000) { /* 2 <= |x| < 3.25. 
*/ double a = fabs (x); a = fma (0.5, a, -1.0); double r1 = fma (a, PD[1], PD[0]); double r2 = fma (a, PD[3], PD[2]); double r3 = fma (a, PD[5], PD[4]); double r4 = fma (a, PD[7], PD[6]); double r5 = fma (a, PD[9], PD[8]); double r6 = fma (a, PD[11], PD[10]); double r7 = fma (a, PD[13], PD[12]); double r8 = fma (a, PD[15], PD[14]); double r9 = fma (a, PD[17], PD[16]); double a2 = a * a; double r = r9; r = fma (a2, r, r8); r = fma (a2, r, r7); r = fma (a2, r, r6); r = fma (a2, r, r5); r = fma (a2, r, r4); r = fma (a2, r, r3); r = fma (a2, r, r2); r = fma (a2, r, r1); if (sign) return -1.0 + r; else return 1.0 - r; } else if (ia < 0x40100000) { /* 3.25 <= |x| < 4.0. */ double a = fabs (x); a = a - 3.25; double r1 = fma (a, PE[1], PE[0]); double r2 = fma (a, PE[3], PE[2]); double r3 = fma (a, PE[5], PE[4]); double r4 = fma (a, PE[7], PE[6]); double r5 = fma (a, PE[9], PE[8]); double r6 = fma (a, PE[11], PE[10]); double r7 = fma (a, PE[13], PE[12]); double a2 = a * a; double r = r7; r = fma (a2, r, r6); r = fma (a2, r, r5); r = fma (a2, r, r4); r = fma (a2, r, r3); r = fma (a2, r, r2); r = fma (a2, r, r1); if (sign) return -1.0 + r; else return 1.0 - r; } else if (ia < 0x4017a000) { /* 4 <= |x| < 5.90625. */ double a = fabs (x); a = fma (0.5, a, -2.0); double r1 = fma (a, PF[1], PF[0]); double r2 = fma (a, PF[3], PF[2]); double r3 = fma (a, PF[5], PF[4]); double r4 = fma (a, PF[7], PF[6]); double r5 = fma (a, PF[9], PF[8]); double r6 = fma (a, PF[11], PF[10]); double r7 = fma (a, PF[13], PF[12]); double r8 = fma (a, PF[15], PF[14]); double r9 = PF[16]; double a2 = a * a; double r = r9; r = fma (a2, r, r8); r = fma (a2, r, r7); r = fma (a2, r, r6); r = fma (a2, r, r5); r = fma (a2, r, r4); r = fma (a2, r, r3); r = fma (a2, r, r2); r = fma (a2, r, r1); if (sign) return -1.0 + r; else return 1.0 - r; } else { /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. 
*/ if (unlikely (ia >= 0x7ff00000)) return (double) (1.0 - (sign << 1)) + 1.0 / x; if (sign) return -1.0; else return 1.0; } } diff --git a/contrib/arm-optimized-routines/math/erf_data.c b/contrib/arm-optimized-routines/math/erf_data.c index 807875bdd7f5..10cf1fae93e0 100644 --- a/contrib/arm-optimized-routines/math/erf_data.c +++ b/contrib/arm-optimized-routines/math/erf_data.c @@ -1,85 +1,85 @@ /* * Shared data between erf and erfc. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" /* Minimax approximation of erf */ const struct erf_data __erf_data = { .erf_poly_A = { #if ERF_POLY_A_NCOEFFS == 10 0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4, -0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11, 0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20, -0x1.18c47fd143c5ep-23 #endif }, /* Rational approximation on [0x1p-28, 0.84375] */ .erf_ratio_N_A = { 0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6, -0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16 }, .erf_ratio_D_A = { 0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8, 0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18 }, /* Rational approximation on [0.84375, 1.25] */ .erf_ratio_N_B = { -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 }, .erf_ratio_D_B = { 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 }, .erfc_poly_C = { #if ERFC_POLY_C_NCOEFFS == 16 /* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */ 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, 
-0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19 #endif }, .erfc_poly_D = { #if ERFC_POLY_D_NCOEFFS == 18 /* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */ 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 #endif }, .erfc_poly_E = { #if ERFC_POLY_E_NCOEFFS == 14 /* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */ 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 #endif }, .erfc_poly_F = { #if ERFC_POLY_F_NCOEFFS == 17 /* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */ 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 #endif } }; diff --git a/contrib/arm-optimized-routines/math/erff.c b/contrib/arm-optimized-routines/math/erff.c index 
a58e82565dc3..9fa476dbbab2 100644 --- a/contrib/arm-optimized-routines/math/erff.c +++ b/contrib/arm-optimized-routines/math/erff.c @@ -1,104 +1,104 @@ /* * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A #define B __erff_data.erff_poly_B /* Top 12 bits of a float. */ static inline uint32_t top12 (float x) { return asuint (x) >> 20; } /* Efficient implementation of erff using either a pure polynomial approximation or the exponential of a polynomial. Worst-case error is 1.09ulps at 0x1.c111acp-1. */ float erff (float x) { float r, x2, u; /* Get top word. */ uint32_t ix = asuint (x); uint32_t sign = ix >> 31; uint32_t ia12 = top12 (x) & 0x7ff; /* Limit of both intervals is 0.875 for performance reasons but coefficients computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy from 0.94 to 1.1ulps. */ if (ia12 < 0x3f6) { /* a = |x| < 0.875. */ /* Tiny and subnormal cases. */ if (unlikely (ia12 < 0x318)) { /* |x| < 2^(-28). */ if (unlikely (ia12 < 0x040)) { /* |x| < 2^(-119). */ float y = fmaf (TwoOverSqrtPiMinusOne, x, x); return check_uflowf (y); } return x + TwoOverSqrtPiMinusOne * x; } x2 = x * x; /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */ r = A[5]; r = fmaf (r, x2, A[4]); r = fmaf (r, x2, A[3]); r = fmaf (r, x2, A[2]); r = fmaf (r, x2, A[1]); r = fmaf (r, x2, A[0]); r = fmaf (r, x, x); } else if (ia12 < 0x408) { /* |x| < 4.0 - Use a custom Estrin scheme. */ float a = fabsf (x); /* Start with Estrin scheme on high order (small magnitude) coefficients. */ r = fmaf (B[6], a, B[5]); u = fmaf (B[4], a, B[3]); x2 = x * x; r = fmaf (r, x2, u); /* Then switch to pure Horner scheme. 
*/ r = fmaf (r, a, B[2]); r = fmaf (r, a, B[1]); r = fmaf (r, a, B[0]); r = fmaf (r, a, a); /* Single precision exponential with ~0.5ulps, ensures erff has max. rel. error < 1ulp on [0.921875, 4.0], < 1.1ulps on [0.875, 4.0]. */ r = expf (-r); /* Explicit copysign (calling copysignf increases latency). */ if (sign) r = -1.0f + r; else r = 1.0f - r; } else { /* |x| >= 4.0. */ /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ if (unlikely (ia12 >= 0x7f8)) return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; /* Explicit copysign (calling copysignf increases latency). */ if (sign) r = -1.0f; else r = 1.0f; } return r; } diff --git a/contrib/arm-optimized-routines/math/erff_data.c b/contrib/arm-optimized-routines/math/erff_data.c index fa6b1ef4dedb..f822788d0dd8 100644 --- a/contrib/arm-optimized-routines/math/erff_data.c +++ b/contrib/arm-optimized-routines/math/erff_data.c @@ -1,22 +1,22 @@ /* * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" /* Minimax approximation of erff. */ const struct erff_data __erff_data = { .erff_poly_A = { 0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f }, .erff_poly_B = { 0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f } }; diff --git a/contrib/arm-optimized-routines/math/exp.c b/contrib/arm-optimized-routines/math/exp.c index 7f5024cd8792..1de500c31f3e 100644 --- a/contrib/arm-optimized-routines/math/exp.c +++ b/contrib/arm-optimized-routines/math/exp.c @@ -1,176 +1,176 @@ /* * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" #define N (1 << EXP_TABLE_BITS) #define InvLn2N __exp_data.invln2N #define NegLn2hiN __exp_data.negln2hiN #define NegLn2loN __exp_data.negln2loN #define Shift __exp_data.shift #define T __exp_data.tab #define C2 __exp_data.poly[5 - EXP_POLY_ORDER] #define C3 __exp_data.poly[6 - EXP_POLY_ORDER] #define C4 __exp_data.poly[7 - EXP_POLY_ORDER] #define C5 __exp_data.poly[8 - EXP_POLY_ORDER] #define C6 __exp_data.poly[9 - EXP_POLY_ORDER] /* Handle cases that may overflow or underflow when computing the result that is scale*(1+TMP) without intermediate rounding. The bit representation of scale is in SBITS, however it has a computed exponent that may have overflown into the sign bit so that needs to be adjusted before using it as a double. (int32_t)KI is the k used in the argument reduction and exponent adjustment of scale, positive k here means the result may overflow and negative k means the result may underflow. */ static inline double specialcase (double_t tmp, uint64_t sbits, uint64_t ki) { double_t scale, y; if ((ki & 0x80000000) == 0) { /* k > 0, the exponent of scale might have overflowed by <= 460. */ sbits -= 1009ull << 52; scale = asdouble (sbits); y = 0x1p1009 * (scale + scale * tmp); return check_oflow (eval_as_double (y)); } /* k < 0, need special care in the subnormal range. */ sbits += 1022ull << 52; scale = asdouble (sbits); y = scale + scale * tmp; if (y < 1.0) { /* Round y to the right precision before scaling it into the subnormal range to avoid double rounding that can cause 0.5+E/2 ulp error where E is the worst-case ulp error outside the subnormal range. So this is only useful if the goal is better than 1 ulp worst-case error. */ double_t hi, lo; lo = scale - y + scale * tmp; hi = 1.0 + y; lo = 1.0 - hi + y + lo; y = eval_as_double (hi + lo) - 1.0; /* Avoid -0.0 with downward rounding. 
*/ if (WANT_ROUNDING && y == 0.0) y = 0.0; /* The underflow exception needs to be signaled explicitly. */ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); } y = 0x1p-1022 * y; return check_uflow (eval_as_double (y)); } /* Top 12 bits of a double (sign and exponent bits). */ static inline uint32_t top12 (double x) { return asuint64 (x) >> 52; } /* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. If hastail is 0 then xtail is assumed to be 0 too. */ static inline double exp_inline (double x, double xtail, int hastail) { uint32_t abstop; uint64_t ki, idx, top, sbits; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, z, r, r2, scale, tail, tmp; abstop = top12 (x) & 0x7ff; if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) { if (abstop - top12 (0x1p-54) >= 0x80000000) /* Avoid spurious underflow for tiny x. */ /* Note: 0 is common input. */ return WANT_ROUNDING ? 1.0 + x : 1.0; if (abstop >= top12 (1024.0)) { if (asuint64 (x) == asuint64 (-INFINITY)) return 0.0; if (abstop >= top12 (INFINITY)) return 1.0 + x; if (asuint64 (x) >> 63) return __math_uflow (0); else return __math_oflow (0); } /* Large x is special cased below. */ abstop = 0; } /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ z = InvLn2N * x; #if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); #elif EXP_USE_TOINT_NARROW /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ kd = eval_as_double (z + Shift); ki = asuint64 (kd) >> 16; kd = (double_t) (int32_t) ki; #else /* z - kd is in [-1, 1] in non-nearest rounding modes. */ kd = eval_as_double (z + Shift); ki = asuint64 (kd); kd -= Shift; #endif r = x + kd * NegLn2hiN + kd * NegLn2loN; /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ if (hastail) r += xtail; /* 2^(k/N) ~= scale * (1 + tail). 
*/ idx = 2 * (ki % N); top = ki << (52 - EXP_TABLE_BITS); tail = asdouble (T[idx]); /* This is only a valid scale when -1023*N < k < 1024*N. */ sbits = T[idx + 1] + top; /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ /* Evaluation is optimized assuming superscalar pipelined execution. */ r2 = r * r; /* Without fma the worst case error is 0.25/N ulp larger. */ /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */ #if EXP_POLY_ORDER == 4 tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4); #elif EXP_POLY_ORDER == 5 tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); #elif EXP_POLY_ORDER == 6 tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); #endif if (unlikely (abstop == 0)) return specialcase (tmp, sbits, ki); scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there is no spurious underflow here even without fma. */ return eval_as_double (scale + scale * tmp); } double exp (double x) { return exp_inline (x, 0, 0); } /* May be useful for implementing pow where more than double precision input is needed. */ double __exp_dd (double x, double xtail) { return exp_inline (x, xtail, 1); } #if USE_GLIBC_ABI strong_alias (exp, __exp_finite) hidden_alias (exp, __ieee754_exp) hidden_alias (__exp_dd, __exp1) # if LDBL_MANT_DIG == 53 long double expl (long double x) { return exp (x); } # endif #endif diff --git a/contrib/arm-optimized-routines/math/exp2.c b/contrib/arm-optimized-routines/math/exp2.c index 35ab39f22ed5..a1eee44f1f48 100644 --- a/contrib/arm-optimized-routines/math/exp2.c +++ b/contrib/arm-optimized-routines/math/exp2.c @@ -1,143 +1,143 @@ /* * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" #define N (1 << EXP_TABLE_BITS) #define Shift __exp_data.exp2_shift #define T __exp_data.tab #define C1 __exp_data.exp2_poly[0] #define C2 __exp_data.exp2_poly[1] #define C3 __exp_data.exp2_poly[2] #define C4 __exp_data.exp2_poly[3] #define C5 __exp_data.exp2_poly[4] #define C6 __exp_data.exp2_poly[5] /* Handle cases that may overflow or underflow when computing the result that is scale*(1+TMP) without intermediate rounding. The bit representation of scale is in SBITS, however it has a computed exponent that may have overflown into the sign bit so that needs to be adjusted before using it as a double. (int32_t)KI is the k used in the argument reduction and exponent adjustment of scale, positive k here means the result may overflow and negative k means the result may underflow. */ static inline double specialcase (double_t tmp, uint64_t sbits, uint64_t ki) { double_t scale, y; if ((ki & 0x80000000) == 0) { /* k > 0, the exponent of scale might have overflowed by 1. */ sbits -= 1ull << 52; scale = asdouble (sbits); y = 2 * (scale + scale * tmp); return check_oflow (eval_as_double (y)); } /* k < 0, need special care in the subnormal range. */ sbits += 1022ull << 52; scale = asdouble (sbits); y = scale + scale * tmp; if (y < 1.0) { /* Round y to the right precision before scaling it into the subnormal range to avoid double rounding that can cause 0.5+E/2 ulp error where E is the worst-case ulp error outside the subnormal range. So this is only useful if the goal is better than 1 ulp worst-case error. */ double_t hi, lo; lo = scale - y + scale * tmp; hi = 1.0 + y; lo = 1.0 - hi + y + lo; y = eval_as_double (hi + lo) - 1.0; /* Avoid -0.0 with downward rounding. */ if (WANT_ROUNDING && y == 0.0) y = 0.0; /* The underflow exception needs to be signaled explicitly. 
*/ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); } y = 0x1p-1022 * y; return check_uflow (eval_as_double (y)); } /* Top 12 bits of a double (sign and exponent bits). */ static inline uint32_t top12 (double x) { return asuint64 (x) >> 52; } double exp2 (double x) { uint32_t abstop; uint64_t ki, idx, top, sbits; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, r, r2, scale, tail, tmp; abstop = top12 (x) & 0x7ff; if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) { if (abstop - top12 (0x1p-54) >= 0x80000000) /* Avoid spurious underflow for tiny x. */ /* Note: 0 is common input. */ return WANT_ROUNDING ? 1.0 + x : 1.0; if (abstop >= top12 (1024.0)) { if (asuint64 (x) == asuint64 (-INFINITY)) return 0.0; if (abstop >= top12 (INFINITY)) return 1.0 + x; if (!(asuint64 (x) >> 63)) return __math_oflow (0); else if (asuint64 (x) >= asuint64 (-1075.0)) return __math_uflow (0); } if (2 * asuint64 (x) > 2 * asuint64 (928.0)) /* Large x is special cased below. */ abstop = 0; } /* exp2(x) = 2^(k/N) * 2^r, with 2^r in [2^(-1/2N),2^(1/2N)]. */ /* x = k/N + r, with int k and r in [-1/2N, 1/2N]. */ kd = eval_as_double (x + Shift); ki = asuint64 (kd); /* k. */ kd -= Shift; /* k/N for int k. */ r = x - kd; /* 2^(k/N) ~= scale * (1 + tail). */ idx = 2 * (ki % N); top = ki << (52 - EXP_TABLE_BITS); tail = asdouble (T[idx]); /* This is only a valid scale when -1023*N < k < 1024*N. */ sbits = T[idx + 1] + top; /* exp2(x) = 2^(k/N) * 2^r ~= scale + scale * (tail + 2^r - 1). */ /* Evaluation is optimized assuming superscalar pipelined execution. */ r2 = r * r; /* Without fma the worst case error is 0.5/N ulp larger. */ /* Worst case error is less than 0.5+0.86/N+(abs poly error * 2^53) ulp. 
*/ #if EXP2_POLY_ORDER == 4 tmp = tail + r * C1 + r2 * C2 + r * r2 * (C3 + r * C4); #elif EXP2_POLY_ORDER == 5 tmp = tail + r * C1 + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); #elif EXP2_POLY_ORDER == 6 tmp = tail + r * C1 + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); #endif if (unlikely (abstop == 0)) return specialcase (tmp, sbits, ki); scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-65 and scale > 2^-928, so there is no spurious underflow here even without fma. */ return eval_as_double (scale + scale * tmp); } #if USE_GLIBC_ABI strong_alias (exp2, __exp2_finite) hidden_alias (exp2, __ieee754_exp2) # if LDBL_MANT_DIG == 53 long double exp2l (long double x) { return exp2 (x); } # endif #endif diff --git a/contrib/arm-optimized-routines/math/exp2f.c b/contrib/arm-optimized-routines/math/exp2f.c index 94b32538aa0d..776c3ddf7663 100644 --- a/contrib/arm-optimized-routines/math/exp2f.c +++ b/contrib/arm-optimized-routines/math/exp2f.c @@ -1,80 +1,80 @@ /* * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* EXP2F_TABLE_BITS = 5 EXP2F_POLY_ORDER = 3 ULP error: 0.502 (nearest rounding.) Relative error: 1.69 * 2^-34 in [-1/64, 1/64] (before rounding.) Wrong count: 168353 (all nearest rounding wrong results with fma.) Non-nearest ULP error: 1 (rounded ULP error) */ #define N (1 << EXP2F_TABLE_BITS) #define T __exp2f_data.tab #define C __exp2f_data.poly #define SHIFT __exp2f_data.shift_scaled static inline uint32_t top12 (float x) { return asuint (x) >> 20; } float exp2f (float x) { uint32_t abstop; uint64_t ki, t; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, xd, z, r, r2, y, s; xd = (double_t) x; abstop = top12 (x) & 0x7ff; if (unlikely (abstop >= top12 (128.0f))) { /* |x| >= 128 or x is nan. 
*/ if (asuint (x) == asuint (-INFINITY)) return 0.0f; if (abstop >= top12 (INFINITY)) return x + x; if (x > 0.0f) return __math_oflowf (0); if (x <= -150.0f) return __math_uflowf (0); #if WANT_ERRNO_UFLOW if (x < -149.0f) return __math_may_uflowf (0); #endif } /* x = k/N + r with r in [-1/(2N), 1/(2N)] and int k. */ kd = eval_as_double (xd + SHIFT); ki = asuint64 (kd); kd -= SHIFT; /* k/N for int k. */ r = xd - kd; /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ t = T[ki % N]; t += ki << (52 - EXP2F_TABLE_BITS); s = asdouble (t); z = C[0] * r + C[1]; r2 = r * r; y = C[2] * r + 1; y = z * r2 + y; y = y * s; return eval_as_float (y); } #if USE_GLIBC_ABI strong_alias (exp2f, __exp2f_finite) hidden_alias (exp2f, __ieee754_exp2f) #endif diff --git a/contrib/arm-optimized-routines/math/exp2f_data.c b/contrib/arm-optimized-routines/math/exp2f_data.c index 3fb0ad11b15a..f0cb7fccacd1 100644 --- a/contrib/arm-optimized-routines/math/exp2f_data.c +++ b/contrib/arm-optimized-routines/math/exp2f_data.c @@ -1,78 +1,78 @@ /* * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << EXP2F_TABLE_BITS) const struct exp2f_data __exp2f_data = { /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) used for computing 2^(k/N) for an int |k| < 150 N as double(tab[k%N] + (k << 52-BITS)) */ .tab = { #if N == 8 0x3ff0000000000000, 0x3fef72b83c7d517b, 0x3fef06fe0a31b715, 0x3feebfdad5362a27, 0x3feea09e667f3bcd, 0x3feeace5422aa0db, 0x3feee89f995ad3ad, 0x3fef5818dcfba487, #elif N == 16 0x3ff0000000000000, 0x3fefb5586cf9890f, 0x3fef72b83c7d517b, 0x3fef387a6e756238, 0x3fef06fe0a31b715, 0x3feedea64c123422, 0x3feebfdad5362a27, 0x3feeab07dd485429, 0x3feea09e667f3bcd, 0x3feea11473eb0187, 0x3feeace5422aa0db, 0x3feec49182a3f090, 0x3feee89f995ad3ad, 0x3fef199bdd85529c, 0x3fef5818dcfba487, 0x3fefa4afa2a490da, #elif N == 32 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, #elif N == 64 0x3ff0000000000000, 0x3fefec9a3e778061, 0x3fefd9b0d3158574, 0x3fefc74518759bc8, 0x3fefb5586cf9890f, 0x3fefa3ec32d3d1a2, 0x3fef9301d0125b51, 0x3fef829aaea92de0, 0x3fef72b83c7d517b, 0x3fef635beb6fcb75, 0x3fef54873168b9aa, 0x3fef463b88628cd6, 0x3fef387a6e756238, 0x3fef2b4565e27cdd, 0x3fef1e9df51fdee1, 0x3fef1285a6e4030b, 0x3fef06fe0a31b715, 0x3feefc08b26416ff, 0x3feef1a7373aa9cb, 0x3feee7db34e59ff7, 0x3feedea64c123422, 0x3feed60a21f72e2a, 0x3feece086061892d, 0x3feec6a2b5c13cd0, 
0x3feebfdad5362a27, 0x3feeb9b2769d2ca7, 0x3feeb42b569d4f82, 0x3feeaf4736b527da, 0x3feeab07dd485429, 0x3feea76f15ad2148, 0x3feea47eb03a5585, 0x3feea23882552225, 0x3feea09e667f3bcd, 0x3fee9fb23c651a2f, 0x3fee9f75e8ec5f74, 0x3fee9feb564267c9, 0x3feea11473eb0187, 0x3feea2f336cf4e62, 0x3feea589994cce13, 0x3feea8d99b4492ed, 0x3feeace5422aa0db, 0x3feeb1ae99157736, 0x3feeb737b0cdc5e5, 0x3feebd829fde4e50, 0x3feec49182a3f090, 0x3feecc667b5de565, 0x3feed503b23e255d, 0x3feede6b5579fdbf, 0x3feee89f995ad3ad, 0x3feef3a2b84f15fb, 0x3feeff76f2fb5e47, 0x3fef0c1e904bc1d2, 0x3fef199bdd85529c, 0x3fef27f12e57d14b, 0x3fef3720dcef9069, 0x3fef472d4a07897c, 0x3fef5818dcfba487, 0x3fef69e603db3285, 0x3fef7c97337b9b5f, 0x3fef902ee78b3ff6, 0x3fefa4afa2a490da, 0x3fefba1bee615a27, 0x3fefd0765b6e4540, 0x3fefe7c1819e90d8, #endif }, .shift_scaled = 0x1.8p+52 / N, .poly = { #if N == 8 0x1.c6a00335106e2p-5, 0x1.ec0c313449f55p-3, 0x1.62e431111f69fp-1, #elif N == 16 0x1.c6ac6aa313963p-5, 0x1.ebfff4532d9bap-3, 0x1.62e43001bc49fp-1, #elif N == 32 0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1, #elif N == 64 0x1.c6b04b4221b2ap-5, 0x1.ebfc213e184d7p-3, 0x1.62e42fefb5b7fp-1, #endif }, .shift = 0x1.8p+52, .invln2_scaled = 0x1.71547652b82fep+0 * N, .poly_scaled = { #if N == 8 0x1.c6a00335106e2p-5/N/N/N, 0x1.ec0c313449f55p-3/N/N, 0x1.62e431111f69fp-1/N, #elif N == 16 0x1.c6ac6aa313963p-5/N/N/N, 0x1.ebfff4532d9bap-3/N/N, 0x1.62e43001bc49fp-1/N, #elif N == 32 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, #elif N == 64 0x1.c6b04b4221b2ap-5/N/N/N, 0x1.ebfc213e184d7p-3/N/N, 0x1.62e42fefb5b7fp-1/N, #endif }, }; diff --git a/contrib/arm-optimized-routines/math/exp_data.c b/contrib/arm-optimized-routines/math/exp_data.c index cba76832566f..714c845709aa 100644 --- a/contrib/arm-optimized-routines/math/exp_data.c +++ b/contrib/arm-optimized-routines/math/exp_data.c @@ -1,1120 +1,1120 @@ /* * Shared data between exp, exp2 and pow. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << EXP_TABLE_BITS) const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, .negln2loN = -0x1.cf79abc9e3b3ap-46, #elif N == 128 .negln2hiN = -0x1.62e42fefa0000p-8, .negln2loN = -0x1.cf79abc9e3b3ap-47, #elif N == 256 .negln2hiN = -0x1.62e42fefc0000p-9, .negln2loN = 0x1.c610ca86c3899p-45, #elif N == 512 .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, #else .shift = 0x1.8p52, #endif // exp polynomial coefficients. .poly = { #if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE // abs error: 1.5543*2^-60 // ulp error: 0.529 (0.533 without fma) // if |x| < ln2/128+eps // abs error if |x| < ln2/64: 1.7157*2^-50 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, 0x1.1111266d28935p-7, #elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE // abs error: 1.6735*2^-64 // ulp error: 0.518 (0.522 without fma) // if |x| < ln2/64 0x1.5555555548f9ap-3, 0x1.555555554bf5dp-5, 0x1.11115b75f0f4dp-7, 0x1.6c171a6b6303ep-10, #elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE // abs error: 1.555*2^-66 // ulp error: 0.509 (0.511 without fma) // if |x| < ln2/256+eps // abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 // abs error if |x| < ln2/128: 1.7145*2^-56 0x1.ffffffffffdbdp-2, 0x1.555555555543cp-3, 0x1.55555cf172b91p-5, 0x1.1111167a4d017p-7, #elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE // abs error: 1.5542*2^-60 // ulp error: 0.521 (0.523 without fma) // if |x| < ln2/128 0x1.fffffffffdbcep-2, 0x1.55555555543c2p-3, 0x1.555573c64f2e3p-5, 0x1.111126b4eff73p-7, #elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE // abs error: 1.6861*2^-71 // ulp error: 0.509 (0.511 without fma) // if |x| < ln2/128 
0x1.55555555548fdp-3, 0x1.555555555658fp-5, 0x1.111123a859bb6p-7, 0x1.6c16ba6920cabp-10, #elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE // abs error: 1.43*2^-58 // ulp error: 0.549 (0.550 without fma) // if |x| < ln2/512 0x1p0, // unused 0x1.fffffffffffd4p-2, 0x1.5555571d6ef9p-3, 0x1.5555576a5adcep-5, #elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE // abs error: 1.5547*2^-66 // ulp error: 0.505 (0.506 without fma) // if |x| < ln2/256 0x1.ffffffffffdbdp-2, 0x1.555555555543cp-3, 0x1.55555cf16e1edp-5, 0x1.1111167a4b553p-7, #elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE // abs error: 1.4300*2^-63 // ulp error: 0.504 // if |x| < ln2/1024 // abs error if |x| < ln2/512: 1.0689*2^-55 0x1p0, // unused 0x1.ffffffffffffdp-2, 0x1.555555c75bb6p-3, 0x1.555555dec04a8p-5, #endif }, .exp2_shift = 0x1.8p52 / N, // exp2 polynomial coefficients. .exp2_poly = { #if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE // abs error: 1.3054*2^-63 // ulp error: 0.515 // if |x| < 1/64 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c58fp-3, 0x1.c6b08d7045cf1p-5, 0x1.3b2ab6fb8fd0ep-7, 0x1.5d884afec48d7p-10, 0x1.43097dc684ae1p-13, #elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE // abs error: 1.2195*2^-65 // ulp error: 0.507 (0.511 without fma) // if |x| < 1/256 // abs error if |x| < 1/128: 1.9941*2^-56 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c424p-3, 0x1.c6b08d70cf4b5p-5, 0x1.3b2abd24650ccp-7, 0x1.5d7e09b4e3a84p-10, #elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE // abs error: 1.2195*2^-65 // ulp error: 0.504 (0.508 without fma) // if |x| < 1/256 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c424p-3, 0x1.c6b08d70cf4b5p-5, 0x1.3b2abd24650ccp-7, 0x1.5d7e09b4e3a84p-10, #elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE // abs error: 1.4411*2^-64 // ulp error: 0.5024 (0.5063 without fma) // if |x| < 1/1024 // abs error if |x| < 1/512: 1.9430*2^-56 0x1.62e42fefa39ecp-1, 0x1.ebfbdff82c58bp-3, 0x1.c6b08e46de41fp-5, 0x1.3b2ab786ee1dap-7, #endif }, // 2^(k/N) ~= H[k]*(1 + T[k]) for 
int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N .tab = { #if N == 64 0x0, 0x3ff0000000000000, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0xbc93cedd78565858, 0x3feea23882552225, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0xbc87c50422622263, 0x3feecc667b5de565, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c8469846e735ab3, 
0x3feede6b5579fdbf, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, #elif N == 128 0x0, 0x3ff0000000000000, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c864201e2ac744c, 
0x3fef0170fc4cd831, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc93cedd78565858, 0x3feea23882552225, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc93091fa71e3d83, 
0x3feeb45b0b91ffc6, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0xbc82919e2040220f, 0x3fef60e316c98398, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, #elif N == 256 0x0, 0x3ff0000000000000, 0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0xbc82985dd8521d32, 0x3feff168143b0281, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c651e617061bfbd, 
0x3fefe7d42e11bbcc, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c845fad437fa426, 0x3fefde5f72f654b1, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0xbc954529642b232f, 0x3fefd50a0e3c1f89, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0x3c8293708ef5c32e, 0x3fefcbd42b72a836, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c9407fb30d06420, 0x3fefb0f145e46c85, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0x3c8b8268b04ef0a5, 0x3fef973028d7233e, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc65704e90c9f860, 0x3fef86a814f204ab, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc897cea57e46280, 0x3fef7e95934f312e, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c6e653b2459034b, 0x3fef6ed48695bbc0, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c92cc7ea345b7dc, 0x3fef672658375d2f, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c8cdc1873af2155, 0x3fef582f95281c6b, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc9493684653a131, 0x3fef50e75eb44027, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0xbc98e2899077520a, 0x3fef49c18438ce4d, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c9120fcd4f59273, 0x3fef42be3578a819, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c89b788c188c9b8, 0x3fef3bdda27912d1, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c877afbca90ef84, 0x3fef351ffb82140a, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c91512f082876ee, 0x3fef2e85711ece75, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0xbc803297e78260bf, 
0x3fef21ba7591bb70, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0xbc91e75c40b4251e, 0x3fef157e39771b2f, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c98a911f1f7785a, 0x3fef0f961f641589, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c85425c11faadf4, 0x3fef0432edeeb2fd, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0xbc979517a03e2847, 0x3feefeb83ba8ea32, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc800e2a46da4bee, 0x3feef96266e3fa2d, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0xbc87430803972b34, 0x3feef431a2de883b, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc954de30ae02d94, 0x3feeef26231e754a, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0xbc94f2487e1c03ec, 0x3feeea401b7140ef, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c79c3bba5562a2f, 0x3feee0e544ede173, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc85a71612e21658, 0x3feedc70df1c5175, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0x3c86421f6f1d24d6, 0x3feed822c367a024, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc9348a6815fce65, 0x3feed3fb2709468a, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0x3c835c43984d9871, 0x3feecffa3f84b9d4, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0xbc632afc8d9473a0, 0x3feecc2042a7d232, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0xbc95fc5e44de020e, 0x3feec86d668b3237, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0xbc7ea0148327c42f, 0x3feec4e1e192aed2, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0xbc7a843ad1a88022, 0x3feec17dea6db7d7, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c892ca3bf144e63, 0x3feebe41b817c114, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0x3c73e34f67e67118, 0x3feeb8417f4531ee, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0x3c81bd2888075068, 
0x3feeb2e2f4f6ad27, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0xbc896be8ae89ef8f, 0x3feeb070dde910d2, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0xbc943a3540d1898a, 0x3feeaa11fba87a03, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0xbc951f58ddaa8090, 0x3feea84590998b93, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc82e1648e50a17c, 0x3feea6a320dceb71, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0x3c917ecda8a72159, 0x3feea3dd1d1929fd, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, 0xbc93cedd78565858, 0x3feea23882552225, 0xbc85c33fdf910406, 0x3feea1c1c70833f6, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0x3c81079ab5789604, 0x3feea0f4b19e9538, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0x3c727df161cd7778, 0x3feea052fa75173e, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0x3c93f9924a05b767, 0x3fee9fdcddd47645, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc87557939a8b5ef, 0x3fee9f9298593ae5, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc88e67a9006c909, 0x3fee9f8286ead08a, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0x3c87d51410fd15c2, 0x3feea26a62ff86f0, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc760a3629969871, 0x3feea3878491c491, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0xbc522cea4f3afa1e, 
0x3feea7f4179f5b21, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c7c88549b958471, 0x3feea9cad931a436, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c931143962f7877, 0x3feeabd0a478580f, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c93e9e96f112479, 0x3feeae05bad61778, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0xbc51669428996971, 0x3feebbdd9a7670b3, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0xbc8a1e58414c07d3, 0x3feec674194bb8d5, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0xbc79740b58a20091, 0x3feeca5e8d07f29e, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0xbc903d5cbe27874b, 0x3feed2c980460ad8, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c5986178980fce0, 0x3feed74a8af46052, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0xbc89472975b1f2a5, 0x3feedbfe53c12e59, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c7d8157a34b7e7f, 0x3feee0e521356eba, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc7274aedac8ff80, 0x3feef68415b749b1, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0x3c85c620ce76df06, 0x3feefc6e29f1c52a, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc8fda52e1b51e41, 0x3fef028cf22749e4, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc302899507554e5, 
0x3fef0f69c3f3a207, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0xbc80dda2d4c0010c, 0x3fef16286141b33d, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0x3c836909391181d3, 0x3fef244778fafb22, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0xbc811cd7dbdf9547, 0x3fef2ba88988c933, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0xbc7ac28b7bef6621, 0x3fef33405751c4db, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0xbc8cc734592af7fc, 0x3fef43155b5bab74, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c87752a44f587e8, 0x3fef4b532b08c968, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, 0xbc82919e2040220f, 0x3fef60e316c98398, 0x3c8c254d16117a68, 0x3fef655d71ff6075, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc776caa4c2ff1cf, 0x3fef953924676d76, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc81d5fc525d9940, 0x3fef9f7977cdb740, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0xbc69fa74878ba7c7, 0x3fefcac948dd7274, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c901f3a75ee0efe, 0x3fefd632798844f8, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0xbc699c7db2effc76, 0x3fefedba3692d514, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, 0x3c64b458677f9840, 
0x3feff9d96b2a23d9, #elif N == 512 0x0, 0x3ff0000000000000, 0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a, 0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, 0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0x3c75c18e5ae0563a, 0x3feff3d1e77170b4, 0xbc82985dd8521d32, 0x3feff168143b0281, 0xbc705b1125cf49a5, 0x3fefef003103b10e, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c9f879abbff3f87, 0x3fefea363d42b027, 0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, 0x3c9b14003824712a, 0x3fefe57411915a8a, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c61cbf0f38af658, 0x3fefe0b9b35659d8, 0x3c845fad437fa426, 0x3fefde5f72f654b1, 0xbc9a3316383dcbc5, 0x3fefdc0727fc1762, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2, 0xbc954529642b232f, 0x3fefd50a0e3c1f89, 0xbc89b3236d111646, 0x3fefd2b99fa6407c, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0xbc8cb191be99b1b0, 0x3fefce1ead925493, 0x3c8293708ef5c32e, 0x3fefcbd42b72a836, 0xbc9acb71e83765b7, 0x3fefc98ba42e7d30, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c5cd3e58b03697e, 0x3fefc50088f8093f, 0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, 0xbc8bfb07d4755452, 0x3fefc07d61701716, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715, 0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, 0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0xbc85b9eb0402507b, 0x3fefb323d833d93f, 0x3c9407fb30d06420, 0x3fefb0f145e46c85, 0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba, 0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, 0x3c9c8be44bf4cde8, 0x3fefa612a7b26300, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0x3c820c5444c93c44, 0x3fefa1c7c55189c6, 0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, 0xbc84c6baeb580d7a, 0x3fef9d8503328e6d, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0x3c8657aa1b0d9f83, 0x3fef994a66f951ce, 0x3c8b8268b04ef0a5, 0x3fef973028d7233e, 0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc6b0b2789925e90, 
0x3fef90edb6db2dc1, 0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, 0xbc93aad17d197fae, 0x3fef8ccbae51a5c8, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc989c464a07ad70, 0x3fef88b1e264a0e9, 0xbc65704e90c9f860, 0x3fef86a814f204ab, 0xbc72c338fce197f4, 0x3fef84a058cbae1e, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc6dca724cea0eb6, 0x3fef809717425438, 0xbc897cea57e46280, 0x3fef7e95934f312e, 0x3c464770b955d34d, 0x3fef7c962388149e, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0xbc962811c114424f, 0x3fef789d83606e12, 0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, 0x3c8ec58e74904dd4, 0x3fef74ad3c92df73, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89, 0x3c6e653b2459034b, 0x3fef6ed48695bbc0, 0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c8bda920de0f6e2, 0x3fef690eba4df41f, 0x3c92cc7ea345b7dc, 0x3fef672658375d2f, 0xbc9a597f9a5ff71c, 0x3fef654013041dc2, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c50835b125aa573, 0x3fef6179e2363cf8, 0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, 0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f, 0x3c8cdc1873af2155, 0x3fef582f95281c6b, 0xbc6817fd6a313e3e, 0x3fef565a51860746, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc96236af85fd26a, 0x3fef52b6358e15e8, 0xbc9493684653a131, 0x3fef50e75eb44027, 0x3c7795eb4523abe7, 0x3fef4f1aad999e82, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0x3c8fe58b91b40095, 0x3fef4b87bf9cda38, 0xbc98e2899077520a, 0x3fef49c18438ce4d, 0x3c91ecaa860c614a, 0x3fef47fd7190241e, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18, 0x3c9120fcd4f59273, 0x3fef42be3578a819, 0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c87f1c7350e256d, 0x3fef3d9282fc1f27, 0x3c89b788c188c9b8, 0x3fef3bdda27912d1, 0x3c420dac6c124f4f, 0x3fef3a2af0b63bff, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0xbc99501d09bc09fd, 0x3fef36cc1c78903a, 0x3c877afbca90ef84, 0x3fef351ffb82140a, 0x3c73baf864dc8675, 
0x3fef33760c547f15, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff, 0x3c91512f082876ee, 0x3fef2e85711ece75, 0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0xbc7548165d85ed32, 0x3fef29a8b16f0a30, 0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, 0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0xbc93a255f697ecfe, 0x3fef234c0ea83f36, 0xbc803297e78260bf, 0x3fef21ba7591bb70, 0x3c8d2d19edc1e550, 0x3fef202b17779965, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc76b2173113dd8c, 0x3fef1d130f50d65c, 0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, 0x3c811aa5f853590b, 0x3fef1a03fc675d1f, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0x3c61d61a34c8aa02, 0x3fef16fde4f2e280, 0xbc91e75c40b4251e, 0x3fef157e39771b2f, 0xbc91f892bf6b286d, 0x3fef1400cf2f6c18, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c7590c65c20e680, 0x3fef110cc15d5346, 0x3c98a911f1f7785a, 0x3fef0f961f641589, 0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2, 0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 0x3c7b3bf786a54a87, 0x3fef08670653dfe4, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c74bb6c41732885, 0x3fef05975721b004, 0x3c85425c11faadf4, 0x3fef0432edeeb2fd, 0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0xbc5451d60c6ac9eb, 0x3fef001375752b40, 0xbc979517a03e2847, 0x3feefeb83ba8ea32, 0x3c8787a210ceafd9, 0x3feefd5f4fb45e20, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc888d1e4629943d, 0x3feefab46484ebb4, 0xbc800e2a46da4bee, 0x3feef96266e3fa2d, 0xbc93369c544088b6, 0x3feef812ba4ea77d, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0x3c85373ce4eb6dfb, 0x3feef57a577dd72b, 0xbc87430803972b34, 0x3feef431a2de883b, 0x3c83adec8265a67f, 0x3feef2eb428335b4, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc835388bcac6bc5, 0x3feef06581d3f669, 0xbc954de30ae02d94, 0x3feeef26231e754a, 0x3c727cdb4e4b6640, 0x3feeede91be9c811, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0x3c86c2696a26af35, 
0x3feeeb761742d808, 0xbc94f2487e1c03ec, 0x3feeea401b7140ef, 0x3c888f6ff06b979a, 0x3feee90c7a61d55b, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea, 0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, 0xbc76b8867f91c9d6, 0x3feee4559212ef89, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c94c9c0b5157fe6, 0x3feee20853c10f28, 0x3c79c3bba5562a2f, 0x3feee0e544ede173, 0xbc62455345b51c8e, 0x3feedfc4976d27fa, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc93331de45477d0, 0x3feedd8a63b0a09b, 0xbc85a71612e21658, 0x3feedc70df1c5175, 0xbc95f84d39b39b16, 0x3feedb59bf29743f, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0xbc72ba4dc7c4d562, 0x3feed932b07a35df, 0x3c86421f6f1d24d6, 0x3feed822c367a024, 0xbc844f25dc02691f, 0x3feed7153e4a136a, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc888d328eb9b501, 0x3feed5016f44d8f5, 0xbc9348a6815fce65, 0x3feed3fb2709468a, 0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0xbc615f0a2b9cd452, 0x3feed0f6d5817663, 0x3c835c43984d9871, 0x3feecffa3f84b9d4, 0xbc8c2e465a919e1d, 0x3feecf0018321a1a, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0xbc865dfd02bd08f1, 0x3feecd1318eb43ec, 0xbc632afc8d9473a0, 0x3feecc2042a7d232, 0xbc8e68cec89b1762, 0x3feecb2fde7006f4, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0xbc48ae858eb682ca, 0x3feec9566f8827d0, 0xbc95fc5e44de020e, 0x3feec86d668b3237, 0x3c5dd71277c0915f, 0x3feec786d3001fe5, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c92001325ecd7fb, 0x3feec5c10fa920a1, 0xbc7ea0148327c42f, 0x3feec4e1e192aed2, 0x3c65ace6e2870332, 0x3feec4052c5916c4, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0xbc9595c55690ffaf, 0x3feec2532feaada6, 0xbc7a843ad1a88022, 0x3feec17dea6db7d7, 0xbc8b401ba9fb5199, 0x3feec0ab213d5283, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c6df82bf324cc57, 0x3feebf0d073537ca, 0x3c892ca3bf144e63, 0x3feebe41b817c114, 0x3c97cae38641c7bb, 0x3feebd78e8bb586b, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a, 0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, 0x3c8f39c10d12eaf0, 
0x3feeba6eba2e35f0, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc80b582d74a55d9, 0x3feeb8f8b804f127, 0x3c73e34f67e67118, 0x3feeb8417f4531ee, 0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc592dca38593e20, 0x3feeb62b00da3b14, 0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, 0xbc85daca9994833e, 0x3feeb4d359dfd53d, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0xbc980b4321bc6dae, 0x3feeb385df598d78, 0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, 0xbc8390afec5241c5, 0x3feeb24298571b06, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf, 0xbc896be8ae89ef8f, 0x3feeb070dde910d2, 0xbc910aa91ae9b67f, 0x3feeafdac1351819, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c957e1b67462375, 0x3feeaeb63f4d854c, 0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, 0x3c8124d5051552a7, 0x3feead9c0d59ca07, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0xbc3ca103952ecf1f, 0x3feeac8c32824135, 0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, 0x3c773345c02a4fd6, 0x3feeab86b5f43d92, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e, 0xbc943a3540d1898a, 0x3feeaa11fba87a03, 0xbc924f2cb4f81746, 0x3feea99af482fc8f, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0xbc943592a0a9846b, 0x3feea8b4be135acc, 0xbc951f58ddaa8090, 0x3feea84590998b93, 0xbc956bc85d444f4f, 0x3feea7d902d47c65, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0x3c914d1e4218319f, 0x3feea707ca0cbf0f, 0xbc82e1648e50a17c, 0x3feea6a320dceb71, 0x3c971c93709313f4, 0x3feea6411b078d26, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0x3c7f88303b60d222, 0x3feea584fd15612a, 0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, 0x3c70125ca18d4b5b, 0x3feea4d3778bc944, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0x3c9592ea73798b11, 0x3feea42c91c56acd, 0x3c917ecda8a72159, 0x3feea3dd1d1929fd, 0xbc9371d6d7d75739, 0x3feea390532205d8, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc8ac05fd996f807, 0x3feea2fec30678b7, 0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, 0xbc91f5067d03653a, 0x3feea277e8dcc390, 0xbc93cedd78565858, 0x3feea23882552225, 0x3c917339c86ce3ad, 
0x3feea1fbcc140be7, 0xbc85c33fdf910406, 0x3feea1c1c70833f6, 0xbc77e66065ba2500, 0x3feea18a7420a036, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0x3c964c827ee6b49a, 0x3feea123e87bfb7a, 0x3c81079ab5789604, 0x3feea0f4b19e9538, 0xbc928311a3c73480, 0x3feea0c830a4c8d4, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0x3c882c79e185e981, 0x3feea077541ee718, 0x3c727df161cd7778, 0x3feea052fa75173e, 0xbc8b48cea80b043b, 0x3feea0315a736c75, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09, 0x3c93f9924a05b767, 0x3fee9fdcddd47645, 0x3c954835dd4b7548, 0x3fee9fc62dea2f8a, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8, 0xbc87557939a8b5ef, 0x3fee9f9298593ae5, 0xbc8f652fde52775c, 0x3fee9f86e7ba9fef, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0xbc7b0300defbcf98, 0x3fee9f77ce1303f6, 0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, 0xbc89dab646035dc0, 0x3fee9f73c4eaa988, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc91f0c230588dde, 0x3fee9f7ad3ef9011, 0xbc88e67a9006c909, 0x3fee9f8286ead08a, 0x3c9106450507a28c, 0x3fee9f8d02d50b8f, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0xbc9129729a10f3a0, 0x3fee9faa5953c849, 0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, 0x3c781a70a5124f67, 0x3fee9fd2df29ce7c, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c941626ea62646d, 0x3feea0069c1a861d, 0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, 0xbc940b9f54365b7c, 0x3feea04597eeba8f, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0x3c873455e0e826c1, 0x3feea08fda749e5d, 0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, 0x3c94f006ad874e3e, 0x3feea0e56b7fcf03, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0xbc8f6d693d0973bb, 0x3feea14652e958aa, 0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, 0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0xbc88b25e045d207b, 0x3feea22a4456e7a3, 0x3c87d51410fd15c2, 0x3feea26a62ff86f0, 0xbc69cb3314060ca7, 0x3feea2ad5e2850ac, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9, 0xbc760a3629969871, 0x3feea3878491c491, 0x3c94aa7212bfa73c, 
0x3feea3d5fbab091f, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0xbc81e688272a8a12, 0x3feea47b8f4abaa9, 0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, 0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0x3c8a1e274eed4476, 0x3feea5e968443d9a, 0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, 0x3c94a533a59324da, 0x3feea6b1bdadb46d, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0x3c7a56d2760d087d, 0x3feea785b91e07f1, 0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, 0x3c91682c1c6e8b05, 0x3feea86562ab00ec, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c89ea99cf7a9591, 0x3feea950c27004c2, 0x3c7c88549b958471, 0x3feea9cad931a436, 0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c909b176e05a9cd, 0x3feeab4ac52be8f7, 0x3c931143962f7877, 0x3feeabd0a478580f, 0x3c711607f1952c95, 0x3feeac597875c644, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c869608f0f86431, 0x3feead74029db01e, 0x3c93e9e96f112479, 0x3feeae05bad61778, 0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6, 0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, 0x3c81c1701c359530, 0x3feeb10afc931857, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc8edb1bf6809287, 0x3feeb2553499284b, 0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, 0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc93fc025e1db9ce, 0x3feeb50dad829e70, 0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, 0xbc8d737c7d71382e, 0x3feeb67bff148396, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0x3c6ae88c43905293, 0x3feeb7f669e2802b, 0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, 0xbc93d1f7661fe51b, 0x3feeb97cf65253d1, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0x3c651b68797ffc1c, 0x3feebb0faccf9243, 0xbc51669428996971, 0x3feebbdd9a7670b3, 0x3c54579c5ceed70b, 0x3feebcae95cba768, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c87298413381667, 0x3feebe59b9bddb5b, 0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, 0xbc905000be64e965, 0x3feec01121235681, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0xbc89fb12e3454b73, 
0x3feec1d4d47f2598, 0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, 0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0x3c90622b15810eea, 0x3feec581414380f2, 0xbc8a1e58414c07d3, 0x3feec674194bb8d5, 0x3be9a5ecc875d327, 0x3feec76a0bcfc15e, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0x3c88ea486a3350ef, 0x3feec95f4499c647, 0xbc79740b58a20091, 0x3feeca5e8d07f29e, 0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c89c31f7e38028b, 0x3feecd6f23701b15, 0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, 0xbc5fac13f4e005a3, 0x3feecf89dacfe68c, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0x3c7d8aced7162e89, 0x3feed1b1231475f7, 0xbc903d5cbe27874b, 0x3feed2c980460ad8, 0xbc848f50cea7269f, 0x3feed3e504f696b1, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c821eb9a08a0542, 0x3feed625893523d4, 0x3c5986178980fce0, 0x3feed74a8af46052, 0xbc6133a953131cfd, 0x3feed872b8950a73, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca, 0xbc89472975b1f2a5, 0x3feedbfe53c12e59, 0xbc90260cf07cb311, 0x3feedd333beb0b7e, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c1bca400a7b939d, 0x3feedfa6a1897fd2, 0x3c7d8157a34b7e7f, 0x3feee0e521356eba, 0x3c9140bc34dfc19f, 0x3feee226d59a09ee, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0xbc8c9b1da461ab87, 0x3feee4b3e100301e, 0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, 0x3c8c115f23ebea8e, 0x3feee74dcca5a413, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc6dcab99f23f84e, 0x3feee9f4a17a4735, 0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, 0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0x3c915b1397075f04, 0x3feeef692a8fa8cd, 0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, 0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc86a510f31e13e6, 0x3feef511c43bbd62, 0xbc7274aedac8ff80, 0x3feef68415b749b1, 0xbc92887ea88e7340, 0x3feef7f9ade433c6, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87, 0x3c85c620ce76df06, 0x3feefc6e29f1c52a, 0x3c8e6c6db4f83226, 
0x3feefdf0e844bfc6, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc8d1bf10460dba0, 0x3fef01004b3a7804, 0xbc8fda52e1b51e41, 0x3fef028cf22749e4, 0x3c8e5d80813dddfc, 0x3fef041ce8e77680, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7, 0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, 0x3c7a77557fd62db3, 0x3fef0a7df9285775, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc651ba6128db749, 0x3fef0dc27e2cb5e5, 0xbc302899507554e5, 0x3fef0f69c3f3a207, 0xbc7c0ffefdc5e251, 0x3fef111462c95b60, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30, 0xbc80dda2d4c0010c, 0x3fef16286141b33d, 0x3c923759b8aca76d, 0x3fef17e06ff301f4, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0xbc895498a73dac7d, 0x3fef1b5aab23e61e, 0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, 0x3c851de924583108, 0x3fef1ee26b34e065, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0xbc8c5fe4051ba06c, 0x3fef2277b9881650, 0x3c836909391181d3, 0x3fef244778fafb22, 0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0xbc7af5c67c4e8235, 0x3fef29cb269e601f, 0xbc811cd7dbdf9547, 0x3fef2ba88988c933, 0xbc8304ef0045d575, 0x3fef2d89584661a1, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0x3c8725f94f910375, 0x3fef31553dfa8313, 0xbc7ac28b7bef6621, 0x3fef33405751c4db, 0x3c7b53e99f9191e8, 0x3fef352ee13da7cb, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc810a79e6d7e2b8, 0x3fef39164b994d23, 0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, 0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0x3c549eeef9ec910c, 0x3fef410e9be12cb9, 0xbc8cc734592af7fc, 0x3fef43155b5bab74, 0xbc8335827ffb9dce, 0x3fef451f95018d17, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c645563980ef762, 0x3fef493e7ba2c38c, 0x3c87752a44f587e8, 0x3fef4b532b08c968, 0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0xbc8aab80ceab2b4a, 0x3fef51a638197a3c, 0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, 0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c83c119f18464c5, 
0x3fef5a461eec14be, 0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, 0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b, 0xbc82919e2040220f, 0x3fef60e316c98398, 0xbc72550d76be719a, 0x3fef631e7e2d479d, 0x3c8c254d16117a68, 0x3fef655d71ff6075, 0xbc82090274667d12, 0x3fef679ff37adb4a, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd, 0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, 0x3c890de9296f4cd1, 0x3fef70cd9ab294e4, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0x3c832ff9978b34bc, 0x3fef7579e065807d, 0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, 0xbc7303b63dda1980, 0x3fef7a347f63c159, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1, 0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, 0x3c768d9144ae12fc, 0x3fef83d4f11f8220, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0x3c853687f542403b, 0x3fef88bad7dcee90, 0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, 0xbc736ed2de40b407, 0x3fef8daf3fe592e8, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc614ef56c770f3b, 0x3fef92b2334ac7ee, 0xbc776caa4c2ff1cf, 0x3fef953924676d76, 0x3c8df7d1353d8e88, 0x3fef97c3bc24e350, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc850bed64091b8a, 0x3fef9ce3e4933c7e, 0xbc81d5fc525d9940, 0x3fef9f7977cdb740, 0x3c89d852381c317f, 0x3fefa212b6bc3181, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5, 0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, 0xbc5a1f25ce94cae7, 0x3fefac9c80faa594, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2, 0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 0x3c737e8ae802b851, 0x3fefb7616ca06dd6, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c875119560e34af, 0x3fefbcda28a52e59, 0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, 0xbc7431c3840929c6, 0x3fefc261cbdf5be7, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0xbc8cb472d2e86b99, 0x3fefc7f860a70c22, 0xbc69fa74878ba7c7, 0x3fefcac948dd7274, 0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c8eef18336b62e3, 0x3fefd35288633625, 0x3c901f3a75ee0efe, 0x3fefd632798844f8, 0x3c80d23f87b50a2a, 
0x3fefd916302bd526, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0x3c8302dee657c8e6, 0x3fefdee8f32a4b45, 0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, 0xbc7b0caa080df170, 0x3fefe4cadbdac61d, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54, 0xbc699c7db2effc76, 0x3fefedba3692d514, 0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, 0x3c8e70b094fa075a, 0x3feff6cbe15f6314, 0x3c64b458677f9840, 0x3feff9d96b2a23d9, 0xbc72ec9a3e5d680a, 0x3feffceaca4391b6, #endif }, }; diff --git a/contrib/arm-optimized-routines/math/expf.c b/contrib/arm-optimized-routines/math/expf.c index 9b2f0c3d8c56..08a20d59e491 100644 --- a/contrib/arm-optimized-routines/math/expf.c +++ b/contrib/arm-optimized-routines/math/expf.c @@ -1,91 +1,91 @@ /* * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* EXP2F_TABLE_BITS = 5 EXP2F_POLY_ORDER = 3 ULP error: 0.502 (nearest rounding.) Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) Wrong count: 170635 (all nearest rounding wrong results with fma.) Non-nearest ULP error: 1 (rounded ULP error) */ #define N (1 << EXP2F_TABLE_BITS) #define InvLn2N __exp2f_data.invln2_scaled #define T __exp2f_data.tab #define C __exp2f_data.poly_scaled static inline uint32_t top12 (float x) { return asuint (x) >> 20; } float expf (float x) { uint32_t abstop; uint64_t ki, t; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, xd, z, r, r2, y, s; xd = (double_t) x; abstop = top12 (x) & 0x7ff; if (unlikely (abstop >= top12 (88.0f))) { /* |x| >= 88 or x is nan. 
*/ if (asuint (x) == asuint (-INFINITY)) return 0.0f; if (abstop >= top12 (INFINITY)) return x + x; if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ return __math_oflowf (0); if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ return __math_uflowf (0); #if WANT_ERRNO_UFLOW if (x < -0x1.9d1d9ep6f) /* x < log(0x1p-149) ~= -103.28 */ return __math_may_uflowf (0); #endif } /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ z = InvLn2N * xd; /* Round and convert z to int, the result is in [-150*N, 128*N] and ideally nearest int is used, otherwise the magnitude of r can be bigger which gives larger approximation error. */ #if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); #else # define SHIFT __exp2f_data.shift kd = eval_as_double (z + SHIFT); ki = asuint64 (kd); kd -= SHIFT; #endif r = z - kd; /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ t = T[ki % N]; t += ki << (52 - EXP2F_TABLE_BITS); s = asdouble (t); z = C[0] * r + C[1]; r2 = r * r; y = C[2] * r + 1; y = z * r2 + y; y = y * s; return eval_as_float (y); } #if USE_GLIBC_ABI strong_alias (expf, __expf_finite) hidden_alias (expf, __ieee754_expf) #endif diff --git a/contrib/arm-optimized-routines/math/include/mathlib.h b/contrib/arm-optimized-routines/math/include/mathlib.h index 279d829d8ea1..c520c3772f7f 100644 --- a/contrib/arm-optimized-routines/math/include/mathlib.h +++ b/contrib/arm-optimized-routines/math/include/mathlib.h @@ -1,100 +1,100 @@ /* * Public API. * * Copyright (c) 2015-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H #define _MATHLIB_H float expf (float); float exp2f (float); float logf (float); float log2f (float); float powf (float, float); float sinf (float); float cosf (float); void sincosf (float, float*, float*); double exp (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); /* Scalar functions using the vector algorithm with identical result. */ float __s_sinf (float); float __s_cosf (float); float __s_expf (float); float __s_expf_1u (float); float __s_exp2f (float); float __s_exp2f_1u (float); float __s_logf (float); float __s_powf (float, float); double __s_sin (double); double __s_cos (double); double __s_exp (double); double __s_log (double); double __s_pow (double, double); #if __aarch64__ #if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; #elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; #else #error Unsupported compiler #endif /* Vector functions following the base PCS. */ __f32x4_t __v_sinf (__f32x4_t); __f32x4_t __v_cosf (__f32x4_t); __f32x4_t __v_expf (__f32x4_t); __f32x4_t __v_expf_1u (__f32x4_t); __f32x4_t __v_exp2f (__f32x4_t); __f32x4_t __v_exp2f_1u (__f32x4_t); __f32x4_t __v_logf (__f32x4_t); __f32x4_t __v_powf (__f32x4_t, __f32x4_t); __f64x2_t __v_sin (__f64x2_t); __f64x2_t __v_cos (__f64x2_t); __f64x2_t __v_exp (__f64x2_t); __f64x2_t __v_log (__f64x2_t); __f64x2_t __v_pow (__f64x2_t, __f64x2_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS. 
*/ __vpcs __f32x4_t __vn_sinf (__f32x4_t); __vpcs __f32x4_t __vn_cosf (__f32x4_t); __vpcs __f32x4_t __vn_expf (__f32x4_t); __vpcs __f32x4_t __vn_expf_1u (__f32x4_t); __vpcs __f32x4_t __vn_exp2f (__f32x4_t); __vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); __vpcs __f32x4_t __vn_logf (__f32x4_t); __vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); __vpcs __f64x2_t __vn_sin (__f64x2_t); __vpcs __f64x2_t __vn_cos (__f64x2_t); __vpcs __f64x2_t __vn_exp (__f64x2_t); __vpcs __f64x2_t __vn_log (__f64x2_t); __vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); #endif #endif #endif diff --git a/contrib/arm-optimized-routines/math/log.c b/contrib/arm-optimized-routines/math/log.c index d3b7bc60747c..43dfc2a744f0 100644 --- a/contrib/arm-optimized-routines/math/log.c +++ b/contrib/arm-optimized-routines/math/log.c @@ -1,162 +1,162 @@ /* * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" #define T __log_data.tab #define T2 __log_data.tab2 #define B __log_data.poly1 #define A __log_data.poly #define Ln2hi __log_data.ln2hi #define Ln2lo __log_data.ln2lo #define N (1 << LOG_TABLE_BITS) #define OFF 0x3fe6000000000000 /* Top 16 bits of a double. 
*/ static inline uint32_t top16 (double x) { return asuint64 (x) >> 48; } double log (double x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; uint64_t ix, iz, tmp; uint32_t top; int k, i; ix = asuint64 (x); top = top16 (x); #if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11 # define LO asuint64 (1.0 - 0x1p-5) # define HI asuint64 (1.0 + 0x1.1p-5) #elif LOG_POLY1_ORDER == 12 # define LO asuint64 (1.0 - 0x1p-4) # define HI asuint64 (1.0 + 0x1.09p-4) #endif if (unlikely (ix - LO < HI - LO)) { /* Handle close to 1.0 inputs separately. */ /* Fix sign of zero with downward rounding when x==1. */ if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) return 0; r = x - 1.0; r2 = r * r; r3 = r * r2; #if LOG_POLY1_ORDER == 10 /* Worst-case error is around 0.516 ULP. */ y = r3 * (B[1] + r * B[2] + r2 * B[3] + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); w = B[0] * r2; /* B[0] == -0.5. */ hi = r + w; y += r - hi + w; y += hi; #elif LOG_POLY1_ORDER == 11 /* Worst-case error is around 0.516 ULP. */ y = r3 * (B[1] + r * B[2] + r2 * (B[3] + r * B[4] + r2 * B[5] + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9]))); w = B[0] * r2; /* B[0] == -0.5. */ hi = r + w; y += r - hi + w; y += hi; #elif LOG_POLY1_ORDER == 12 y = r3 * (B[1] + r * B[2] + r2 * B[3] + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); # if N <= 64 /* Worst-case error is around 0.532 ULP. */ w = B[0] * r2; /* B[0] == -0.5. */ hi = r + w; y += r - hi + w; y += hi; # else /* Worst-case error is around 0.507 ULP. */ w = r * 0x1p27; double_t rhi = r + w - w; double_t rlo = r - rhi; w = rhi * rhi * B[0]; /* B[0] == -0.5. */ hi = r + w; lo = r - hi + w; lo += B[0] * rlo * (rhi + r); y += lo; y += hi; # endif #endif return eval_as_double (y); } if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) { /* x < 0x1p-1022 or inf or nan. 
*/ if (ix * 2 == 0) return __math_divzero (1); if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ return x; if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) return __math_invalid (x); /* x is subnormal, normalize it. */ ix = asuint64 (x * 0x1p52); ix -= 52ULL << 52; } /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (52 - LOG_TABLE_BITS)) % N; k = (int64_t) tmp >> 52; /* arithmetic shift */ iz = ix - (tmp & 0xfffULL << 52); invc = T[i].invc; logc = T[i].logc; z = asdouble (iz); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ /* r ~= z/c - 1, |r| < 1/(2*N). */ #if HAVE_FAST_FMA /* rounding error: 0x1p-55/N. */ r = fma (z, invc, -1.0); #else /* rounding error: 0x1p-55/N + 0x1p-66. */ r = (z - T2[i].chi - T2[i].clo) * invc; #endif kd = (double_t) k; /* hi + lo = r + log(c) + k*Ln2. */ w = kd * Ln2hi + logc; hi = w + r; lo = w - hi + r + kd * Ln2lo; /* log(x) = lo + (log1p(r) - r) + hi. */ r2 = r * r; /* rounding error: 0x1p-54/N^2. */ /* Worst case error if |y| > 0x1p-5: 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) Worst case error if |y| > 0x1p-4: 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). 
*/ #if LOG_POLY_ORDER == 6 y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; #elif LOG_POLY_ORDER == 7 y = lo + r2 * (A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r2 * r2 * (A[4] + r * A[5])) + hi; #endif return eval_as_double (y); } #if USE_GLIBC_ABI strong_alias (log, __log_finite) hidden_alias (log, __ieee754_log) # if LDBL_MANT_DIG == 53 long double logl (long double x) { return log (x); } # endif #endif diff --git a/contrib/arm-optimized-routines/math/log2.c b/contrib/arm-optimized-routines/math/log2.c index 55102b772969..3f9c21b03962 100644 --- a/contrib/arm-optimized-routines/math/log2.c +++ b/contrib/arm-optimized-routines/math/log2.c @@ -1,141 +1,141 @@ /* * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" #define T __log2_data.tab #define T2 __log2_data.tab2 #define B __log2_data.poly1 #define A __log2_data.poly #define InvLn2hi __log2_data.invln2hi #define InvLn2lo __log2_data.invln2lo #define N (1 << LOG2_TABLE_BITS) #define OFF 0x3fe6000000000000 /* Top 16 bits of a double. */ static inline uint32_t top16 (double x) { return asuint64 (x) >> 48; } double log2 (double x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p; uint64_t ix, iz, tmp; uint32_t top; int k, i; ix = asuint64 (x); top = top16 (x); #if LOG2_POLY1_ORDER == 11 # define LO asuint64 (1.0 - 0x1.5b51p-5) # define HI asuint64 (1.0 + 0x1.6ab2p-5) #endif if (unlikely (ix - LO < HI - LO)) { /* Handle close to 1.0 inputs separately. */ /* Fix sign of zero with downward rounding when x==1. 
*/ if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) return 0; r = x - 1.0; #if HAVE_FAST_FMA hi = r * InvLn2hi; lo = r * InvLn2lo + fma (r, InvLn2hi, -hi); #else double_t rhi, rlo; rhi = asdouble (asuint64 (r) & -1ULL << 32); rlo = r - rhi; hi = rhi * InvLn2hi; lo = rlo * InvLn2hi + r * InvLn2lo; #endif r2 = r * r; /* rounding error: 0x1p-62. */ r4 = r2 * r2; #if LOG2_POLY1_ORDER == 11 /* Worst-case error is less than 0.54 ULP (0.55 ULP without fma). */ p = r2 * (B[0] + r * B[1]); y = hi + p; lo += hi - y + p; lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) + r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9]))); y += lo; #endif return eval_as_double (y); } if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) { /* x < 0x1p-1022 or inf or nan. */ if (ix * 2 == 0) return __math_divzero (1); if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ return x; if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) return __math_invalid (x); /* x is subnormal, normalize it. */ ix = asuint64 (x * 0x1p52); ix -= 52ULL << 52; } /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (52 - LOG2_TABLE_BITS)) % N; k = (int64_t) tmp >> 52; /* arithmetic shift */ iz = ix - (tmp & 0xfffULL << 52); invc = T[i].invc; logc = T[i].logc; z = asdouble (iz); kd = (double_t) k; /* log2(x) = log2(z/c) + log2(c) + k. */ /* r ~= z/c - 1, |r| < 1/(2*N). */ #if HAVE_FAST_FMA /* rounding error: 0x1p-55/N. */ r = fma (z, invc, -1.0); t1 = r * InvLn2hi; t2 = r * InvLn2lo + fma (r, InvLn2hi, -t1); #else double_t rhi, rlo; /* rounding error: 0x1p-55/N + 0x1p-65. */ r = (z - T2[i].chi - T2[i].clo) * invc; rhi = asdouble (asuint64 (r) & -1ULL << 32); rlo = r - rhi; t1 = rhi * InvLn2hi; t2 = rlo * InvLn2hi + r * InvLn2lo; #endif /* hi + lo = r/ln2 + log2(c) + k. */ t3 = kd + logc; hi = t3 + t1; lo = t3 - hi + t1 + t2; /* log2(r+1) = r/ln2 + r^2*poly(r). 
*/ /* Evaluation is optimized assuming superscalar pipelined execution. */ r2 = r * r; /* rounding error: 0x1p-54/N^2. */ r4 = r2 * r2; #if LOG2_POLY_ORDER == 7 /* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma). ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma). */ p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]); y = lo + r2 * p + hi; #endif return eval_as_double (y); } #if USE_GLIBC_ABI strong_alias (log2, __log2_finite) hidden_alias (log2, __ieee754_log2) # if LDBL_MANT_DIG == 53 long double log2l (long double x) { return log2 (x); } # endif #endif diff --git a/contrib/arm-optimized-routines/math/log2_data.c b/contrib/arm-optimized-routines/math/log2_data.c index 3fc9b47c1f03..293bd7df4118 100644 --- a/contrib/arm-optimized-routines/math/log2_data.c +++ b/contrib/arm-optimized-routines/math/log2_data.c @@ -1,209 +1,209 @@ /* * Data for log2. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << LOG2_TABLE_BITS) const struct log2_data __log2_data = { // First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0 .invln2hi = 0x1.7154765200000p+0, .invln2lo = 0x1.705fc2eefa200p-33, .poly1 = { #if LOG2_POLY1_ORDER == 11 // relative error: 0x1.2fad8188p-63 // in -0x1.5b51p-5 0x1.6ab2p-5 -0x1.71547652b82fep-1, 0x1.ec709dc3a03f7p-2, -0x1.71547652b7c3fp-2, 0x1.2776c50f05be4p-2, -0x1.ec709dd768fe5p-3, 0x1.a61761ec4e736p-3, -0x1.7153fbc64a79bp-3, 0x1.484d154f01b4ap-3, -0x1.289e4a72c383cp-3, 0x1.0b32f285aee66p-3, #endif }, .poly = { #if N == 64 && LOG2_POLY_ORDER == 7 // relative error: 0x1.a72c2bf8p-58 // abs error: 0x1.67a552c8p-66 // in -0x1.f45p-8 0x1.f45p-8 -0x1.71547652b8339p-1, 0x1.ec709dc3a04bep-2, -0x1.7154764702ffbp-2, 0x1.2776c50034c48p-2, -0x1.ec7b328ea92bcp-3, 0x1.a6225e117f92ep-3, #endif }, /* Algorithm: x = 2^k z log2(x) = k + log2(c) + log2(z/c) log2(z/c) = poly(z/c - 1) where z 
is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls into the ith one, then table entries are computed as tab[i].invc = 1/c tab[i].logc = (double)log2(c) tab2[i].chi = (double)c tab2[i].clo = (double)(c - (double)c) where c is near the center of the subinterval and is chosen by trying +-2^29 floating point invc candidates around 1/center and selecting one for which 1) the rounding error in 0x1.8p10 + logc is 0, 2) the rounding error in z - chi - clo is < 0x1p-64 and 3) the rounding error in (double)log2(c) is minimized (< 0x1p-68). Note: 1) ensures that k + logc can be computed without rounding error, 2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a single rounding error when there is no fast fma for z*invc - 1, 3) ensures that logc + poly(z/c - 1) has small error, however near x == 1 when |log2(x)| < 0x1p-4, this is not enough so that is special cased. */ .tab = { #if N == 64 {0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1}, {0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1}, {0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1}, {0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2}, {0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2}, {0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2}, {0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2}, {0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2}, {0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2}, {0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2}, {0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2}, {0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2}, {0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2}, {0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2}, {0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2}, {0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2}, {0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2}, {0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2}, {0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2}, {0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2}, {0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3}, {0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3}, {0x1.288b02c7ccb50p+0, 
-0x1.b26034c14a000p-3}, {0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3}, {0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3}, {0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3}, {0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3}, {0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3}, {0x1.19453847f2200p+0, -0x1.162595afdc000p-3}, {0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4}, {0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4}, {0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4}, {0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4}, {0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4}, {0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4}, {0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5}, {0x1.07325cac53b83p+0, -0x1.47a954f770000p-5}, {0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6}, {0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6}, {0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8}, {0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7}, {0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5}, {0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5}, {0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4}, {0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4}, {0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4}, {0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3}, {0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3}, {0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3}, {0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3}, {0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3}, {0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3}, {0x1.ac57026295039p-1, 0x1.0790ab4678000p-2}, {0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2}, {0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2}, {0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2}, {0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2}, {0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2}, {0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2}, {0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2}, {0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2}, {0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2}, {0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2}, {0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2}, #endif }, #if !HAVE_FAST_FMA .tab2 = { # if N == 64 
{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55}, {0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57}, {0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55}, {0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55}, {0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55}, {0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56}, {0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56}, {0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57}, {0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55}, {0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57}, {0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55}, {0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55}, {0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56}, {0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56}, {0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56}, {0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55}, {0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57}, {0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55}, {0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55}, {0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58}, {0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55}, {0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58}, {0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56}, {0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56}, {0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57}, {0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56}, {0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56}, {0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55}, {0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58}, {0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56}, {0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55}, {0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56}, {0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55}, {0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56}, {0x1.ea00027edc00cp-1, -0x1.c848309459811p-55}, {0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55}, {0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55}, {0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59}, {0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58}, {0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55}, {0x1.0200004292367p+0, 0x1.b7ff365324681p-54}, {0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55}, 
{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58}, {0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54}, {0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55}, {0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54}, {0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54}, {0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54}, {0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55}, {0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55}, {0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56}, {0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54}, {0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56}, {0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54}, {0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56}, {0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54}, {0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56}, {0x1.460000d387cb1p+0, 0x1.20837856599a6p-55}, {0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55}, {0x1.4e000043543f3p+0, -0x1.81125ed175329p-56}, {0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54}, {0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55}, {0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55}, {0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54}, # endif }, #endif /* !HAVE_FAST_FMA */ }; diff --git a/contrib/arm-optimized-routines/math/log2f.c b/contrib/arm-optimized-routines/math/log2f.c index acb629e6846c..0a44fa2024f6 100644 --- a/contrib/arm-optimized-routines/math/log2f.c +++ b/contrib/arm-optimized-routines/math/log2f.c @@ -1,80 +1,80 @@ /* * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* LOG2F_TABLE_BITS = 4 LOG2F_POLY_ORDER = 4 ULP error: 0.752 (nearest rounding.) Relative error: 1.9 * 2^-26 (before rounding.) */ #define N (1 << LOG2F_TABLE_BITS) #define T __log2f_data.tab #define A __log2f_data.poly #define OFF 0x3f330000 float log2f (float x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. 
*/ double_t z, r, r2, p, y, y0, invc, logc; uint32_t ix, iz, top, tmp; int k, i; ix = asuint (x); #if WANT_ROUNDING /* Fix sign of zero with downward rounding when x==1. */ if (unlikely (ix == 0x3f800000)) return 0; #endif if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { /* x < 0x1p-126 or inf or nan. */ if (ix * 2 == 0) return __math_divzerof (1); if (ix == 0x7f800000) /* log2(inf) == inf. */ return x; if ((ix & 0x80000000) || ix * 2 >= 0xff000000) return __math_invalidf (x); /* x is subnormal, normalize it. */ ix = asuint (x * 0x1p23f); ix -= 23 << 23; } /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N; top = tmp & 0xff800000; iz = ix - top; k = (int32_t) tmp >> 23; /* arithmetic shift */ invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ r = z * invc - 1; y0 = logc + (double_t) k; /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ r2 = r * r; y = A[1] * r + A[2]; y = A[0] * r2 + y; p = A[3] * r + y0; y = y * r2 + p; return eval_as_float (y); } #if USE_GLIBC_ABI strong_alias (log2f, __log2f_finite) hidden_alias (log2f, __ieee754_log2f) #endif diff --git a/contrib/arm-optimized-routines/math/log2f_data.c b/contrib/arm-optimized-routines/math/log2f_data.c index f3546d730aba..4866ef7f8171 100644 --- a/contrib/arm-optimized-routines/math/log2f_data.c +++ b/contrib/arm-optimized-routines/math/log2f_data.c @@ -1,33 +1,33 @@ /* * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" const struct log2f_data __log2f_data = { .tab = { { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 }, { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 }, { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 }, { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 }, { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 }, { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 }, { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 }, { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 }, { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 }, { 0x1p+0, 0x0p+0 }, { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 }, { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 }, { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 }, { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 }, { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 }, { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 }, }, .poly = { -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1, 0x1.715475f35c8b8p0, } }; diff --git a/contrib/arm-optimized-routines/math/log_data.c b/contrib/arm-optimized-routines/math/log_data.c index 96a098d42c16..3ecc1f40a822 100644 --- a/contrib/arm-optimized-routines/math/log_data.c +++ b/contrib/arm-optimized-routines/math/log_data.c @@ -1,511 +1,511 @@ /* * Data for log. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << LOG_TABLE_BITS) const struct log_data __log_data = { .ln2hi = 0x1.62e42fefa3800p-1, .ln2lo = 0x1.ef35793c76730p-45, .poly1 = { #if LOG_POLY1_ORDER == 10 // relative error: 0x1.32eccc6p-62 // in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) -0x1p-1, 0x1.55555555554e5p-2, -0x1.0000000000af2p-2, 0x1.9999999bbe436p-3, -0x1.55555537f9cdep-3, 0x1.24922fc8127cfp-3, -0x1.0000b7d6bb612p-3, 0x1.c806ee1ddbcafp-4, -0x1.972335a9c2d6ep-4, #elif LOG_POLY1_ORDER == 11 // relative error: 0x1.52c8b708p-68 // in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) -0x1p-1, 0x1.5555555555555p-2, -0x1.ffffffffffea9p-3, 0x1.999999999c4d4p-3, -0x1.55555557f5541p-3, 0x1.249248fbe33e4p-3, -0x1.ffffc9a3c825bp-4, 0x1.c71e1f204435dp-4, -0x1.9a7f26377d06ep-4, 0x1.71c30cf8f7364p-4, #elif LOG_POLY1_ORDER == 12 // relative error: 0x1.c04d76cp-63 // in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) -0x1p-1, 0x1.5555555555577p-2, -0x1.ffffffffffdcbp-3, 0x1.999999995dd0cp-3, -0x1.55555556745a7p-3, 0x1.24924a344de3p-3, -0x1.fffffa4423d65p-4, 0x1.c7184282ad6cap-4, -0x1.999eb43b068ffp-4, 0x1.78182f7afd085p-4, -0x1.5521375d145cdp-4, #endif }, .poly = { #if N == 64 && LOG_POLY_ORDER == 7 // relative error: 0x1.906eb8ap-58 // abs error: 0x1.d2cad5a8p-67 // in -0x1.fp-8 0x1.fp-8 -0x1.0000000000027p-1, 0x1.555555555556ap-2, -0x1.fffffff0440bap-3, 0x1.99999991906c3p-3, -0x1.555c8d7e8201ep-3, 0x1.24978c59151fap-3, #elif N == 128 && LOG_POLY_ORDER == 6 // relative error: 0x1.926199e8p-56 // abs error: 0x1.882ff33p-65 // in -0x1.fp-9 0x1.fp-9 -0x1.0000000000001p-1, 0x1.555555551305bp-2, -0x1.fffffffeb459p-3, 0x1.999b324f10111p-3, -0x1.55575e506c89fp-3, #elif N == 128 && LOG_POLY_ORDER == 7 // relative error: 0x1.649fc4bp-64 // abs error: 0x1.c3b5769p-74 // in -0x1.fp-9 0x1.fp-9 -0x1.0000000000001p-1, 0x1.5555555555556p-2, 
-0x1.fffffffea1a8p-3, 0x1.99999998e9139p-3, -0x1.555776801b968p-3, 0x1.2493c29331a5cp-3, #endif }, /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + log(z/c) log(z/c) = poly(z/c - 1) where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls into the ith one, then table entries are computed as tab[i].invc = 1/c tab[i].logc = (double)log(c) tab2[i].chi = (double)c tab2[i].clo = (double)(c - (double)c) where c is near the center of the subinterval and is chosen by trying +-2^29 floating point invc candidates around 1/center and selecting one for which 1) the rounding error in 0x1.8p9 + logc is 0, 2) the rounding error in z - chi - clo is < 0x1p-66 and 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, 2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a single rounding error when there is no fast fma for z*invc - 1, 3) ensures that logc + poly(z/c - 1) has small error, however near x == 1 when |log(x)| < 0x1p-4, this is not enough so that is special cased. 
*/ .tab = { #if N == 64 {0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2}, {0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2}, {0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2}, {0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2}, {0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2}, {0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2}, {0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2}, {0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2}, {0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2}, {0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2}, {0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2}, {0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2}, {0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3}, {0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3}, {0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3}, {0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3}, {0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3}, {0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3}, {0x1.33ae463091760p+0, -0x1.7898db878d000p-3}, {0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3}, {0x1.2e025c9203c89p+0, -0x1.527e620845000p-3}, {0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3}, {0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3}, {0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3}, {0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3}, {0x1.20b4703174157p+0, -0x1.ec738fee40000p-4}, {0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4}, {0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4}, {0x1.194538e960658p+0, -0x1.8197efba9a000p-4}, {0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4}, {0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4}, {0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4}, {0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5}, {0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5}, {0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5}, {0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5}, {0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6}, {0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6}, {0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7}, {0x1.01010152cf066p+0, -0x1.0080a711c0000p-8}, {0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8}, {0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6}, 
{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5}, {0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5}, {0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4}, {0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4}, {0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4}, {0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4}, {0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4}, {0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3}, {0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3}, {0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3}, {0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3}, {0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3}, {0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3}, {0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3}, {0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3}, {0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3}, {0x1.8d3018b58699ap-1, 0x1.040259974e000p-2}, {0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2}, {0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2}, {0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2}, {0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2}, {0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2}, #elif N == 128 {0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, {0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, {0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, {0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, {0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, {0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, {0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, {0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, {0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, {0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, {0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, {0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, {0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, {0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, {0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, {0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, {0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, {0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, {0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, {0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, {0x1.4d843cfde2840p+0, 
-0x1.0edd094ef9800p-2}, {0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, {0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, {0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, {0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, {0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, {0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, {0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, {0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, {0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, {0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, {0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, {0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, {0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, {0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, {0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, {0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, {0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, {0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, {0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, {0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, {0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, {0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, {0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, {0x1.293726014b530p+0, -0x1.31b996b490000p-3}, {0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, {0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, {0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, {0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, {0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, {0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, {0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, {0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, {0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, {0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, {0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, {0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, {0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, {0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, {0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, {0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, {0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, {0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, 
{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, {0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, {0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, {0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, {0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, {0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, {0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, {0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, {0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, {0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, {0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, {0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, {0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, {0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, {0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, {0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, {0x1.008040614b195p+0, -0x1.0040979240000p-9}, {0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, {0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, {0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, {0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, {0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, {0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, {0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, {0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, {0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, {0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, {0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, {0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, {0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, {0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, {0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, {0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, {0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, {0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, {0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, {0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, {0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, {0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, {0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, {0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, {0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, {0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, 
{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, {0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, {0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, {0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, {0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, {0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, {0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, {0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, {0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, {0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, {0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, {0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, {0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, {0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, {0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, {0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, {0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, {0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, {0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, {0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, {0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, {0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, #endif }, #if !HAVE_FAST_FMA .tab2 = { # if N == 64 {0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56}, {0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55}, {0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55}, {0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56}, {0x1.720000b37216ep-1, 0x1.802bc8d437043p-55}, {0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57}, {0x1.7a0000628daep-1, -0x1.e00434b49313dp-56}, {0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56}, {0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57}, {0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56}, {0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57}, {0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55}, {0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55}, {0x1.960000135d8eap-1, -0x1.f832268dc3095p-55}, {0x1.99ffff9435acp-1, 0x1.7031d8b835edcp-56}, {0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58}, {0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55}, {0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55}, {0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57}, {0x1.adffff29ce03dp-1, 
-0x1.fff0717ec71c2p-56}, {0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57}, {0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57}, {0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56}, {0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55}, {0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58}, {0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55}, {0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58}, {0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59}, {0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59}, {0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58}, {0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55}, {0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55}, {0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55}, {0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60}, {0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55}, {0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55}, {0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56}, {0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57}, {0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58}, {0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58}, {0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54}, {0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55}, {0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54}, {0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54}, {0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55}, {0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57}, {0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54}, {0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55}, {0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54}, {0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55}, {0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56}, {0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54}, {0x1.320000324c55bp+0, 0x1.f81983997354fp-54}, {0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54}, {0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54}, {0x1.3e0000751c61bp+0, 0x1.80583ed1c566ep-56}, {0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54}, {0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55}, {0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55}, {0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56}, {0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54}, {0x1.560000e342455p+0, 
0x1.3fb7fac8ac151p-55}, {0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55}, {0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54}, # elif N == 128 {0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, {0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, {0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, {0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, {0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, {0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, {0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, {0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, {0x1.710000e86978p-1, 0x1.bff6671097952p-56}, {0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, {0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, {0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, {0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, {0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, {0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, {0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, {0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, {0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, {0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, {0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, {0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, {0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, {0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, {0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, {0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, {0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, {0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, {0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, {0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, {0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, {0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, {0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, {0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, {0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, {0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, {0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, {0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, {0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, {0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, 
{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, {0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, {0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, {0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, {0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, {0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, {0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, {0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, {0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, {0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, {0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, {0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, {0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, {0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, {0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, {0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, {0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, {0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, {0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, {0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, {0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, {0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, {0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, {0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, {0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, {0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, {0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, {0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, {0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, {0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, {0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, {0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, {0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, {0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, {0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, {0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, {0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, {0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, {0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, {0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, {0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, {0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, 
{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, {0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, {0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, {0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, {0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, {0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, {0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, {0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, {0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, {0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, {0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, {0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, {0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, {0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, {0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, {0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, {0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, {0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, {0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, {0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, {0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, {0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, {0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, {0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, {0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, {0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, {0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, {0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, {0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, {0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, {0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, {0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, {0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, {0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, {0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, {0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, {0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, {0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, {0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, {0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, {0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, {0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, 
{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, {0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, {0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, {0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, {0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, #endif }, #endif /* !HAVE_FAST_FMA */ }; diff --git a/contrib/arm-optimized-routines/math/logf.c b/contrib/arm-optimized-routines/math/logf.c index cfbaee12df10..820f74c3e66a 100644 --- a/contrib/arm-optimized-routines/math/logf.c +++ b/contrib/arm-optimized-routines/math/logf.c @@ -1,79 +1,79 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* LOGF_TABLE_BITS = 4 LOGF_POLY_ORDER = 4 ULP error: 0.818 (nearest rounding.) Relative error: 1.957 * 2^-26 (before rounding.) */ #define T __logf_data.tab #define A __logf_data.poly #define Ln2 __logf_data.ln2 #define N (1 << LOGF_TABLE_BITS) #define OFF 0x3f330000 float logf (float x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t z, r, r2, y, y0, invc, logc; uint32_t ix, iz, tmp; int k, i; ix = asuint (x); #if WANT_ROUNDING /* Fix sign of zero with downward rounding when x==1. */ if (unlikely (ix == 0x3f800000)) return 0; #endif if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { /* x < 0x1p-126 or inf or nan. */ if (ix * 2 == 0) return __math_divzerof (1); if (ix == 0x7f800000) /* log(inf) == inf. */ return x; if ((ix & 0x80000000) || ix * 2 >= 0xff000000) return __math_invalidf (x); /* x is subnormal, normalize it. */ ix = asuint (x * 0x1p23f); ix -= 23 << 23; } /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. 
*/ tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ r = z * invc - 1; y0 = logc + (double_t) k * Ln2; /* Pipelined polynomial evaluation to approximate log1p(r). */ r2 = r * r; y = A[1] * r + A[2]; y = A[0] * r2 + y; y = y * r2 + (y0 + r); return eval_as_float (y); } #if USE_GLIBC_ABI strong_alias (logf, __logf_finite) hidden_alias (logf, __ieee754_logf) #endif diff --git a/contrib/arm-optimized-routines/math/logf_data.c b/contrib/arm-optimized-routines/math/logf_data.c index e8973ce4fedc..04247684755f 100644 --- a/contrib/arm-optimized-routines/math/logf_data.c +++ b/contrib/arm-optimized-routines/math/logf_data.c @@ -1,33 +1,33 @@ /* * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" const struct logf_data __logf_data = { .tab = { { 0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2 }, { 0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2 }, { 0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2 }, { 0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3 }, { 0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3 }, { 0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3 }, { 0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4 }, { 0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4 }, { 0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5 }, { 0x1p+0, 0x0p+0 }, { 0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5 }, { 0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4 }, { 0x1.b2036576afce6p-1, 0x1.526e57720db08p-3 }, { 0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3 }, { 0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2 }, { 0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2 }, }, .ln2 = 0x1.62e42fefa39efp-1, .poly = { -0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2, } }; diff --git 
a/contrib/arm-optimized-routines/math/math_config.h b/contrib/arm-optimized-routines/math/math_config.h index e85104337048..7ffc0cd2796a 100644 --- a/contrib/arm-optimized-routines/math/math_config.h +++ b/contrib/arm-optimized-routines/math/math_config.h @@ -1,462 +1,462 @@ /* * Configuration for math routines. * * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H #define _MATH_CONFIG_H #include #include #ifndef WANT_ROUNDING /* If defined to 1, return correct results for special cases in non-nearest rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). This may be set to 0 if there is no fenv support or if math functions only get called in round to nearest mode. */ # define WANT_ROUNDING 1 #endif #ifndef WANT_ERRNO /* If defined to 1, set errno in math functions according to ISO C. Many math libraries do not set errno, so this is 0 by default. It may need to be set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ # define WANT_ERRNO 0 #endif #ifndef WANT_ERRNO_UFLOW /* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */ # define WANT_ERRNO_UFLOW (WANT_ROUNDING && WANT_ERRNO) #endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND # if __aarch64__ # define HAVE_FAST_ROUND 1 # else # define HAVE_FAST_ROUND 0 # endif #endif /* Compiler can inline lround, but not (long)round(x). */ #ifndef HAVE_FAST_LROUND # if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ # define HAVE_FAST_LROUND 1 # else # define HAVE_FAST_LROUND 0 # endif #endif /* Compiler can inline fma as a single instruction. 
*/ #ifndef HAVE_FAST_FMA # if defined FP_FAST_FMA || __aarch64__ # define HAVE_FAST_FMA 1 # else # define HAVE_FAST_FMA 0 # endif #endif /* Provide *_finite symbols and some of the glibc hidden symbols so libmathlib can be used with binaries compiled against glibc to interpose math functions with both static and dynamic linking. */ #ifndef USE_GLIBC_ABI # if __GNUC__ # define USE_GLIBC_ABI 1 # else # define USE_GLIBC_ABI 0 # endif #endif /* Optionally used extensions. */ #ifdef __GNUC__ # define HIDDEN __attribute__ ((__visibility__ ("hidden"))) # define NOINLINE __attribute__ ((noinline)) # define UNUSED __attribute__ ((unused)) # define likely(x) __builtin_expect (!!(x), 1) # define unlikely(x) __builtin_expect (x, 0) # if __GNUC__ >= 9 # define attribute_copy(f) __attribute__ ((copy (f))) # else # define attribute_copy(f) # endif # define strong_alias(f, a) \ extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); # define hidden_alias(f, a) \ extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ attribute_copy (f); #else # define HIDDEN # define NOINLINE # define UNUSED # define likely(x) (x) # define unlikely(x) (x) #endif #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ # define TOINT_INTRINSICS 1 /* Round x to nearest int in all rounding modes, ties have to be rounded consistently with converttoint so the results match. If the result would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ static inline double_t roundtoint (double_t x) { return round (x); } /* Convert x to nearest int in all rounding modes, ties have to be rounded consistently with roundtoint. If the result is not representible in an int32_t then the semantics is unspecified. 
*/ static inline int32_t converttoint (double_t x) { # if HAVE_FAST_LROUND return lround (x); # else return (long) round (x); # endif } #endif static inline uint32_t asuint (float f) { union { float f; uint32_t i; } u = {f}; return u.i; } static inline float asfloat (uint32_t i) { union { uint32_t i; float f; } u = {i}; return u.f; } static inline uint64_t asuint64 (double f) { union { double f; uint64_t i; } u = {f}; return u.i; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } #ifndef IEEE_754_2008_SNAN # define IEEE_754_2008_SNAN 1 #endif static inline int issignalingf_inline (float x) { uint32_t ix = asuint (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7fc00000) == 0x7fc00000; return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; } static inline int issignaling_inline (double x) { uint64_t ix = asuint64 (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } #if __aarch64__ && __GNUC__ /* Prevent the optimization of a floating-point expression. */ static inline float opt_barrier_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } static inline double opt_barrier_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } /* Force the evaluation of a floating-point expression for its side-effect. 
*/ static inline void force_eval_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); } static inline void force_eval_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); } #else static inline float opt_barrier_float (float x) { volatile float y = x; return y; } static inline double opt_barrier_double (double x) { volatile double y = x; return y; } static inline void force_eval_float (float x) { volatile float y UNUSED = x; } static inline void force_eval_double (double x) { volatile double y UNUSED = x; } #endif /* Evaluate an expression as the specified type, normally a type cast should be enough, but compilers implement non-standard excess-precision handling, so when FLT_EVAL_METHOD != 0 then these functions may need to be customized. */ static inline float eval_as_float (float x) { return x; } static inline double eval_as_double (double x) { return x; } /* Error handling tail calls for special cases, with a sign argument. The sign of the return value is set if the argument is non-zero. */ /* The result overflows. */ HIDDEN float __math_oflowf (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN float __math_uflowf (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN float __math_may_uflowf (uint32_t); /* Division by zero. */ HIDDEN float __math_divzerof (uint32_t); /* The result overflows. */ HIDDEN double __math_oflow (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN double __math_uflow (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN double __math_may_uflow (uint32_t); /* Division by zero. */ HIDDEN double __math_divzero (uint32_t); /* Error handling using input checking. */ /* Invalid input unless it is a quiet NaN. */ HIDDEN float __math_invalidf (float); /* Invalid input unless it is a quiet NaN. */ HIDDEN double __math_invalid (double); /* Error handling using output checking, only for errno setting. 
*/ /* Check if the result overflowed to infinity. */ HIDDEN double __math_check_oflow (double); /* Check if the result underflowed to 0. */ HIDDEN double __math_check_uflow (double); /* Check if the result overflowed to infinity. */ static inline double check_oflow (double x) { return WANT_ERRNO ? __math_check_oflow (x) : x; } /* Check if the result underflowed to 0. */ static inline double check_uflow (double x) { return WANT_ERRNO ? __math_check_uflow (x) : x; } /* Check if the result overflowed to infinity. */ HIDDEN float __math_check_oflowf (float); /* Check if the result underflowed to 0. */ HIDDEN float __math_check_uflowf (float); /* Check if the result overflowed to infinity. */ static inline float check_oflowf (float x) { return WANT_ERRNO ? __math_check_oflowf (x) : x; } /* Check if the result underflowed to 0. */ static inline float check_uflowf (float x) { return WANT_ERRNO ? __math_check_uflowf (x) : x; } /* Shared between expf, exp2f and powf. */ #define EXP2F_TABLE_BITS 5 #define EXP2F_POLY_ORDER 3 extern const struct exp2f_data { uint64_t tab[1 << EXP2F_TABLE_BITS]; double shift_scaled; double poly[EXP2F_POLY_ORDER]; double shift; double invln2_scaled; double poly_scaled[EXP2F_POLY_ORDER]; } __exp2f_data HIDDEN; #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 extern const struct logf_data { struct { double invc, logc; } tab[1 << LOGF_TABLE_BITS]; double ln2; double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. 
*/
} __logf_data HIDDEN;

#define LOG2F_TABLE_BITS 4
#define LOG2F_POLY_ORDER 4
extern const struct log2f_data
{
  struct
  {
    double invc, logc;
  } tab[1 << LOG2F_TABLE_BITS];
  double poly[LOG2F_POLY_ORDER];
} __log2f_data HIDDEN;

#define POWF_LOG2_TABLE_BITS 4
#define POWF_LOG2_POLY_ORDER 5
#if TOINT_INTRINSICS
/* With fast toint intrinsics the log2 result is kept scaled by N.  */
# define POWF_SCALE_BITS EXP2F_TABLE_BITS
#else
# define POWF_SCALE_BITS 0
#endif
#define POWF_SCALE ((double) (1 << POWF_SCALE_BITS))
extern const struct powf_log2_data
{
  struct
  {
    double invc, logc;
  } tab[1 << POWF_LOG2_TABLE_BITS];
  double poly[POWF_LOG2_POLY_ORDER];
} __powf_log2_data HIDDEN;

#define EXP_TABLE_BITS 7
#define EXP_POLY_ORDER 5
/* Use polynomial that is optimized for a wider input range.  This may be
   needed for good precision in non-nearest rounding and !TOINT_INTRINSICS.  */
#define EXP_POLY_WIDE 0
/* Use close to nearest rounding toint when !TOINT_INTRINSICS.  This may be
   needed for good precision in non-nearest rounding and !EXP_POLY_WIDE.  */
#define EXP_USE_TOINT_NARROW 0
#define EXP2_POLY_ORDER 5
#define EXP2_POLY_WIDE 0
extern const struct exp_data
{
  double invln2N;
  double shift;
  double negln2hiN;
  double negln2loN;
  double poly[4]; /* Last four coefficients.  */
  double exp2_shift;
  double exp2_poly[EXP2_POLY_ORDER];
  uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data HIDDEN;

#define LOG_TABLE_BITS 7
#define LOG_POLY_ORDER 6
#define LOG_POLY1_ORDER 12
extern const struct log_data
{
  double ln2hi;
  double ln2lo;
  double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1.
*/ double poly1[LOG_POLY1_ORDER - 1]; struct {double invc, logc;} tab[1 << LOG_TABLE_BITS]; #if !HAVE_FAST_FMA struct {double chi, clo;} tab2[1 << LOG_TABLE_BITS]; #endif } __log_data HIDDEN; #define LOG2_TABLE_BITS 6 #define LOG2_POLY_ORDER 7 #define LOG2_POLY1_ORDER 11 extern const struct log2_data { double invln2hi; double invln2lo; double poly[LOG2_POLY_ORDER - 1]; double poly1[LOG2_POLY1_ORDER - 1]; struct {double invc, logc;} tab[1 << LOG2_TABLE_BITS]; #if !HAVE_FAST_FMA struct {double chi, clo;} tab2[1 << LOG2_TABLE_BITS]; #endif } __log2_data HIDDEN; #define POW_LOG_TABLE_BITS 7 #define POW_LOG_POLY_ORDER 8 extern const struct pow_log_data { double ln2hi; double ln2lo; double poly[POW_LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ /* Note: the pad field is unused, but allows slightly faster indexing. */ struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS]; } __pow_log_data HIDDEN; extern const struct erff_data { float erff_poly_A[6]; float erff_poly_B[7]; } __erff_data HIDDEN; #define ERF_POLY_A_ORDER 19 #define ERF_POLY_A_NCOEFFS 10 #define ERFC_POLY_C_NCOEFFS 16 #define ERFC_POLY_D_NCOEFFS 18 #define ERFC_POLY_E_NCOEFFS 14 #define ERFC_POLY_F_NCOEFFS 17 extern const struct erf_data { double erf_poly_A[ERF_POLY_A_NCOEFFS]; double erf_ratio_N_A[5]; double erf_ratio_D_A[5]; double erf_ratio_N_B[7]; double erf_ratio_D_B[6]; double erfc_poly_C[ERFC_POLY_C_NCOEFFS]; double erfc_poly_D[ERFC_POLY_D_NCOEFFS]; double erfc_poly_E[ERFC_POLY_E_NCOEFFS]; double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; #endif diff --git a/contrib/arm-optimized-routines/math/math_err.c b/contrib/arm-optimized-routines/math/math_err.c index 1bf9538a1ab1..cfe072809cf4 100644 --- a/contrib/arm-optimized-routines/math/math_err.c +++ b/contrib/arm-optimized-routines/math/math_err.c @@ -1,80 +1,80 @@ /* * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #if WANT_ERRNO #include /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static double with_errno (double y, int e) { errno = e; return y; } #else #define with_errno(x, e) (x) #endif /* NOINLINE reduces code size. */ NOINLINE static double xflow (uint32_t sign, double y) { y = eval_as_double (opt_barrier_double (sign ? -y : y) * y); return with_errno (y, ERANGE); } HIDDEN double __math_uflow (uint32_t sign) { return xflow (sign, 0x1p-767); } #if WANT_ERRNO_UFLOW /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range. */ HIDDEN double __math_may_uflow (uint32_t sign) { return xflow (sign, 0x1.8p-538); } #endif HIDDEN double __math_oflow (uint32_t sign) { return xflow (sign, 0x1p769); } HIDDEN double __math_divzero (uint32_t sign) { double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0; return with_errno (y, ERANGE); } HIDDEN double __math_invalid (double x) { double y = (x - x) / (x - x); return isnan (x) ? y : with_errno (y, EDOM); } /* Check result and set errno if necessary. */ HIDDEN double __math_check_uflow (double y) { return y == 0.0 ? with_errno (y, ERANGE) : y; } HIDDEN double __math_check_oflow (double y) { return isinf (y) ? with_errno (y, ERANGE) : y; } diff --git a/contrib/arm-optimized-routines/math/math_errf.c b/contrib/arm-optimized-routines/math/math_errf.c index d5350b819ab1..4233918b1eae 100644 --- a/contrib/arm-optimized-routines/math/math_errf.c +++ b/contrib/arm-optimized-routines/math/math_errf.c @@ -1,80 +1,80 @@ /* * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #if WANT_ERRNO #include /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static float with_errnof (float y, int e) { errno = e; return y; } #else #define with_errnof(x, e) (x) #endif /* NOINLINE reduces code size. */ NOINLINE static float xflowf (uint32_t sign, float y) { y = eval_as_float (opt_barrier_float (sign ? -y : y) * y); return with_errnof (y, ERANGE); } HIDDEN float __math_uflowf (uint32_t sign) { return xflowf (sign, 0x1p-95f); } #if WANT_ERRNO_UFLOW /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range. */ HIDDEN float __math_may_uflowf (uint32_t sign) { return xflowf (sign, 0x1.4p-75f); } #endif HIDDEN float __math_oflowf (uint32_t sign) { return xflowf (sign, 0x1p97f); } HIDDEN float __math_divzerof (uint32_t sign) { float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f; return with_errnof (y, ERANGE); } HIDDEN float __math_invalidf (float x) { float y = (x - x) / (x - x); return isnan (x) ? y : with_errnof (y, EDOM); } /* Check result and set errno if necessary. */ HIDDEN float __math_check_uflowf (float y) { return y == 0.0f ? with_errnof (y, ERANGE) : y; } HIDDEN float __math_check_oflowf (float y) { return isinf (y) ? with_errnof (y, ERANGE) : y; } diff --git a/contrib/arm-optimized-routines/math/pow.c b/contrib/arm-optimized-routines/math/pow.c index 86842c6abacd..af719fe5ab10 100644 --- a/contrib/arm-optimized-routines/math/pow.c +++ b/contrib/arm-optimized-routines/math/pow.c @@ -1,380 +1,380 @@ /* * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" /* Worst-case error: 0.54 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53) relerr_log: 1.3 * 2^-68 (Relative error of log, 1.5 * 2^-68 without fma) ulperr_exp: 0.509 ULP (ULP error of exp, 0.511 ULP without fma) */ #define T __pow_log_data.tab #define A __pow_log_data.poly #define Ln2hi __pow_log_data.ln2hi #define Ln2lo __pow_log_data.ln2lo #define N (1 << POW_LOG_TABLE_BITS) #define OFF 0x3fe6955500000000 /* Top 12 bits of a double (sign and exponent bits). */ static inline uint32_t top12 (double x) { return asuint64 (x) >> 52; } /* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about additional 15 bits precision. IX is the bit representation of x, but normalized in the subnormal range using the sign bit for the exponent. */ static inline double_t log_inline (uint64_t ix, double_t *tail) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t z, r, y, invc, logc, logctail, kd, hi, t1, t2, lo, lo1, lo2, p; uint64_t iz, tmp; int k, i; /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (52 - POW_LOG_TABLE_BITS)) % N; k = (int64_t) tmp >> 52; /* arithmetic shift */ iz = ix - (tmp & 0xfffULL << 52); z = asdouble (iz); kd = (double_t) k; /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ invc = T[i].invc; logc = T[i].logc; logctail = T[i].logctail; /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ #if HAVE_FAST_FMA r = fma (z, invc, -1.0); #else /* Split z such that rhi, rlo and rhi*rhi are exact and |rlo| <= |r|. 
*/ double_t zhi = asdouble ((iz + (1ULL << 31)) & (-1ULL << 32)); double_t zlo = z - zhi; double_t rhi = zhi * invc - 1.0; double_t rlo = zlo * invc; r = rhi + rlo; #endif /* k*Ln2 + log(c) + r. */ t1 = kd * Ln2hi + logc; t2 = t1 + r; lo1 = kd * Ln2lo + logctail; lo2 = t1 - t2 + r; /* Evaluation is optimized assuming superscalar pipelined execution. */ double_t ar, ar2, ar3, lo3, lo4; ar = A[0] * r; /* A[0] = -0.5. */ ar2 = r * ar; ar3 = r * ar2; /* k*Ln2 + log(c) + r + A[0]*r*r. */ #if HAVE_FAST_FMA hi = t2 + ar2; lo3 = fma (ar, r, -ar2); lo4 = t2 - hi + ar2; #else double_t arhi = A[0] * rhi; double_t arhi2 = rhi * arhi; hi = t2 + arhi2; lo3 = rlo * (ar + arhi); lo4 = t2 - hi + arhi2; #endif /* p = log1p(r) - r - A[0]*r*r. */ #if POW_LOG_POLY_ORDER == 8 p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * A[6])))); #endif lo = lo1 + lo2 + lo3 + lo4 + p; y = hi + lo; *tail = hi - y + lo; return y; } #undef N #undef T #define N (1 << EXP_TABLE_BITS) #define InvLn2N __exp_data.invln2N #define NegLn2hiN __exp_data.negln2hiN #define NegLn2loN __exp_data.negln2loN #define Shift __exp_data.shift #define T __exp_data.tab #define C2 __exp_data.poly[5 - EXP_POLY_ORDER] #define C3 __exp_data.poly[6 - EXP_POLY_ORDER] #define C4 __exp_data.poly[7 - EXP_POLY_ORDER] #define C5 __exp_data.poly[8 - EXP_POLY_ORDER] #define C6 __exp_data.poly[9 - EXP_POLY_ORDER] /* Handle cases that may overflow or underflow when computing the result that is scale*(1+TMP) without intermediate rounding. The bit representation of scale is in SBITS, however it has a computed exponent that may have overflown into the sign bit so that needs to be adjusted before using it as a double. (int32_t)KI is the k used in the argument reduction and exponent adjustment of scale, positive k here means the result may overflow and negative k means the result may underflow. 
*/ static inline double specialcase (double_t tmp, uint64_t sbits, uint64_t ki) { double_t scale, y; if ((ki & 0x80000000) == 0) { /* k > 0, the exponent of scale might have overflowed by <= 460. */ sbits -= 1009ull << 52; scale = asdouble (sbits); y = 0x1p1009 * (scale + scale * tmp); return check_oflow (eval_as_double (y)); } /* k < 0, need special care in the subnormal range. */ sbits += 1022ull << 52; /* Note: sbits is signed scale. */ scale = asdouble (sbits); y = scale + scale * tmp; if (fabs (y) < 1.0) { /* Round y to the right precision before scaling it into the subnormal range to avoid double rounding that can cause 0.5+E/2 ulp error where E is the worst-case ulp error outside the subnormal range. So this is only useful if the goal is better than 1 ulp worst-case error. */ double_t hi, lo, one = 1.0; if (y < 0.0) one = -1.0; lo = scale - y + scale * tmp; hi = one + y; lo = one - hi + y + lo; y = eval_as_double (hi + lo) - one; /* Fix the sign of 0. */ if (y == 0.0) y = asdouble (sbits & 0x8000000000000000); /* The underflow exception needs to be signaled explicitly. */ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); } y = 0x1p-1022 * y; return check_uflow (eval_as_double (y)); } #define SIGN_BIAS (0x800 << EXP_TABLE_BITS) /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. The sign_bias argument is SIGN_BIAS or 0 and sets the sign to -1 or 1. */ static inline double exp_inline (double_t x, double_t xtail, uint32_t sign_bias) { uint32_t abstop; uint64_t ki, idx, top, sbits; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, z, r, r2, scale, tail, tmp; abstop = top12 (x) & 0x7ff; if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) { if (abstop - top12 (0x1p-54) >= 0x80000000) { /* Avoid spurious underflow for tiny x. */ /* Note: 0 is common input. */ double_t one = WANT_ROUNDING ? 1.0 + x : 1.0; return sign_bias ? 
-one : one; } if (abstop >= top12 (1024.0)) { /* Note: inf and nan are already handled. */ if (asuint64 (x) >> 63) return __math_uflow (sign_bias); else return __math_oflow (sign_bias); } /* Large x is special cased below. */ abstop = 0; } /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ z = InvLn2N * x; #if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); #elif EXP_USE_TOINT_NARROW /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ kd = eval_as_double (z + Shift); ki = asuint64 (kd) >> 16; kd = (double_t) (int32_t) ki; #else /* z - kd is in [-1, 1] in non-nearest rounding modes. */ kd = eval_as_double (z + Shift); ki = asuint64 (kd); kd -= Shift; #endif r = x + kd * NegLn2hiN + kd * NegLn2loN; /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ r += xtail; /* 2^(k/N) ~= scale * (1 + tail). */ idx = 2 * (ki % N); top = (ki + sign_bias) << (52 - EXP_TABLE_BITS); tail = asdouble (T[idx]); /* This is only a valid scale when -1023*N < k < 1024*N. */ sbits = T[idx + 1] + top; /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ /* Evaluation is optimized assuming superscalar pipelined execution. */ r2 = r * r; /* Without fma the worst case error is 0.25/N ulp larger. */ /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */ #if EXP_POLY_ORDER == 4 tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4); #elif EXP_POLY_ORDER == 5 tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); #elif EXP_POLY_ORDER == 6 tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); #endif if (unlikely (abstop == 0)) return specialcase (tmp, sbits, ki); scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there is no spurious underflow here even without fma. */ return eval_as_double (scale + scale * tmp); } /* Returns 0 if not int, 1 if odd int, 2 if even int. 
The argument is the bit representation of a non-zero finite floating-point value. */ static inline int checkint (uint64_t iy) { int e = iy >> 52 & 0x7ff; if (e < 0x3ff) return 0; if (e > 0x3ff + 52) return 2; if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) return 0; if (iy & (1ULL << (0x3ff + 52 - e))) return 1; return 2; } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ static inline int zeroinfnan (uint64_t i) { return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; } double pow (double x, double y) { uint32_t sign_bias = 0; uint64_t ix, iy; uint32_t topx, topy; ix = asuint64 (x); iy = asuint64 (y); topx = top12 (x); topy = top12 (y); if (unlikely (topx - 0x001 >= 0x7ff - 0x001 || (topy & 0x7ff) - 0x3be >= 0x43e - 0x3be)) { /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ /* Special cases: (x < 0x1p-126 or inf or nan) or (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ if (unlikely (zeroinfnan (iy))) { if (2 * iy == 0) return issignaling_inline (x) ? x + y : 1.0; if (ix == asuint64 (1.0)) return issignaling_inline (y) ? x + y : 1.0; if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY)) return x + y; if (2 * ix == 2 * asuint64 (1.0)) return 1.0; if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ return y * y; } if (unlikely (zeroinfnan (ix))) { double_t x2 = x * x; if (ix >> 63 && checkint (iy) == 1) { x2 = -x2; sign_bias = 1; } if (WANT_ERRNO && 2 * ix == 0 && iy >> 63) return __math_divzero (sign_bias); /* Without the barrier some versions of clang hoist the 1/x2 and thus division by zero exception can be signaled spuriously. */ return iy >> 63 ? opt_barrier_double (1 / x2) : x2; } /* Here x and y are non-zero finite. */ if (ix >> 63) { /* Finite x < 0. 
*/ int yint = checkint (iy); if (yint == 0) return __math_invalid (x); if (yint == 1) sign_bias = SIGN_BIAS; ix &= 0x7fffffffffffffff; topx &= 0x7ff; } if ((topy & 0x7ff) - 0x3be >= 0x43e - 0x3be) { /* Note: sign_bias == 0 here because y is not odd. */ if (ix == asuint64 (1.0)) return 1.0; if ((topy & 0x7ff) < 0x3be) { /* |y| < 2^-65, x^y ~= 1 + y*log(x). */ if (WANT_ROUNDING) return ix > asuint64 (1.0) ? 1.0 + y : 1.0 - y; else return 1.0; } return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) : __math_uflow (0); } if (topx == 0) { /* Normalize subnormal x so exponent becomes negative. */ /* Without the barrier some versions of clang evalutate the mul unconditionally causing spurious overflow exceptions. */ ix = asuint64 (opt_barrier_double (x) * 0x1p52); ix &= 0x7fffffffffffffff; ix -= 52ULL << 52; } } double_t lo; double_t hi = log_inline (ix, &lo); double_t ehi, elo; #if HAVE_FAST_FMA ehi = y * hi; elo = y * lo + fma (y, hi, -ehi); #else double_t yhi = asdouble (iy & -1ULL << 27); double_t ylo = y - yhi; double_t lhi = asdouble (asuint64 (hi) & -1ULL << 27); double_t llo = hi - lhi + lo; ehi = yhi * lhi; elo = ylo * lhi + y * llo; /* |elo| < |ehi| * 2^-25. */ #endif return exp_inline (ehi, elo, sign_bias); } #if USE_GLIBC_ABI strong_alias (pow, __pow_finite) hidden_alias (pow, __ieee754_pow) # if LDBL_MANT_DIG == 53 long double powl (long double x, long double y) { return pow (x, y); } # endif #endif diff --git a/contrib/arm-optimized-routines/math/pow_log_data.c b/contrib/arm-optimized-routines/math/pow_log_data.c index 45569c5cc064..2a4c250d85c3 100644 --- a/contrib/arm-optimized-routines/math/pow_log_data.c +++ b/contrib/arm-optimized-routines/math/pow_log_data.c @@ -1,184 +1,184 @@ /* * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << POW_LOG_TABLE_BITS) const struct pow_log_data __pow_log_data = { .ln2hi = 0x1.62e42fefa3800p-1, .ln2lo = 0x1.ef35793c76730p-45, .poly = { #if N == 128 && POW_LOG_POLY_ORDER == 8 // relative error: 0x1.11922ap-70 // in -0x1.6bp-8 0x1.6bp-8 // Coefficients are scaled to match the scaling during evaluation. -0x1p-1, 0x1.555555555556p-2 * -2, -0x1.0000000000006p-2 * -2, 0x1.999999959554ep-3 * 4, -0x1.555555529a47ap-3 * 4, 0x1.2495b9b4845e9p-3 * -8, -0x1.0002b8b263fc3p-3 * -8, #endif }, /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + log(z/c) log(z/c) = poly(z/c - 1) where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals and z falls into the ith one, then table entries are computed as tab[i].invc = 1/c tab[i].logc = round(0x1p43*log(c))/0x1p43 tab[i].logctail = (double)(log(c) - logc) where c is chosen near the center of the subinterval such that 1/c has only a few precision bits so z/c - 1 is exactly representible as double: 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2 Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < 0x1p-97, the last few bits of logc are rounded away so k*ln2hi + logc has no rounding error and the interval for z is selected such that near x == 1, where log(x) is tiny, large cancellation error is avoided in logc + poly(z/c - 1). 
*/ .tab = { #if N == 128 #define A(a, b, c) {a, 0, b, c}, A(0x1.6a00000000000p+0, -0x1.62c82f2b9c800p-2, 0x1.ab42428375680p-48) A(0x1.6800000000000p+0, -0x1.5d1bdbf580800p-2, -0x1.ca508d8e0f720p-46) A(0x1.6600000000000p+0, -0x1.5767717455800p-2, -0x1.362a4d5b6506dp-45) A(0x1.6400000000000p+0, -0x1.51aad872df800p-2, -0x1.684e49eb067d5p-49) A(0x1.6200000000000p+0, -0x1.4be5f95777800p-2, -0x1.41b6993293ee0p-47) A(0x1.6000000000000p+0, -0x1.4618bc21c6000p-2, 0x1.3d82f484c84ccp-46) A(0x1.5e00000000000p+0, -0x1.404308686a800p-2, 0x1.c42f3ed820b3ap-50) A(0x1.5c00000000000p+0, -0x1.3a64c55694800p-2, 0x1.0b1c686519460p-45) A(0x1.5a00000000000p+0, -0x1.347dd9a988000p-2, 0x1.5594dd4c58092p-45) A(0x1.5800000000000p+0, -0x1.2e8e2bae12000p-2, 0x1.67b1e99b72bd8p-45) A(0x1.5600000000000p+0, -0x1.2895a13de8800p-2, 0x1.5ca14b6cfb03fp-46) A(0x1.5600000000000p+0, -0x1.2895a13de8800p-2, 0x1.5ca14b6cfb03fp-46) A(0x1.5400000000000p+0, -0x1.22941fbcf7800p-2, -0x1.65a242853da76p-46) A(0x1.5200000000000p+0, -0x1.1c898c1699800p-2, -0x1.fafbc68e75404p-46) A(0x1.5000000000000p+0, -0x1.1675cababa800p-2, 0x1.f1fc63382a8f0p-46) A(0x1.4e00000000000p+0, -0x1.1058bf9ae4800p-2, -0x1.6a8c4fd055a66p-45) A(0x1.4c00000000000p+0, -0x1.0a324e2739000p-2, -0x1.c6bee7ef4030ep-47) A(0x1.4a00000000000p+0, -0x1.0402594b4d000p-2, -0x1.036b89ef42d7fp-48) A(0x1.4a00000000000p+0, -0x1.0402594b4d000p-2, -0x1.036b89ef42d7fp-48) A(0x1.4800000000000p+0, -0x1.fb9186d5e4000p-3, 0x1.d572aab993c87p-47) A(0x1.4600000000000p+0, -0x1.ef0adcbdc6000p-3, 0x1.b26b79c86af24p-45) A(0x1.4400000000000p+0, -0x1.e27076e2af000p-3, -0x1.72f4f543fff10p-46) A(0x1.4200000000000p+0, -0x1.d5c216b4fc000p-3, 0x1.1ba91bbca681bp-45) A(0x1.4000000000000p+0, -0x1.c8ff7c79aa000p-3, 0x1.7794f689f8434p-45) A(0x1.4000000000000p+0, -0x1.c8ff7c79aa000p-3, 0x1.7794f689f8434p-45) A(0x1.3e00000000000p+0, -0x1.bc286742d9000p-3, 0x1.94eb0318bb78fp-46) A(0x1.3c00000000000p+0, -0x1.af3c94e80c000p-3, 0x1.a4e633fcd9066p-52) A(0x1.3a00000000000p+0, 
-0x1.a23bc1fe2b000p-3, -0x1.58c64dc46c1eap-45) A(0x1.3a00000000000p+0, -0x1.a23bc1fe2b000p-3, -0x1.58c64dc46c1eap-45) A(0x1.3800000000000p+0, -0x1.9525a9cf45000p-3, -0x1.ad1d904c1d4e3p-45) A(0x1.3600000000000p+0, -0x1.87fa06520d000p-3, 0x1.bbdbf7fdbfa09p-45) A(0x1.3400000000000p+0, -0x1.7ab890210e000p-3, 0x1.bdb9072534a58p-45) A(0x1.3400000000000p+0, -0x1.7ab890210e000p-3, 0x1.bdb9072534a58p-45) A(0x1.3200000000000p+0, -0x1.6d60fe719d000p-3, -0x1.0e46aa3b2e266p-46) A(0x1.3000000000000p+0, -0x1.5ff3070a79000p-3, -0x1.e9e439f105039p-46) A(0x1.3000000000000p+0, -0x1.5ff3070a79000p-3, -0x1.e9e439f105039p-46) A(0x1.2e00000000000p+0, -0x1.526e5e3a1b000p-3, -0x1.0de8b90075b8fp-45) A(0x1.2c00000000000p+0, -0x1.44d2b6ccb8000p-3, 0x1.70cc16135783cp-46) A(0x1.2c00000000000p+0, -0x1.44d2b6ccb8000p-3, 0x1.70cc16135783cp-46) A(0x1.2a00000000000p+0, -0x1.371fc201e9000p-3, 0x1.178864d27543ap-48) A(0x1.2800000000000p+0, -0x1.29552f81ff000p-3, -0x1.48d301771c408p-45) A(0x1.2600000000000p+0, -0x1.1b72ad52f6000p-3, -0x1.e80a41811a396p-45) A(0x1.2600000000000p+0, -0x1.1b72ad52f6000p-3, -0x1.e80a41811a396p-45) A(0x1.2400000000000p+0, -0x1.0d77e7cd09000p-3, 0x1.a699688e85bf4p-47) A(0x1.2400000000000p+0, -0x1.0d77e7cd09000p-3, 0x1.a699688e85bf4p-47) A(0x1.2200000000000p+0, -0x1.fec9131dbe000p-4, -0x1.575545ca333f2p-45) A(0x1.2000000000000p+0, -0x1.e27076e2b0000p-4, 0x1.a342c2af0003cp-45) A(0x1.2000000000000p+0, -0x1.e27076e2b0000p-4, 0x1.a342c2af0003cp-45) A(0x1.1e00000000000p+0, -0x1.c5e548f5bc000p-4, -0x1.d0c57585fbe06p-46) A(0x1.1c00000000000p+0, -0x1.a926d3a4ae000p-4, 0x1.53935e85baac8p-45) A(0x1.1c00000000000p+0, -0x1.a926d3a4ae000p-4, 0x1.53935e85baac8p-45) A(0x1.1a00000000000p+0, -0x1.8c345d631a000p-4, 0x1.37c294d2f5668p-46) A(0x1.1a00000000000p+0, -0x1.8c345d631a000p-4, 0x1.37c294d2f5668p-46) A(0x1.1800000000000p+0, -0x1.6f0d28ae56000p-4, -0x1.69737c93373dap-45) A(0x1.1600000000000p+0, -0x1.51b073f062000p-4, 0x1.f025b61c65e57p-46) A(0x1.1600000000000p+0, -0x1.51b073f062000p-4, 
0x1.f025b61c65e57p-46) A(0x1.1400000000000p+0, -0x1.341d7961be000p-4, 0x1.c5edaccf913dfp-45) A(0x1.1400000000000p+0, -0x1.341d7961be000p-4, 0x1.c5edaccf913dfp-45) A(0x1.1200000000000p+0, -0x1.16536eea38000p-4, 0x1.47c5e768fa309p-46) A(0x1.1000000000000p+0, -0x1.f0a30c0118000p-5, 0x1.d599e83368e91p-45) A(0x1.1000000000000p+0, -0x1.f0a30c0118000p-5, 0x1.d599e83368e91p-45) A(0x1.0e00000000000p+0, -0x1.b42dd71198000p-5, 0x1.c827ae5d6704cp-46) A(0x1.0e00000000000p+0, -0x1.b42dd71198000p-5, 0x1.c827ae5d6704cp-46) A(0x1.0c00000000000p+0, -0x1.77458f632c000p-5, -0x1.cfc4634f2a1eep-45) A(0x1.0c00000000000p+0, -0x1.77458f632c000p-5, -0x1.cfc4634f2a1eep-45) A(0x1.0a00000000000p+0, -0x1.39e87b9fec000p-5, 0x1.502b7f526feaap-48) A(0x1.0a00000000000p+0, -0x1.39e87b9fec000p-5, 0x1.502b7f526feaap-48) A(0x1.0800000000000p+0, -0x1.f829b0e780000p-6, -0x1.980267c7e09e4p-45) A(0x1.0800000000000p+0, -0x1.f829b0e780000p-6, -0x1.980267c7e09e4p-45) A(0x1.0600000000000p+0, -0x1.7b91b07d58000p-6, -0x1.88d5493faa639p-45) A(0x1.0400000000000p+0, -0x1.fc0a8b0fc0000p-7, -0x1.f1e7cf6d3a69cp-50) A(0x1.0400000000000p+0, -0x1.fc0a8b0fc0000p-7, -0x1.f1e7cf6d3a69cp-50) A(0x1.0200000000000p+0, -0x1.fe02a6b100000p-8, -0x1.9e23f0dda40e4p-46) A(0x1.0200000000000p+0, -0x1.fe02a6b100000p-8, -0x1.9e23f0dda40e4p-46) A(0x1.0000000000000p+0, 0x0.0000000000000p+0, 0x0.0000000000000p+0) A(0x1.0000000000000p+0, 0x0.0000000000000p+0, 0x0.0000000000000p+0) A(0x1.fc00000000000p-1, 0x1.0101575890000p-7, -0x1.0c76b999d2be8p-46) A(0x1.f800000000000p-1, 0x1.0205658938000p-6, -0x1.3dc5b06e2f7d2p-45) A(0x1.f400000000000p-1, 0x1.8492528c90000p-6, -0x1.aa0ba325a0c34p-45) A(0x1.f000000000000p-1, 0x1.0415d89e74000p-5, 0x1.111c05cf1d753p-47) A(0x1.ec00000000000p-1, 0x1.466aed42e0000p-5, -0x1.c167375bdfd28p-45) A(0x1.e800000000000p-1, 0x1.894aa149fc000p-5, -0x1.97995d05a267dp-46) A(0x1.e400000000000p-1, 0x1.ccb73cdddc000p-5, -0x1.a68f247d82807p-46) A(0x1.e200000000000p-1, 0x1.eea31c006c000p-5, -0x1.e113e4fc93b7bp-47) 
A(0x1.de00000000000p-1, 0x1.1973bd1466000p-4, -0x1.5325d560d9e9bp-45) A(0x1.da00000000000p-1, 0x1.3bdf5a7d1e000p-4, 0x1.cc85ea5db4ed7p-45) A(0x1.d600000000000p-1, 0x1.5e95a4d97a000p-4, -0x1.c69063c5d1d1ep-45) A(0x1.d400000000000p-1, 0x1.700d30aeac000p-4, 0x1.c1e8da99ded32p-49) A(0x1.d000000000000p-1, 0x1.9335e5d594000p-4, 0x1.3115c3abd47dap-45) A(0x1.cc00000000000p-1, 0x1.b6ac88dad6000p-4, -0x1.390802bf768e5p-46) A(0x1.ca00000000000p-1, 0x1.c885801bc4000p-4, 0x1.646d1c65aacd3p-45) A(0x1.c600000000000p-1, 0x1.ec739830a2000p-4, -0x1.dc068afe645e0p-45) A(0x1.c400000000000p-1, 0x1.fe89139dbe000p-4, -0x1.534d64fa10afdp-45) A(0x1.c000000000000p-1, 0x1.1178e8227e000p-3, 0x1.1ef78ce2d07f2p-45) A(0x1.be00000000000p-1, 0x1.1aa2b7e23f000p-3, 0x1.ca78e44389934p-45) A(0x1.ba00000000000p-1, 0x1.2d1610c868000p-3, 0x1.39d6ccb81b4a1p-47) A(0x1.b800000000000p-1, 0x1.365fcb0159000p-3, 0x1.62fa8234b7289p-51) A(0x1.b400000000000p-1, 0x1.4913d8333b000p-3, 0x1.5837954fdb678p-45) A(0x1.b200000000000p-1, 0x1.527e5e4a1b000p-3, 0x1.633e8e5697dc7p-45) A(0x1.ae00000000000p-1, 0x1.6574ebe8c1000p-3, 0x1.9cf8b2c3c2e78p-46) A(0x1.ac00000000000p-1, 0x1.6f0128b757000p-3, -0x1.5118de59c21e1p-45) A(0x1.aa00000000000p-1, 0x1.7898d85445000p-3, -0x1.c661070914305p-46) A(0x1.a600000000000p-1, 0x1.8beafeb390000p-3, -0x1.73d54aae92cd1p-47) A(0x1.a400000000000p-1, 0x1.95a5adcf70000p-3, 0x1.7f22858a0ff6fp-47) A(0x1.a000000000000p-1, 0x1.a93ed3c8ae000p-3, -0x1.8724350562169p-45) A(0x1.9e00000000000p-1, 0x1.b31d8575bd000p-3, -0x1.c358d4eace1aap-47) A(0x1.9c00000000000p-1, 0x1.bd087383be000p-3, -0x1.d4bc4595412b6p-45) A(0x1.9a00000000000p-1, 0x1.c6ffbc6f01000p-3, -0x1.1ec72c5962bd2p-48) A(0x1.9600000000000p-1, 0x1.db13db0d49000p-3, -0x1.aff2af715b035p-45) A(0x1.9400000000000p-1, 0x1.e530effe71000p-3, 0x1.212276041f430p-51) A(0x1.9200000000000p-1, 0x1.ef5ade4dd0000p-3, -0x1.a211565bb8e11p-51) A(0x1.9000000000000p-1, 0x1.f991c6cb3b000p-3, 0x1.bcbecca0cdf30p-46) A(0x1.8c00000000000p-1, 0x1.07138604d5800p-2, 
0x1.89cdb16ed4e91p-48) A(0x1.8a00000000000p-1, 0x1.0c42d67616000p-2, 0x1.7188b163ceae9p-45) A(0x1.8800000000000p-1, 0x1.1178e8227e800p-2, -0x1.c210e63a5f01cp-45) A(0x1.8600000000000p-1, 0x1.16b5ccbacf800p-2, 0x1.b9acdf7a51681p-45) A(0x1.8400000000000p-1, 0x1.1bf99635a6800p-2, 0x1.ca6ed5147bdb7p-45) A(0x1.8200000000000p-1, 0x1.214456d0eb800p-2, 0x1.a87deba46baeap-47) A(0x1.7e00000000000p-1, 0x1.2bef07cdc9000p-2, 0x1.a9cfa4a5004f4p-45) A(0x1.7c00000000000p-1, 0x1.314f1e1d36000p-2, -0x1.8e27ad3213cb8p-45) A(0x1.7a00000000000p-1, 0x1.36b6776be1000p-2, 0x1.16ecdb0f177c8p-46) A(0x1.7800000000000p-1, 0x1.3c25277333000p-2, 0x1.83b54b606bd5cp-46) A(0x1.7600000000000p-1, 0x1.419b423d5e800p-2, 0x1.8e436ec90e09dp-47) A(0x1.7400000000000p-1, 0x1.4718dc271c800p-2, -0x1.f27ce0967d675p-45) A(0x1.7200000000000p-1, 0x1.4c9e09e173000p-2, -0x1.e20891b0ad8a4p-45) A(0x1.7000000000000p-1, 0x1.522ae0738a000p-2, 0x1.ebe708164c759p-45) A(0x1.6e00000000000p-1, 0x1.57bf753c8d000p-2, 0x1.fadedee5d40efp-46) A(0x1.6c00000000000p-1, 0x1.5d5bddf596000p-2, -0x1.a0b2a08a465dcp-47) #endif }, }; diff --git a/contrib/arm-optimized-routines/math/powf.c b/contrib/arm-optimized-routines/math/powf.c index 6ba45d3852a5..05c80bb2eb67 100644 --- a/contrib/arm-optimized-routines/math/powf.c +++ b/contrib/arm-optimized-routines/math/powf.c @@ -1,221 +1,221 @@ /* * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* POWF_LOG2_POLY_ORDER = 5 EXP2F_TABLE_BITS = 5 ULP error: 0.82 (~ 0.5 + relerr*2^24) relerr: 1.27 * 2^-26 (Relative error ~= 128*Ln2*relerr_log2 + relerr_exp2) relerr_log2: 1.83 * 2^-33 (Relative error of logx.) relerr_exp2: 1.69 * 2^-34 (Relative error of exp2(ylogx).) 
*/ #define N (1 << POWF_LOG2_TABLE_BITS) #define T __powf_log2_data.tab #define A __powf_log2_data.poly #define OFF 0x3f330000 /* Subnormal input is normalized so ix has negative biased exponent. Output is multiplied by N (POWF_SCALE) if TOINT_INTRINICS is set. */ static inline double_t log2_inline (uint32_t ix) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t z, r, r2, r4, p, q, y, y0, invc, logc; uint32_t iz, top, tmp; int k, i; /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (23 - POWF_LOG2_TABLE_BITS)) % N; top = tmp & 0xff800000; iz = ix - top; k = (int32_t) top >> (23 - POWF_SCALE_BITS); /* arithmetic shift */ invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ r = z * invc - 1; y0 = logc + (double_t) k; /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ r2 = r * r; y = A[0] * r + A[1]; p = A[2] * r + A[3]; r4 = r2 * r2; q = A[4] * r + y0; q = p * r2 + q; y = y * r4 + q; return y; } #undef N #undef T #define N (1 << EXP2F_TABLE_BITS) #define T __exp2f_data.tab #define SIGN_BIAS (1 << (EXP2F_TABLE_BITS + 11)) /* The output of log2 and thus the input of exp2 is either scaled by N (in case of fast toint intrinsics) or not. The unscaled xd must be in [-1021,1023], sign_bias sets the sign of the result. */ static inline float exp2_inline (double_t xd, uint32_t sign_bias) { uint64_t ki, ski, t; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. 
*/ double_t kd, z, r, r2, y, s; #if TOINT_INTRINSICS # define C __exp2f_data.poly_scaled /* N*x = k + r with r in [-1/2, 1/2] */ kd = roundtoint (xd); /* k */ ki = converttoint (xd); #else # define C __exp2f_data.poly # define SHIFT __exp2f_data.shift_scaled /* x = k/N + r with r in [-1/(2N), 1/(2N)] */ kd = eval_as_double (xd + SHIFT); ki = asuint64 (kd); kd -= SHIFT; /* k/N */ #endif r = xd - kd; /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ t = T[ki % N]; ski = ki + sign_bias; t += ski << (52 - EXP2F_TABLE_BITS); s = asdouble (t); z = C[0] * r + C[1]; r2 = r * r; y = C[2] * r + 1; y = z * r2 + y; y = y * s; return eval_as_float (y); } /* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is the bit representation of a non-zero finite floating-point value. */ static inline int checkint (uint32_t iy) { int e = iy >> 23 & 0xff; if (e < 0x7f) return 0; if (e > 0x7f + 23) return 2; if (iy & ((1 << (0x7f + 23 - e)) - 1)) return 0; if (iy & (1 << (0x7f + 23 - e))) return 1; return 2; } static inline int zeroinfnan (uint32_t ix) { return 2 * ix - 1 >= 2u * 0x7f800000 - 1; } float powf (float x, float y) { uint32_t sign_bias = 0; uint32_t ix, iy; ix = asuint (x); iy = asuint (y); if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000 || zeroinfnan (iy))) { /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */ if (unlikely (zeroinfnan (iy))) { if (2 * iy == 0) return issignalingf_inline (x) ? x + y : 1.0f; if (ix == 0x3f800000) return issignalingf_inline (y) ? x + y : 1.0f; if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000) return x + y; if (2 * ix == 2 * 0x3f800000) return 1.0f; if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000)) return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf. 
*/ return y * y; } if (unlikely (zeroinfnan (ix))) { float_t x2 = x * x; if (ix & 0x80000000 && checkint (iy) == 1) { x2 = -x2; sign_bias = 1; } #if WANT_ERRNO if (2 * ix == 0 && iy & 0x80000000) return __math_divzerof (sign_bias); #endif /* Without the barrier some versions of clang hoist the 1/x2 and thus division by zero exception can be signaled spuriously. */ return iy & 0x80000000 ? opt_barrier_float (1 / x2) : x2; } /* x and y are non-zero finite. */ if (ix & 0x80000000) { /* Finite x < 0. */ int yint = checkint (iy); if (yint == 0) return __math_invalidf (x); if (yint == 1) sign_bias = SIGN_BIAS; ix &= 0x7fffffff; } if (ix < 0x00800000) { /* Normalize subnormal x so exponent becomes negative. */ ix = asuint (x * 0x1p23f); ix &= 0x7fffffff; ix -= 23 << 23; } } double_t logx = log2_inline (ix); double_t ylogx = y * logx; /* Note: cannot overflow, y is single prec. */ if (unlikely ((asuint64 (ylogx) >> 47 & 0xffff) >= asuint64 (126.0 * POWF_SCALE) >> 47)) { /* |y*log(x)| >= 126. */ if (ylogx > 0x1.fffffffd1d571p+6 * POWF_SCALE) /* |x^y| > 0x1.ffffffp127. */ return __math_oflowf (sign_bias); if (WANT_ROUNDING && WANT_ERRNO && ylogx > 0x1.fffffffa3aae2p+6 * POWF_SCALE) /* |x^y| > 0x1.fffffep127, check if we round away from 0. 
*/ if ((!sign_bias && eval_as_float (1.0f + opt_barrier_float (0x1p-25f)) != 1.0f) || (sign_bias && eval_as_float (-1.0f - opt_barrier_float (0x1p-25f)) != -1.0f)) return __math_oflowf (sign_bias); if (ylogx <= -150.0 * POWF_SCALE) return __math_uflowf (sign_bias); #if WANT_ERRNO_UFLOW if (ylogx < -149.0 * POWF_SCALE) return __math_may_uflowf (sign_bias); #endif } return exp2_inline (ylogx, sign_bias); } #if USE_GLIBC_ABI strong_alias (powf, __powf_finite) hidden_alias (powf, __ieee754_powf) #endif diff --git a/contrib/arm-optimized-routines/math/powf_log2_data.c b/contrib/arm-optimized-routines/math/powf_log2_data.c index 97e0d98cdbab..243836a549fd 100644 --- a/contrib/arm-optimized-routines/math/powf_log2_data.c +++ b/contrib/arm-optimized-routines/math/powf_log2_data.c @@ -1,34 +1,34 @@ /* * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" const struct powf_log2_data __powf_log2_data = { .tab = { { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * POWF_SCALE }, { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * POWF_SCALE }, { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * POWF_SCALE }, { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * POWF_SCALE }, { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * POWF_SCALE }, { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * POWF_SCALE }, { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * POWF_SCALE }, { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * POWF_SCALE }, { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * POWF_SCALE }, { 0x1p+0, 0x0p+0 * POWF_SCALE }, { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * POWF_SCALE }, { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * POWF_SCALE }, { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * POWF_SCALE }, { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * POWF_SCALE }, { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * POWF_SCALE }, { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * 
POWF_SCALE }, }, .poly = { 0x1.27616c9496e0bp-2 * POWF_SCALE, -0x1.71969a075c67ap-2 * POWF_SCALE, 0x1.ec70a6ca7baddp-2 * POWF_SCALE, -0x1.7154748bef6c8p-1 * POWF_SCALE, 0x1.71547652ab82bp0 * POWF_SCALE, } }; diff --git a/contrib/arm-optimized-routines/math/s_cos.c b/contrib/arm-optimized-routines/math/s_cos.c index 53a95b0adfde..e66d563d15b5 100644 --- a/contrib/arm-optimized-routines/math/s_cos.c +++ b/contrib/arm-optimized-routines/math/s_cos.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_cos.c" diff --git a/contrib/arm-optimized-routines/math/s_cosf.c b/contrib/arm-optimized-routines/math/s_cosf.c index 914c02eba651..f615d260b39b 100644 --- a/contrib/arm-optimized-routines/math/s_cosf.c +++ b/contrib/arm-optimized-routines/math/s_cosf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_cosf.c" diff --git a/contrib/arm-optimized-routines/math/s_exp.c b/contrib/arm-optimized-routines/math/s_exp.c index ac7246b2c100..5da0099e3c65 100644 --- a/contrib/arm-optimized-routines/math/s_exp.c +++ b/contrib/arm-optimized-routines/math/s_exp.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_exp.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f.c b/contrib/arm-optimized-routines/math/s_exp2f.c index df7dfd680ff4..dcbfea9e1e79 100644 --- a/contrib/arm-optimized-routines/math/s_exp2f.c +++ b/contrib/arm-optimized-routines/math/s_exp2f.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_exp2f.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f_1u.c b/contrib/arm-optimized-routines/math/s_exp2f_1u.c index 5e3852b41d83..bf387e44cfb2 100644 --- a/contrib/arm-optimized-routines/math/s_exp2f_1u.c +++ b/contrib/arm-optimized-routines/math/s_exp2f_1u.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_exp2f_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_expf.c b/contrib/arm-optimized-routines/math/s_expf.c index 3492c460733d..dacda7fb4fd5 100644 --- a/contrib/arm-optimized-routines/math/s_expf.c +++ b/contrib/arm-optimized-routines/math/s_expf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_expf.c" diff --git a/contrib/arm-optimized-routines/math/s_expf_1u.c b/contrib/arm-optimized-routines/math/s_expf_1u.c index eb7bbcba5566..00096449f7a5 100644 --- a/contrib/arm-optimized-routines/math/s_expf_1u.c +++ b/contrib/arm-optimized-routines/math/s_expf_1u.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_expf_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_log.c b/contrib/arm-optimized-routines/math/s_log.c index 23289cf948ec..27d2eb290f56 100644 --- a/contrib/arm-optimized-routines/math/s_log.c +++ b/contrib/arm-optimized-routines/math/s_log.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_log.c" diff --git a/contrib/arm-optimized-routines/math/s_logf.c b/contrib/arm-optimized-routines/math/s_logf.c index 9399350fc1ee..7d98b2ba15c4 100644 --- a/contrib/arm-optimized-routines/math/s_logf.c +++ b/contrib/arm-optimized-routines/math/s_logf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_logf.c" diff --git a/contrib/arm-optimized-routines/math/s_pow.c b/contrib/arm-optimized-routines/math/s_pow.c index 2e34c9f896d6..6eca2b2b17f1 100644 --- a/contrib/arm-optimized-routines/math/s_pow.c +++ b/contrib/arm-optimized-routines/math/s_pow.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_pow.c" diff --git a/contrib/arm-optimized-routines/math/s_powf.c b/contrib/arm-optimized-routines/math/s_powf.c index 6d91a4a72b37..1d55d90df7b2 100644 --- a/contrib/arm-optimized-routines/math/s_powf.c +++ b/contrib/arm-optimized-routines/math/s_powf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_powf.c" diff --git a/contrib/arm-optimized-routines/math/s_sin.c b/contrib/arm-optimized-routines/math/s_sin.c index 06982c2018c6..0c6171259c0c 100644 --- a/contrib/arm-optimized-routines/math/s_sin.c +++ b/contrib/arm-optimized-routines/math/s_sin.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_sin.c" diff --git a/contrib/arm-optimized-routines/math/s_sinf.c b/contrib/arm-optimized-routines/math/s_sinf.c index 68ca90853736..3aae61149618 100644 --- a/contrib/arm-optimized-routines/math/s_sinf.c +++ b/contrib/arm-optimized-routines/math/s_sinf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define SCALAR 1 #include "v_sinf.c" diff --git a/contrib/arm-optimized-routines/math/sincosf.c b/contrib/arm-optimized-routines/math/sincosf.c index 6fb299d10309..446f21d60faf 100644 --- a/contrib/arm-optimized-routines/math/sincosf.c +++ b/contrib/arm-optimized-routines/math/sincosf.c @@ -1,79 +1,79 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" #include "sincosf.h" /* Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23. A single-step range reduction is used for small values. Large inputs have their range reduced using fast integer arithmetic. */ void sincosf (float y, float *sinp, float *cosp) { double x = y; double s; int n; const sincos_t *p = &__sincosf_table[0]; if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; if (unlikely (abstop12 (y) < abstop12 (0x1p-12f))) { if (unlikely (abstop12 (y) < abstop12 (0x1p-126f))) /* Force underflow for tiny y. */ force_eval_float (x2); *sinp = y; *cosp = 1.0f; return; } sincosf_poly (x, x2, p, 0, sinp, cosp); } else if (abstop12 (y) < abstop12 (120.0f)) { x = reduce_fast (x, p, &n); /* Setup the signs for sin and cos. 
*/ s = p->sign[n & 3]; if (n & 2) p = &__sincosf_table[1]; sincosf_poly (x * s, x * x, p, n, sinp, cosp); } else if (likely (abstop12 (y) < abstop12 (INFINITY))) { uint32_t xi = asuint (y); int sign = xi >> 31; x = reduce_large (xi, &n); /* Setup signs for sin and cos - include original sign. */ s = p->sign[(n + sign) & 3]; if ((n + sign) & 2) p = &__sincosf_table[1]; sincosf_poly (x * s, x * x, p, n, sinp, cosp); } else { /* Return NaN if Inf or NaN for both sin and cos. */ *sinp = *cosp = y - y; #if WANT_ERRNO /* Needed to set errno for +-Inf, the add is a hack to work around a gcc register allocation issue: just passing y affects code generation in the fast path. */ __math_invalidf (y + y); #endif } } diff --git a/contrib/arm-optimized-routines/math/sincosf.h b/contrib/arm-optimized-routines/math/sincosf.h index 59124699f552..ec23ed7aeb26 100644 --- a/contrib/arm-optimized-routines/math/sincosf.h +++ b/contrib/arm-optimized-routines/math/sincosf.h @@ -1,153 +1,153 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ static const float pio4f = 0x1.921FB6p-1f; /* The constants and polynomials for sine and cosine. */ typedef struct { double sign[4]; /* Sign of sine in quadrants 0..3. */ double hpi_inv; /* 2 / PI ( * 2^24 if !TOINT_INTRINSICS). */ double hpi; /* PI / 2. */ double c0, c1, c2, c3, c4; /* Cosine polynomial. */ double s1, s2, s3; /* Sine polynomial. */ } sincos_t; /* Polynomial data (the cosine polynomial is negated in the 2nd entry). */ extern const sincos_t __sincosf_table[2] HIDDEN; /* Table with 4/PI to 192 bit precision. */ extern const uint32_t __inv_pio4[] HIDDEN; /* Top 12 bits of the float representation with the sign bit cleared. 
*/ static inline uint32_t abstop12 (float x) { return (asuint (x) >> 20) & 0x7ff; } /* Compute the sine and cosine of inputs X and X2 (X squared), using the polynomial P and store the results in SINP and COSP. N is the quadrant, if odd the cosine and sine polynomials are swapped. */ static inline void sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp, float *cosp) { double x3, x4, x5, x6, s, c, c1, c2, s1; x4 = x2 * x2; x3 = x2 * x; c2 = p->c3 + x2 * p->c4; s1 = p->s2 + x2 * p->s3; /* Swap sin/cos result based on quadrant. */ float *tmp = (n & 1 ? cosp : sinp); cosp = (n & 1 ? sinp : cosp); sinp = tmp; c1 = p->c0 + x2 * p->c1; x5 = x3 * x2; x6 = x4 * x2; s = x + x3 * p->s1; c = c1 + x4 * p->c2; *sinp = s + x5 * s1; *cosp = c + x6 * c2; } /* Return the sine of inputs X and X2 (X squared) using the polynomial P. N is the quadrant, and if odd the cosine polynomial is used. */ static inline float sinf_poly (double x, double x2, const sincos_t *p, int n) { double x3, x4, x6, x7, s, c, c1, c2, s1; if ((n & 1) == 0) { x3 = x * x2; s1 = p->s2 + x2 * p->s3; x7 = x3 * x2; s = x + x3 * p->s1; return s + x7 * s1; } else { x4 = x2 * x2; c2 = p->c3 + x2 * p->c4; c1 = p->c0 + x2 * p->c1; x6 = x4 * x2; c = c1 + x4 * p->c2; return c + x6 * c2; } } /* Fast range reduction using single multiply-subtract. Return the modulo of X as a value between -PI/4 and PI/4 and store the quadrant in NP. The values for PI/2 and 2/PI are accessed via P. Since PI/2 as a double is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4, the result is accurate for |X| <= 120.0. */ static inline double reduce_fast (double x, const sincos_t *p, int *np) { double r; #if TOINT_INTRINSICS /* Use fast round and lround instructions when available. */ r = x * p->hpi_inv; *np = converttoint (r); return x - roundtoint (r) * p->hpi; #else /* Use scaled float to int conversion with explicit rounding. hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31. 
This avoids inaccuracies introduced by truncating negative values. */ r = x * p->hpi_inv; int n = ((int32_t)r + 0x800000) >> 24; *np = n; return x - n * p->hpi; #endif } /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit multiply computes the exact 2.62-bit fixed-point modulo. Since the result can have at most 29 leading zeros after the binary point, the double precision result is accurate to 33 bits. */ static inline double reduce_large (uint32_t xi, int *np) { const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; int shift = (xi >> 23) & 7; uint64_t n, res0, res1, res2; xi = (xi & 0xffffff) | 0x800000; xi <<= shift; res0 = xi * arr[0]; res1 = (uint64_t)xi * arr[4]; res2 = (uint64_t)xi * arr[8]; res0 = (res2 >> 32) | (res0 << 32); res0 += res1; n = (res0 + (1ULL << 61)) >> 62; res0 -= n << 62; double x = (int64_t)res0; *np = n; return x * pi63; } diff --git a/contrib/arm-optimized-routines/math/sincosf_data.c b/contrib/arm-optimized-routines/math/sincosf_data.c index ab4ac4710fef..22525290ab08 100644 --- a/contrib/arm-optimized-routines/math/sincosf_data.c +++ b/contrib/arm-optimized-routines/math/sincosf_data.c @@ -1,63 +1,63 @@ /* * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" #include "sincosf.h" /* The constants and polynomials for sine and cosine. The 2nd entry computes -cos (x) rather than cos (x) to get negation for free. 
*/ const sincos_t __sincosf_table[2] = { { { 1.0, -1.0, -1.0, 1.0 }, #if TOINT_INTRINSICS 0x1.45F306DC9C883p-1, #else 0x1.45F306DC9C883p+23, #endif 0x1.921FB54442D18p0, 0x1p0, -0x1.ffffffd0c621cp-2, 0x1.55553e1068f19p-5, -0x1.6c087e89a359dp-10, 0x1.99343027bf8c3p-16, -0x1.555545995a603p-3, 0x1.1107605230bc4p-7, -0x1.994eb3774cf24p-13 }, { { 1.0, -1.0, -1.0, 1.0 }, #if TOINT_INTRINSICS 0x1.45F306DC9C883p-1, #else 0x1.45F306DC9C883p+23, #endif 0x1.921FB54442D18p0, -0x1p0, 0x1.ffffffd0c621cp-2, -0x1.55553e1068f19p-5, 0x1.6c087e89a359dp-10, -0x1.99343027bf8c3p-16, -0x1.555545995a603p-3, 0x1.1107605230bc4p-7, -0x1.994eb3774cf24p-13 } }; /* Table with 4/PI to 192 bit precision. To avoid unaligned accesses only 8 new bits are added per entry, making the table 4 times larger. */ const uint32_t __inv_pio4[24] = { 0xa2, 0xa2f9, 0xa2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44, 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62, 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041 }; diff --git a/contrib/arm-optimized-routines/math/sinf.c b/contrib/arm-optimized-routines/math/sinf.c index 4d2cbd6fae72..8dd8ae458794 100644 --- a/contrib/arm-optimized-routines/math/sinf.c +++ b/contrib/arm-optimized-routines/math/sinf.c @@ -1,67 +1,67 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include "math_config.h" #include "sincosf.h" /* Fast sinf implementation. Worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23. A single-step range reduction is used for small values. Large inputs have their range reduced using fast integer arithmetic. 
*/ float sinf (float y) { double x = y; double s; int n; const sincos_t *p = &__sincosf_table[0]; if (abstop12 (y) < abstop12 (pio4f)) { s = x * x; if (unlikely (abstop12 (y) < abstop12 (0x1p-12f))) { if (unlikely (abstop12 (y) < abstop12 (0x1p-126f))) /* Force underflow for tiny y. */ force_eval_float (s); return y; } return sinf_poly (x, s, p, 0); } else if (likely (abstop12 (y) < abstop12 (120.0f))) { x = reduce_fast (x, p, &n); /* Setup the signs for sin and cos. */ s = p->sign[n & 3]; if (n & 2) p = &__sincosf_table[1]; return sinf_poly (x * s, x * x, p, n); } else if (abstop12 (y) < abstop12 (INFINITY)) { uint32_t xi = asuint (y); int sign = xi >> 31; x = reduce_large (xi, &n); /* Setup signs for sin and cos - include original sign. */ s = p->sign[(n + sign) & 3]; if ((n + sign) & 2) p = &__sincosf_table[1]; return sinf_poly (x * s, x * x, p, n); } else return __math_invalidf (y); } diff --git a/contrib/arm-optimized-routines/math/test/mathbench.c b/contrib/arm-optimized-routines/math/test/mathbench.c index 0c17826e5296..6e18e36fbcb2 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench.c +++ b/contrib/arm-optimized-routines/math/test/mathbench.c @@ -1,773 +1,702 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #undef _GNU_SOURCE #define _GNU_SOURCE 1 #include #include #include #include #include #include #include "mathlib.h" #ifndef WANT_VMATH /* Enable the build of vector math code. */ # define WANT_VMATH 1 #endif /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. */ #define N 8000 /* Iterations over the array. 
*/ #define ITER 125 static double *Trace; static size_t trace_size; static double A[N]; static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; #if __aarch64__ && WANT_VMATH typedef __f64x2_t v_double; #define v_double_len() 2 static inline v_double v_double_load (const double *p) { return (v_double){p[0], p[1]}; } static inline v_double v_double_dup (double x) { return (v_double){x, x}; } typedef __f32x4_t v_float; #define v_float_len() 4 static inline v_float v_float_load (const float *p) { return (v_float){p[0], p[1], p[2], p[3]}; } static inline v_float v_float_dup (float x) { return (v_float){x, x, x, x}; } +#if WANT_SVE_MATH +#include +typedef svbool_t sv_bool; +typedef svfloat64_t sv_double; + +#define sv_double_len() svcntd() + +static inline sv_double +sv_double_load (const double *p) +{ + svbool_t pg = svptrue_b64(); + return svld1(pg, p); +} + +static inline sv_double +sv_double_dup (double x) +{ + return svdup_n_f64(x); +} + +typedef svfloat32_t sv_float; + +#define sv_float_len() svcntw() + +static inline sv_float +sv_float_load (const float *p) +{ + svbool_t pg = svptrue_b32(); + return svld1(pg, p); +} + +static inline sv_float +sv_float_dup (float x) +{ + return svdup_n_f32(x); +} +#endif #else /* dummy definitions to make things compile. 
*/ typedef double v_double; typedef float v_float; #define v_double_len(x) 1 #define v_double_load(x) (x)[0] #define v_double_dup(x) (x) #define v_float_len(x) 1 #define v_float_load(x) (x)[0] #define v_float_dup(x) (x) #endif static double dummy (double x) { return x; } static float dummyf (float x) { return x; } - #if WANT_VMATH #if __aarch64__ static v_double __v_dummy (v_double x) { return x; } static v_float __v_dummyf (v_float x) { return x; } #ifdef __vpcs __vpcs static v_double __vn_dummy (v_double x) { return x; } __vpcs static v_float __vn_dummyf (v_float x) { return x; } - -__vpcs static v_float -xy__vn_powf (v_float x) -{ - return __vn_powf (x, x); -} - -__vpcs static v_float -xy_Z_powf (v_float x) +#endif +#if WANT_SVE_MATH +static sv_double +__sv_dummy (sv_double x, sv_bool pg) { - return _ZGVnN4vv_powf (x, x); + return x; } -__vpcs static v_double -xy__vn_pow (v_double x) +static sv_float +__sv_dummyf (sv_float x, sv_bool pg) { - return __vn_pow (x, x); + return x; } -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} #endif - -static v_float -xy__v_powf (v_float x) -{ - return __v_powf (x, x); -} - -static v_double -xy__v_pow (v_double x) -{ - return __v_pow (x, x); -} #endif - -static float -xy__s_powf (float x) -{ - return __s_powf (x, x); -} - -static double -xy__s_pow (double x) -{ - return __s_pow (x, x); -} #endif -static double -xypow (double x) -{ - return pow (x, x); -} - -static float -xypowf (float x) -{ - return powf (x, x); -} - -static double -xpow (double x) -{ - return pow (x, 23.4); -} - -static float -xpowf (float x) -{ - return powf (x, 23.4f); -} - -static double -ypow (double x) -{ - return pow (2.34, x); -} - -static float -ypowf (float x) -{ - return powf (2.34f, x); -} - -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} +#include "test/mathbench_wrappers.h" static const struct fun { const char *name; int prec; int vec; double lo; double hi; union { 
double (*d) (double); float (*f) (float); v_double (*vd) (v_double); v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); +#endif +#if WANT_SVE_MATH + sv_double (*svd) (sv_double, sv_bool); + sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, #define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, +#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, +#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - F (dummyf, 1.0, 2.0) -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) #if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) 
-D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) #if __aarch64__ VD (__v_dummy, 1.0, 2.0) -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, VF (__v_dummyf, 1.0, 2.0) -VF (__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) -VND (__vn_exp, -9.9, 9.9) -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) -VNF (__vn_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif +#if WANT_SVE_MATH +SVD (__sv_dummy, 1.0, 2.0) +SVF (__sv_dummyf, 1.0, 2.0) #endif #endif #endif +#include "test/mathbench_funcs.h" {0}, #undef F #undef D #undef VF #undef VD #undef VNF #undef VND +#undef SVF +#undef SVD }; static void gen_linear (double lo, double hi) { for (int i = 0; i < 
N; i++) A[i] = (lo * (N - i) + hi * i) / N; } static void genf_linear (double lo, double hi) { for (int i = 0; i < N; i++) Af[i] = (float)(lo * (N - i) + hi * i) / N; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } static uint64_t seed = 0x0123456789abcdef; static double frand (double lo, double hi) { seed = 6364136223846793005ULL * seed + 1; return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); } static void gen_rand (double lo, double hi) { for (int i = 0; i < N; i++) A[i] = frand (lo, hi); } static void genf_rand (double lo, double hi) { for (int i = 0; i < N; i++) Af[i] = (float)frand (lo, hi); } static void gen_trace (int index) { for (int i = 0; i < N; i++) A[i] = Trace[index + i]; } static void genf_trace (int index) { for (int i = 0; i < N; i++) Af[i] = (float)Trace[index + i]; } static void run_thruput (double f (double)) { for (int i = 0; i < N; i++) f (A[i]); } static void runf_thruput (float f (float)) { for (int i = 0; i < N; i++) f (Af[i]); } volatile double zero = 0; static void run_latency (double f (double)) { double z = zero; double prev = z; for (int i = 0; i < N; i++) prev = f (A[i] + prev * z); } static void runf_latency (float f (float)) { float z = (float)zero; float prev = z; for (int i = 0; i < N; i++) prev = f (Af[i] + prev * z); } static void run_v_thruput (v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void runf_v_thruput (v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void run_v_latency (v_double f (v_double)) { v_double z = v_double_dup (zero); v_double prev = z; for (int i = 0; i < N; i += v_double_len ()) prev = f (v_double_load (A+i) + prev * z); } static void runf_v_latency (v_float f (v_float)) { v_float z = v_float_dup (zero); v_float prev = z; for (int i = 0; i < N; i += v_float_len ()) prev = f (v_float_load (Af+i) + prev * z); } #ifdef __vpcs 
static void run_vn_thruput (__vpcs v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void runf_vn_thruput (__vpcs v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void run_vn_latency (__vpcs v_double f (v_double)) { v_double z = v_double_dup (zero); v_double prev = z; for (int i = 0; i < N; i += v_double_len ()) prev = f (v_double_load (A+i) + prev * z); } static void runf_vn_latency (__vpcs v_float f (v_float)) { v_float z = v_float_dup (zero); v_float prev = z; for (int i = 0; i < N; i += v_float_len ()) prev = f (v_float_load (Af+i) + prev * z); } #endif +#if WANT_SVE_MATH +static void +run_sv_thruput (sv_double f (sv_double, sv_bool)) +{ + for (int i = 0; i < N; i += sv_double_len ()) + f (sv_double_load (A+i), svptrue_b64 ()); +} + +static void +runf_sv_thruput (sv_float f (sv_float, sv_bool)) +{ + for (int i = 0; i < N; i += sv_float_len ()) + f (sv_float_load (Af+i), svptrue_b32 ()); +} + +static void +run_sv_latency (sv_double f (sv_double, sv_bool)) +{ + sv_double z = sv_double_dup (zero); + sv_double prev = z; + for (int i = 0; i < N; i += sv_double_len ()) + prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ()); +} + +static void +runf_sv_latency (sv_float f (sv_float, sv_bool)) +{ + sv_float z = sv_float_dup (zero); + sv_float prev = z; + for (int i = 0; i < N; i += sv_float_len ()) + prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ()); +} +#endif + static uint64_t tic (void) { struct timespec ts; if (clock_gettime (CLOCK_REALTIME, &ts)) abort (); return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } #define TIMEIT(run, f) do { \ dt = -1; \ run (f); /* Warm up. 
*/ \ for (int j = 0; j < measurecount; j++) \ { \ uint64_t t0 = tic (); \ for (int i = 0; i < itercount; i++) \ run (f); \ uint64_t t1 = tic (); \ if (t1 - t0 < dt) \ dt = t1 - t0; \ } \ } while (0) static void bench1 (const struct fun *f, int type, double lo, double hi) { uint64_t dt = 0; uint64_t ns100; const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; if (f->vec && f->prec == 'd') vlen = v_double_len(); else if (f->vec && f->prec == 'f') vlen = v_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); else if (f->prec == 'd' && type == 'l' && f->vec == 0) TIMEIT (run_latency, f->fun.d); else if (f->prec == 'f' && type == 't' && f->vec == 0) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); else if (f->prec == 'd' && type == 't' && f->vec == 'v') TIMEIT (run_v_thruput, f->fun.vd); else if (f->prec == 'd' && type == 'l' && f->vec == 'v') TIMEIT (run_v_latency, f->fun.vd); else if (f->prec == 'f' && type == 't' && f->vec == 'v') TIMEIT (runf_v_thruput, f->fun.vf); else if (f->prec == 'f' && type == 'l' && f->vec == 'v') TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); else if (f->prec == 'd' && type == 'l' && f->vec == 'n') TIMEIT (run_vn_latency, f->fun.vnd); else if (f->prec == 'f' && type == 't' && f->vec == 'n') TIMEIT (runf_vn_thruput, f->fun.vnf); else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif +#if WANT_SVE_MATH + else if (f->prec == 'd' && type == 't' && f->vec == 's') + TIMEIT (run_sv_thruput, f->fun.svd); + else if (f->prec == 'd' && type == 'l' && f->vec == 's') + TIMEIT (run_sv_latency, f->fun.svd); + else if (f->prec == 'f' && type == 't' && f->vec == 's') + TIMEIT (runf_sv_thruput, f->fun.svf); + else if (f->prec == 'f' && type == 'l' && f->vec == 's') + TIMEIT (runf_sv_latency, 
f->fun.svf); +#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), (unsigned long long) dt, lo, hi); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), (unsigned long long) dt, lo, hi); } fflush (stdout); } static void bench (const struct fun *f, double lo, double hi, int type, int gen) { if (f->prec == 'd' && gen == 'r') gen_rand (lo, hi); else if (f->prec == 'd' && gen == 'l') gen_linear (lo, hi); else if (f->prec == 'd' && gen == 't') gen_trace (0); else if (f->prec == 'f' && gen == 'r') genf_rand (lo, hi); else if (f->prec == 'f' && gen == 'l') genf_linear (lo, hi); else if (f->prec == 'f' && gen == 't') genf_trace (0); if (gen == 't') hi = trace_size / N; if (type == 'b' || type == 't') bench1 (f, 't', lo, hi); if (type == 'b' || type == 'l') bench1 (f, 'l', lo, hi); for (int i = N; i < trace_size; i += N) { if (f->prec == 'd') gen_trace (i); else genf_trace (i); lo = i / N; if (type == 'b' || type == 't') bench1 (f, 't', lo, hi); if (type == 'b' || type == 'l') bench1 (f, 'l', lo, hi); } } static void readtrace (const char *name) { int n = 0; FILE *f = strcmp (name, "-") == 0 ? 
stdin : fopen (name, "r"); if (!f) { printf ("openning \"%s\" failed: %m\n", name); exit (1); } for (;;) { if (n >= trace_size) { trace_size += N; Trace = realloc (Trace, trace_size * sizeof (Trace[0])); if (Trace == NULL) { printf ("out of memory\n"); exit (1); } } if (fscanf (f, "%lf", Trace + n) != 1) break; n++; } if (ferror (f) || n == 0) { printf ("reading \"%s\" failed: %m\n", name); exit (1); } fclose (f); if (n % N == 0) trace_size = n; for (int i = 0; n < trace_size; n++, i++) Trace[n] = Trace[i]; } static void usage (void) { printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " "[func2 ..]\n"); printf ("func:\n"); printf ("%7s [run all benchmarks]\n", "all"); for (const struct fun *f = funtab; f->name; f++) printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); exit (1); } int main (int argc, char *argv[]) { int usergen = 0, gen = 'r', type = 'b', all = 0; double lo = 0, hi = 0; const char *tracefile = "-"; argv++; argc--; for (;;) { if (argc <= 0) usage (); if (argv[0][0] != '-') break; else if (argc >= 3 && strcmp (argv[0], "-i") == 0) { usergen = 1; lo = strtod (argv[1], 0); hi = strtod (argv[2], 0); argv += 3; argc -= 3; } else if (argc >= 2 && strcmp (argv[0], "-m") == 0) { measurecount = strtol (argv[1], 0, 0); argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-c") == 0) { itercount = strtol (argv[1], 0, 0); argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-g") == 0) { gen = argv[1][0]; if (strchr ("rlt", gen) == 0) usage (); argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-f") == 0) { gen = 't'; /* -f implies -g trace. 
*/ tracefile = argv[1]; argv += 2; argc -= 2; } else if (argc >= 2 && strcmp (argv[0], "-t") == 0) { type = argv[1][0]; if (strchr ("ltb", type) == 0) usage (); argv += 2; argc -= 2; } else usage (); } if (gen == 't') { readtrace (tracefile); lo = hi = 0; usergen = 1; } while (argc > 0) { int found = 0; all = strcmp (argv[0], "all") == 0; for (const struct fun *f = funtab; f->name; f++) if (all || strcmp (argv[0], f->name) == 0) { found = 1; if (!usergen) { lo = f->lo; hi = f->hi; } bench (f, lo, hi, type, gen); if (usergen && !all) break; } if (!found) printf ("unknown function: %s\n", argv[0]); argv++; argc--; } return 0; } diff --git a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h new file mode 100644 index 000000000000..ad6dd2a2313d --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h @@ -0,0 +1,100 @@ +/* + * Function entries for mathbench. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F 
(cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#if WANT_VMATH +D (__s_sin, -3.1, 3.1) +D (__s_cos, -3.1, 3.1) +D (__s_exp, -9.9, 9.9) +D (__s_log, 0.01, 11.1) +{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, +F (__s_expf, -9.9, 9.9) +F (__s_expf_1u, -9.9, 9.9) +F (__s_exp2f, -9.9, 9.9) +F (__s_exp2f_1u, -9.9, 9.9) +F (__s_logf, 0.01, 11.1) +{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, +F (__s_sinf, -3.1, 3.1) +F (__s_cosf, -3.1, 3.1) +#if __aarch64__ +VD (__v_sin, -3.1, 3.1) +VD (__v_cos, -3.1, 3.1) +VD (__v_exp, -9.9, 9.9) +VD (__v_log, 0.01, 11.1) +{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, +VF (__v_expf, -9.9, 9.9) +VF (__v_expf_1u, -9.9, 9.9) +VF (__v_exp2f, -9.9, 9.9) +VF (__v_exp2f_1u, -9.9, 9.9) +VF (__v_logf, 0.01, 11.1) +{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, +VF (__v_sinf, -3.1, 3.1) +VF (__v_cosf, -3.1, 3.1) +#ifdef __vpcs +VND (__vn_exp, -9.9, 9.9) +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (__vn_log, 0.01, 11.1) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (__vn_sin, -3.1, 3.1) +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (__vn_cos, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) +VNF (__vn_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (__vn_expf_1u, -9.9, 9.9) +VNF (__vn_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (__vn_exp2f_1u, -9.9, 9.9) +VNF (__vn_logf, 0.01, 11.1) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (__vn_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (__vn_cosf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif +#endif +#endif diff --git a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h new file mode 100644 index 
000000000000..8311f0f4e173 --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h @@ -0,0 +1,104 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#if WANT_VMATH +#if __aarch64__ + +#ifdef __vpcs +__vpcs static v_float +xy__vn_powf (v_float x) +{ + return __vn_powf (x, x); +} + +__vpcs static v_float +xy_Z_powf (v_float x) +{ + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy__vn_pow (v_double x) +{ + return __vn_pow (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} +#endif // __vpcs + +static v_float +xy__v_powf (v_float x) +{ + return __v_powf (x, x); +} + +static v_double +xy__v_pow (v_double x) +{ + return __v_pow (x, x); +} +#endif // __aarch64__ + +static float +xy__s_powf (float x) +{ + return __s_powf (x, x); +} + +static double +xy__s_pow (double x) +{ + return __s_pow (x, x); +} +#endif // WANT_VMATH + +static double +xypow (double x) +{ + return pow (x, x); +} + +static float +xypowf (float x) +{ + return powf (x, x); +} + +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} + +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} diff --git a/contrib/arm-optimized-routines/math/test/mathtest.c b/contrib/arm-optimized-routines/math/test/mathtest.c index 310896738e47..3168da43b01d 100644 --- a/contrib/arm-optimized-routines/math/test/mathtest.c +++ b/contrib/arm-optimized-routines/math/test/mathtest.c @@ -1,1701 +1,1704 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 1998-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include #include #include #include #include #include "mathlib.h" #ifndef math_errhandling # define math_errhandling 0 #endif #ifdef __cplusplus #define EXTERN_C extern "C" #else #define EXTERN_C extern #endif #ifndef TRUE #define TRUE 1 #endif #ifndef FALSE #define FALSE 0 #endif #ifdef IMPORT_SYMBOL #define STR2(x) #x #define STR(x) STR2(x) _Pragma(STR(import IMPORT_SYMBOL)) #endif int dmsd, dlsd; int quiet = 0; int doround = 0; unsigned statusmask = FE_ALL_EXCEPT; #define EXTRABITS (12) #define ULPUNIT (1<name, ((test_func*)b)->name); } int is_double_argtype(int argtype) { switch(argtype) { case at_d: case at_d2: case at_dc: case at_dc2: return 1; default: return 0; } } int is_single_argtype(int argtype) { switch(argtype) { case at_s: case at_s2: case at_sc: case at_sc2: return 1; default: return 0; } } int is_double_rettype(int rettype) { switch(rettype) { case rt_d: case rt_dc: case rt_d2: return 1; default: return 0; } } int is_single_rettype(int rettype) { switch(rettype) { case rt_s: case rt_sc: case rt_s2: return 1; default: return 0; } } int is_complex_argtype(int argtype) { switch(argtype) { case at_dc: case at_sc: case at_dc2: case at_sc2: return 1; default: return 0; } } int is_complex_rettype(int rettype) { switch(rettype) { case rt_dc: case rt_sc: return 1; default: return 0; } } /* * Special-case flags indicating that some functions' error * tolerance handling is more complicated than a fixed relative * error bound. 
*/ #define ABSLOWERBOUND 0x4000000000000000LL #define PLUSMINUSPIO2 0x1000000000000000LL #define ARM_PREFIX(x) x #define TFUNC(arg,ret,name,tolerance) { t_func, arg, ret, (void*)&name, m_none, tolerance, #name } #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } +#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } +#endif test_func tfuncs[] = { /* trigonometric */ TFUNC(at_d,rt_d, acos, 4*ULPUNIT), TFUNC(at_d,rt_d, asin, 4*ULPUNIT), TFUNC(at_d,rt_d, atan, 4*ULPUNIT), TFUNC(at_d2,rt_d, atan2, 4*ULPUNIT), TFUNC(at_d,rt_d, tan, 2*ULPUNIT), TFUNC(at_d,rt_d, sin, 2*ULPUNIT), TFUNC(at_d,rt_d, cos, 2*ULPUNIT), TFUNC(at_s,rt_s, acosf, 4*ULPUNIT), TFUNC(at_s,rt_s, asinf, 4*ULPUNIT), TFUNC(at_s,rt_s, atanf, 4*ULPUNIT), TFUNC(at_s2,rt_s, atan2f, 4*ULPUNIT), TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), +#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), - +#endif /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), TFUNC(at_d, rt_d, acosh, 4*ULPUNIT), TFUNC(at_d,rt_d, tanh, 4*ULPUNIT), TFUNC(at_d,rt_d, sinh, 4*ULPUNIT), TFUNC(at_d,rt_d, cosh, 4*ULPUNIT), TFUNC(at_s, rt_s, atanhf, 4*ULPUNIT), TFUNC(at_s, rt_s, asinhf, 4*ULPUNIT), TFUNC(at_s, rt_s, acoshf, 4*ULPUNIT), TFUNC(at_s,rt_s, tanhf, 4*ULPUNIT), TFUNC(at_s,rt_s, sinhf, 4*ULPUNIT), TFUNC(at_s,rt_s, coshf, 4*ULPUNIT), /* exponential and logarithmic */ TFUNC(at_d,rt_d, log, 3*ULPUNIT/4), TFUNC(at_d,rt_d, log10, 3*ULPUNIT), TFUNC(at_d,rt_d, log2, 3*ULPUNIT/4), TFUNC(at_d,rt_d, log1p, 2*ULPUNIT), TFUNC(at_d,rt_d, exp, 3*ULPUNIT/4), TFUNC(at_d,rt_d, 
exp2, 3*ULPUNIT/4), TFUNC(at_d,rt_d, expm1, ULPUNIT), TFUNCARM(at_s,rt_s, logf, ULPUNIT), TFUNC(at_s,rt_s, log10f, 3*ULPUNIT), TFUNCARM(at_s,rt_s, log2f, ULPUNIT), TFUNC(at_s,rt_s, log1pf, 2*ULPUNIT), TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), TFUNC(at_d,rt_d, sqrt, ULPUNIT/2), TFUNC(at_d,rt_d, cbrt, 2*ULPUNIT), TFUNC(at_d2, rt_d, hypot, 4*ULPUNIT), TFUNCARM(at_s2,rt_s, powf, ULPUNIT), TFUNC(at_s,rt_s, sqrtf, ULPUNIT/2), TFUNC(at_s,rt_s, cbrtf, 2*ULPUNIT), TFUNC(at_s2, rt_s, hypotf, 4*ULPUNIT), /* error function */ TFUNC(at_d,rt_d, erf, 16*ULPUNIT), TFUNC(at_s,rt_s, erff, 16*ULPUNIT), TFUNC(at_d,rt_d, erfc, 16*ULPUNIT), TFUNC(at_s,rt_s, erfcf, 16*ULPUNIT), /* gamma functions */ TFUNC(at_d,rt_d, tgamma, 16*ULPUNIT), TFUNC(at_s,rt_s, tgammaf, 16*ULPUNIT), TFUNC(at_d,rt_d, lgamma, 16*ULPUNIT | ABSLOWERBOUND), TFUNC(at_s,rt_s, lgammaf, 16*ULPUNIT | ABSLOWERBOUND), TFUNC(at_d,rt_d, ceil, 0), TFUNC(at_s,rt_s, ceilf, 0), TFUNC(at_d2,rt_d, copysign, 0), TFUNC(at_s2,rt_s, copysignf, 0), TFUNC(at_d,rt_d, floor, 0), TFUNC(at_s,rt_s, floorf, 0), TFUNC(at_d2,rt_d, fmax, 0), TFUNC(at_s2,rt_s, fmaxf, 0), TFUNC(at_d2,rt_d, fmin, 0), TFUNC(at_s2,rt_s, fminf, 0), TFUNC(at_d2,rt_d, fmod, 0), TFUNC(at_s2,rt_s, fmodf, 0), MFUNC(at_d, rt_i, fpclassify, 0), MFUNC(at_s, rt_i, fpclassifyf, 0), TFUNC(at_dip,rt_d, frexp, 0), TFUNC(at_sip,rt_s, frexpf, 0), MFUNC(at_d, rt_i, isfinite, 0), MFUNC(at_s, rt_i, isfinitef, 0), MFUNC(at_d, rt_i, isgreater, 0), MFUNC(at_d, rt_i, isgreaterequal, 0), MFUNC(at_s, rt_i, isgreaterequalf, 0), MFUNC(at_s, rt_i, isgreaterf, 0), MFUNC(at_d, rt_i, isinf, 0), MFUNC(at_s, rt_i, isinff, 0), MFUNC(at_d, rt_i, isless, 0), MFUNC(at_d, rt_i, islessequal, 0), MFUNC(at_s, rt_i, islessequalf, 0), MFUNC(at_s, rt_i, islessf, 0), MFUNC(at_d, rt_i, islessgreater, 0), MFUNC(at_s, rt_i, islessgreaterf, 0), MFUNC(at_d, rt_i, isnan, 0), MFUNC(at_s, rt_i, 
isnanf, 0), MFUNC(at_d, rt_i, isnormal, 0), MFUNC(at_s, rt_i, isnormalf, 0), MFUNC(at_d, rt_i, isunordered, 0), MFUNC(at_s, rt_i, isunorderedf, 0), TFUNC(at_di,rt_d, ldexp, 0), TFUNC(at_si,rt_s, ldexpf, 0), TFUNC(at_ddp,rt_d2, modf, 0), TFUNC(at_ssp,rt_s2, modff, 0), #ifndef BIGRANGERED MFUNC(at_d, rt_d, rred, 2*ULPUNIT), #else MFUNC(at_d, rt_d, m_rred, ULPUNIT), #endif MFUNC(at_d, rt_i, signbit, 0), MFUNC(at_s, rt_i, signbitf, 0), }; /* * keywords are: func size op1 op2 result res2 errno op1r op1i op2r op2i resultr resulti * also we ignore: wrongresult wrongres2 wrongerrno * op1 equivalent to op1r, same with op2 and result */ typedef struct { test_func *func; unsigned op1r[2]; /* real part, also used for non-complex numbers */ unsigned op1i[2]; /* imaginary part */ unsigned op2r[2]; unsigned op2i[2]; unsigned resultr[3]; unsigned resulti[3]; enum { rc_none, rc_zero, rc_infinity, rc_nan, rc_finite } resultc; /* special complex results, rc_none means use resultr and resulti as normal */ unsigned res2[2]; unsigned status; /* IEEE status return, if any */ unsigned maybestatus; /* for optional status, or allowance for spurious */ int nresult; /* number of result words */ int in_err, in_err_limit; int err; int maybeerr; int valid; int comment; int random; } testdetail; enum { /* keywords */ k_errno, k_errno_in, k_error, k_func, k_maybeerror, k_maybestatus, k_op1, k_op1i, k_op1r, k_op2, k_op2i, k_op2r, k_random, k_res2, k_result, k_resultc, k_resulti, k_resultr, k_status, k_wrongres2, k_wrongresult, k_wrongstatus, k_wrongerrno }; char *keywords[] = { "errno", "errno_in", "error", "func", "maybeerror", "maybestatus", "op1", "op1i", "op1r", "op2", "op2i", "op2r", "random", "res2", "result", "resultc", "resulti", "resultr", "status", "wrongres2", "wrongresult", "wrongstatus", "wrongerrno" }; enum { e_0, e_EDOM, e_ERANGE, /* * This enum makes sure that we have the right number of errnos in the * errno[] array */ e_number_of_errnos }; char *errnos[] = { "0", "EDOM", "ERANGE" 
}; enum { e_none, e_divbyzero, e_domain, e_overflow, e_underflow }; char *errors[] = { "0", "divbyzero", "domain", "overflow", "underflow" }; static int verbose, fo, strict; /* state toggled by random=on / random=off */ static int randomstate; /* Canonify a double NaN: SNaNs all become 7FF00000.00000001 and QNaNs * all become 7FF80000.00000001 */ void canon_dNaN(unsigned a[2]) { if ((a[0] & 0x7FF00000) != 0x7FF00000) return; /* not Inf or NaN */ if (!(a[0] & 0xFFFFF) && !a[1]) return; /* Inf */ a[0] &= 0x7FF80000; /* canonify top word */ a[1] = 0x00000001; /* canonify bottom word */ } /* Canonify a single NaN: SNaNs all become 7F800001 and QNaNs * all become 7FC00001. Returns classification of the NaN. */ void canon_sNaN(unsigned a[1]) { if ((a[0] & 0x7F800000) != 0x7F800000) return; /* not Inf or NaN */ if (!(a[0] & 0x7FFFFF)) return; /* Inf */ a[0] &= 0x7FC00000; /* canonify most bits */ a[0] |= 0x00000001; /* canonify bottom bit */ } /* * Detect difficult operands for FO mode. */ int is_dhard(unsigned a[2]) { if ((a[0] & 0x7FF00000) == 0x7FF00000) return TRUE; /* inf or NaN */ if ((a[0] & 0x7FF00000) == 0 && ((a[0] & 0x7FFFFFFF) | a[1]) != 0) return TRUE; /* denormal */ return FALSE; } int is_shard(unsigned a[1]) { if ((a[0] & 0x7F800000) == 0x7F800000) return TRUE; /* inf or NaN */ if ((a[0] & 0x7F800000) == 0 && (a[0] & 0x7FFFFFFF) != 0) return TRUE; /* denormal */ return FALSE; } /* * Normalise all zeroes into +0, for FO mode. */ void dnormzero(unsigned a[2]) { if (a[0] == 0x80000000 && a[1] == 0) a[0] = 0; } void snormzero(unsigned a[1]) { if (a[0] == 0x80000000) a[0] = 0; } static int find(char *word, char **array, int asize) { int i, j; asize /= sizeof(char *); i = -1; j = asize; /* strictly between i and j */ while (j-i > 1) { int k = (i+j) / 2; int c = strcmp(word, array[k]); if (c > 0) i = k; else if (c < 0) j = k; else /* found it! 
*/ return k; } return -1; /* not found */ } static test_func* find_testfunc(char *word) { int i, j, asize; asize = sizeof(tfuncs)/sizeof(test_func); i = -1; j = asize; /* strictly between i and j */ while (j-i > 1) { int k = (i+j) / 2; int c = strcmp(word, tfuncs[k].name); if (c > 0) i = k; else if (c < 0) j = k; else /* found it! */ return tfuncs + k; } return NULL; /* not found */ } static long long calc_error(unsigned a[2], unsigned b[3], int shift, int rettype) { unsigned r0, r1, r2; int sign, carry; long long result; /* * If either number is infinite, require exact equality. If * either number is NaN, require that both are NaN. If either * of these requirements is broken, return INT_MAX. */ if (is_double_rettype(rettype)) { if ((a[0] & 0x7FF00000) == 0x7FF00000 || (b[0] & 0x7FF00000) == 0x7FF00000) { if (((a[0] & 0x800FFFFF) || a[1]) && ((b[0] & 0x800FFFFF) || b[1]) && (a[0] & 0x7FF00000) == 0x7FF00000 && (b[0] & 0x7FF00000) == 0x7FF00000) return 0; /* both NaN - OK */ if (!((a[0] & 0xFFFFF) || a[1]) && !((b[0] & 0xFFFFF) || b[1]) && a[0] == b[0]) return 0; /* both same sign of Inf - OK */ return LLONG_MAX; } } else { if ((a[0] & 0x7F800000) == 0x7F800000 || (b[0] & 0x7F800000) == 0x7F800000) { if ((a[0] & 0x807FFFFF) && (b[0] & 0x807FFFFF) && (a[0] & 0x7F800000) == 0x7F800000 && (b[0] & 0x7F800000) == 0x7F800000) return 0; /* both NaN - OK */ if (!(a[0] & 0x7FFFFF) && !(b[0] & 0x7FFFFF) && a[0] == b[0]) return 0; /* both same sign of Inf - OK */ return LLONG_MAX; } } /* * Both finite. Return INT_MAX if the signs differ. */ if ((a[0] ^ b[0]) & 0x80000000) return LLONG_MAX; /* * Now it's just straight multiple-word subtraction. 
*/ if (is_double_rettype(rettype)) { r2 = -b[2]; carry = (r2 == 0); r1 = a[1] + ~b[1] + carry; carry = (r1 < a[1] || (carry && r1 == a[1])); r0 = a[0] + ~b[0] + carry; } else { r2 = -b[1]; carry = (r2 == 0); r1 = a[0] + ~b[0] + carry; carry = (r1 < a[0] || (carry && r1 == a[0])); r0 = ~0 + carry; } /* * Forgive larger errors in specialised cases. */ if (shift > 0) { if (shift > 32*3) return 0; /* all errors are forgiven! */ while (shift >= 32) { r2 = r1; r1 = r0; r0 = -(r0 >> 31); shift -= 32; } if (shift > 0) { r2 = (r2 >> shift) | (r1 << (32-shift)); r1 = (r1 >> shift) | (r0 << (32-shift)); r0 = (r0 >> shift) | ((-(r0 >> 31)) << (32-shift)); } } if (r0 & 0x80000000) { sign = 1; r2 = ~r2; carry = (r2 == 0); r1 = 0 + ~r1 + carry; carry = (carry && (r2 == 0)); r0 = 0 + ~r0 + carry; } else { sign = 0; } if (r0 >= (1LL<<(31-EXTRABITS))) return LLONG_MAX; /* many ulps out */ result = (r2 >> (32-EXTRABITS)) & (ULPUNIT-1); result |= r1 << EXTRABITS; result |= (long long)r0 << (32+EXTRABITS); if (sign) result = -result; return result; } /* special named operands */ typedef struct { unsigned op1, op2; char* name; } special_op; static special_op special_ops_double[] = { {0x00000000,0x00000000,"0"}, {0x3FF00000,0x00000000,"1"}, {0x7FF00000,0x00000000,"inf"}, {0x7FF80000,0x00000001,"qnan"}, {0x7FF00000,0x00000001,"snan"}, {0x3ff921fb,0x54442d18,"pi2"}, {0x400921fb,0x54442d18,"pi"}, {0x3fe921fb,0x54442d18,"pi4"}, {0x4002d97c,0x7f3321d2,"3pi4"}, }; static special_op special_ops_float[] = { {0x00000000,0,"0"}, {0x3f800000,0,"1"}, {0x7f800000,0,"inf"}, {0x7fc00000,0,"qnan"}, {0x7f800001,0,"snan"}, {0x3fc90fdb,0,"pi2"}, {0x40490fdb,0,"pi"}, {0x3f490fdb,0,"pi4"}, {0x4016cbe4,0,"3pi4"}, }; /* This is what is returned by the below functions. 
We need it to handle the sign of the number */ static special_op tmp_op = {0,0,0}; special_op* find_special_op_from_op(unsigned op1, unsigned op2, int is_double) { int i; special_op* sop; if(is_double) { sop = special_ops_double; } else { sop = special_ops_float; } for(i = 0; i < sizeof(special_ops_double)/sizeof(special_op); i++) { if(sop->op1 == (op1&0x7fffffff) && sop->op2 == op2) { if(tmp_op.name) free(tmp_op.name); tmp_op.name = malloc(strlen(sop->name)+2); if(op1>>31) { sprintf(tmp_op.name,"-%s",sop->name); } else { strcpy(tmp_op.name,sop->name); } return &tmp_op; } sop++; } return NULL; } special_op* find_special_op_from_name(const char* name, int is_double) { int i, neg=0; special_op* sop; if(is_double) { sop = special_ops_double; } else { sop = special_ops_float; } if(*name=='-') { neg=1; name++; } else if(*name=='+') { name++; } for(i = 0; i < sizeof(special_ops_double)/sizeof(special_op); i++) { if(0 == strcmp(name,sop->name)) { tmp_op.op1 = sop->op1; if(neg) { tmp_op.op1 |= 0x80000000; } tmp_op.op2 = sop->op2; return &tmp_op; } sop++; } return NULL; } /* helper function for the below type=0 for single, 1 for double, 2 for no sop */ int do_op(char* q, unsigned* op, const char* name, int num, int sop_type) { int i; int n=num; special_op* sop = NULL; for(i = 0; i < num; i++) { op[i] = 0; } if(sop_type<2) { sop = find_special_op_from_name(q,sop_type); } if(sop != NULL) { op[0] = sop->op1; op[1] = sop->op2; } else { switch(num) { case 1: n = sscanf(q, "%x", &op[0]); break; case 2: n = sscanf(q, "%x.%x", &op[0], &op[1]); break; case 3: n = sscanf(q, "%x.%x.%x", &op[0], &op[1], &op[2]); break; default: return -1; } } if (verbose) { printf("%s=",name); for (i = 0; (i < n); ++i) printf("%x.", op[i]); printf(" (n=%d)\n", n); } return n; } testdetail parsetest(char *testbuf, testdetail oldtest) { char *p; /* Current part of line: Option name */ char *q; /* Current part of line: Option value */ testdetail ret; /* What we return */ int k; /* Function enum from k_* 
*/ int n; /* Used as returns for scanfs */ int argtype=2, rettype=2; /* for do_op */ /* clear ret */ memset(&ret, 0, sizeof(ret)); if (verbose) printf("Parsing line: %s\n", testbuf); while (*testbuf && isspace(*testbuf)) testbuf++; if (testbuf[0] == ';' || testbuf[0] == '#' || testbuf[0] == '!' || testbuf[0] == '>' || testbuf[0] == '\0') { ret.comment = 1; if (verbose) printf("Line is a comment\n"); return ret; } ret.comment = 0; if (*testbuf == '+') { if (oldtest.valid) { ret = oldtest; /* structure copy */ } else { fprintf(stderr, "copy from invalid: ignored\n"); } testbuf++; } ret.random = randomstate; ret.in_err = 0; ret.in_err_limit = e_number_of_errnos; p = strtok(testbuf, " \t"); while (p != NULL) { q = strchr(p, '='); if (!q) goto balderdash; *q++ = '\0'; k = find(p, keywords, sizeof(keywords)); switch (k) { case k_random: randomstate = (!strcmp(q, "on")); ret.comment = 1; return ret; /* otherwise ignore this line */ case k_func: if (verbose) printf("func=%s ", q); //ret.func = find(q, funcs, sizeof(funcs)); ret.func = find_testfunc(q); if (ret.func == NULL) { if (verbose) printf("(id=unknown)\n"); goto balderdash; } if(is_single_argtype(ret.func->argtype)) argtype = 0; else if(is_double_argtype(ret.func->argtype)) argtype = 1; if(is_single_rettype(ret.func->rettype)) rettype = 0; else if(is_double_rettype(ret.func->rettype)) rettype = 1; //ret.size = sizes[ret.func]; if (verbose) printf("(name=%s) (size=%d)\n", ret.func->name, ret.func->argtype); break; case k_op1: case k_op1r: n = do_op(q,ret.op1r,"op1r",2,argtype); if (n < 1) goto balderdash; break; case k_op1i: n = do_op(q,ret.op1i,"op1i",2,argtype); if (n < 1) goto balderdash; break; case k_op2: case k_op2r: n = do_op(q,ret.op2r,"op2r",2,argtype); if (n < 1) goto balderdash; break; case k_op2i: n = do_op(q,ret.op2i,"op2i",2,argtype); if (n < 1) goto balderdash; break; case k_resultc: puts(q); if(strncmp(q,"inf",3)==0) { ret.resultc = rc_infinity; } else if(strcmp(q,"zero")==0) { ret.resultc = rc_zero; 
} else if(strcmp(q,"nan")==0) { ret.resultc = rc_nan; } else if(strcmp(q,"finite")==0) { ret.resultc = rc_finite; } else { goto balderdash; } break; case k_result: case k_resultr: n = (do_op)(q,ret.resultr,"resultr",3,rettype); if (n < 1) goto balderdash; ret.nresult = n; /* assume real and imaginary have same no. words */ break; case k_resulti: n = do_op(q,ret.resulti,"resulti",3,rettype); if (n < 1) goto balderdash; break; case k_res2: n = do_op(q,ret.res2,"res2",2,rettype); if (n < 1) goto balderdash; break; case k_status: while (*q) { if (*q == 'i') ret.status |= FE_INVALID; if (*q == 'z') ret.status |= FE_DIVBYZERO; if (*q == 'o') ret.status |= FE_OVERFLOW; if (*q == 'u') ret.status |= FE_UNDERFLOW; q++; } break; case k_maybeerror: n = find(q, errors, sizeof(errors)); if (n < 0) goto balderdash; if(math_errhandling&MATH_ERREXCEPT) { switch(n) { case e_domain: ret.maybestatus |= FE_INVALID; break; case e_divbyzero: ret.maybestatus |= FE_DIVBYZERO; break; case e_overflow: ret.maybestatus |= FE_OVERFLOW; break; case e_underflow: ret.maybestatus |= FE_UNDERFLOW; break; } } { switch(n) { case e_domain: ret.maybeerr = e_EDOM; break; case e_divbyzero: case e_overflow: case e_underflow: ret.maybeerr = e_ERANGE; break; } } case k_maybestatus: while (*q) { if (*q == 'i') ret.maybestatus |= FE_INVALID; if (*q == 'z') ret.maybestatus |= FE_DIVBYZERO; if (*q == 'o') ret.maybestatus |= FE_OVERFLOW; if (*q == 'u') ret.maybestatus |= FE_UNDERFLOW; q++; } break; case k_error: n = find(q, errors, sizeof(errors)); if (n < 0) goto balderdash; if(math_errhandling&MATH_ERREXCEPT) { switch(n) { case e_domain: ret.status |= FE_INVALID; break; case e_divbyzero: ret.status |= FE_DIVBYZERO; break; case e_overflow: ret.status |= FE_OVERFLOW; break; case e_underflow: ret.status |= FE_UNDERFLOW; break; } } if(math_errhandling&MATH_ERRNO) { switch(n) { case e_domain: ret.err = e_EDOM; break; case e_divbyzero: case e_overflow: case e_underflow: ret.err = e_ERANGE; break; } } 
if(!(math_errhandling&MATH_ERRNO)) { switch(n) { case e_domain: ret.maybeerr = e_EDOM; break; case e_divbyzero: case e_overflow: case e_underflow: ret.maybeerr = e_ERANGE; break; } } break; case k_errno: ret.err = find(q, errnos, sizeof(errnos)); if (ret.err < 0) goto balderdash; break; case k_errno_in: ret.in_err = find(q, errnos, sizeof(errnos)); if (ret.err < 0) goto balderdash; ret.in_err_limit = ret.in_err + 1; break; case k_wrongresult: case k_wrongstatus: case k_wrongres2: case k_wrongerrno: /* quietly ignore these keys */ break; default: goto balderdash; } p = strtok(NULL, " \t"); } ret.valid = 1; return ret; /* come here from almost any error */ balderdash: ret.valid = 0; return ret; } typedef enum { test_comment, /* deliberately not a test */ test_invalid, /* accidentally not a test */ test_decline, /* was a test, and wasn't run */ test_fail, /* was a test, and failed */ test_pass /* was a test, and passed */ } testresult; char failtext[512]; typedef union { unsigned i[2]; double f; double da[2]; } dbl; typedef union { unsigned i; float f; float da[2]; } sgl; /* helper function for runtest */ void print_error(int rettype, unsigned *result, char* text, char** failp) { special_op *sop; char *str; if(result) { *failp += sprintf(*failp," %s=",text); sop = find_special_op_from_op(result[0],result[1],is_double_rettype(rettype)); if(sop) { *failp += sprintf(*failp,"%s",sop->name); } else { if(is_double_rettype(rettype)) { str="%08x.%08x"; } else { str="%08x"; } *failp += sprintf(*failp,str,result[0],result[1]); } } } void print_ulps_helper(const char *name, long long ulps, char** failp) { if(ulps == LLONG_MAX) { *failp += sprintf(*failp, " %s=HUGE", name); } else { *failp += sprintf(*failp, " %s=%.3f", name, (double)ulps / ULPUNIT); } } /* for complex args make ulpsr or ulpsri = 0 to not print */ void print_ulps(int rettype, long long ulpsr, long long ulpsi, char** failp) { if(is_complex_rettype(rettype)) { if (ulpsr) print_ulps_helper("ulpsr",ulpsr,failp); if 
(ulpsi) print_ulps_helper("ulpsi",ulpsi,failp); } else { if (ulpsr) print_ulps_helper("ulps",ulpsr,failp); } }

/*
 * Run a single parsed test case against the library under test.
 *
 * Input:  t, a fully-parsed testdetail (operands, expected result,
 *         expected IEEE status flags and errno, tolerance metadata).
 * Output: one of the testresult enumeration values.  On test_fail,
 *         the global 'failtext' buffer holds a human-readable
 *         description of every mismatch found (or just "x" in quiet
 *         mode).
 *
 * The sequence is: stage errno, invoke the function or macro under
 * test, then compare (1) IEEE exception flags, (2) the primary
 * result (by result-class, by ULP distance, or exactly, depending on
 * how much precision the test data supplied), (3) the secondary
 * result for frexp/modf/rred, and (4) errno.
 */
int runtest(testdetail t) {
    int err, status;
    dbl d_arg1, d_arg2, d_res, d_res2;
    sgl s_arg1, s_arg2, s_res, s_res2;
    int deferred_decline = FALSE;       /* set in FO mode: run, but ignore */
    char *failp = failtext;             /* append cursor into failtext */
    unsigned int intres=0;              /* integer result (macros, frexp exp, rred quadrant) */
    int res2_adjust = 0;                /* quadrant correction, see PLUSMINUSPIO2 below */

    if (t.comment) return test_comment;
    if (!t.valid) return test_invalid;

    /* Set IEEE status to mathlib-normal */
    feclearexcept(FE_ALL_EXCEPT);

    /* Deal with operands: assemble the double operands from the two
     * 32-bit test-vector words, honouring the detected word order
     * (dmsd/dlsd); singles use only the first word. */
#define DO_DOP(arg,op) arg.i[dmsd] = t.op[0]; arg.i[dlsd] = t.op[1]
    DO_DOP(d_arg1,op1r);
    DO_DOP(d_arg2,op2r);
    s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0];

    /*
     * Detect NaNs, infinities and denormals on input, and set a
     * deferred decline flag if we're in FO mode.
     *
     * (We defer the decline rather than doing it immediately
     * because even in FO mode the operation is not permitted to
     * crash or tight-loop; so we _run_ the test, and then ignore
     * all the results.)
     */
    if (fo) {
        if (is_double_argtype(t.func->argtype) && is_dhard(t.op1r))
            deferred_decline = TRUE;
        if (t.func->argtype==at_d2 && is_dhard(t.op2r))
            deferred_decline = TRUE;
        if (is_single_argtype(t.func->argtype) && is_shard(t.op1r))
            deferred_decline = TRUE;
        if (t.func->argtype==at_s2 && is_shard(t.op2r))
            deferred_decline = TRUE;
        if (is_double_rettype(t.func->rettype) && is_dhard(t.resultr))
            deferred_decline = TRUE;
        if (t.func->rettype==rt_d2 && is_dhard(t.res2))
            deferred_decline = TRUE;
        /* NOTE(review): the next check applies is_single_argtype() to a
         * rettype value; presumably an is_single_rettype-style test was
         * intended — confirm against the enum layout before changing. */
        if (is_single_argtype(t.func->rettype) && is_shard(t.resultr))
            deferred_decline = TRUE;
        if (t.func->rettype==rt_s2 && is_shard(t.res2))
            deferred_decline = TRUE;
        if (t.err == e_ERANGE)
            deferred_decline = TRUE;
    }

    /*
     * Perform the operation
     */
    /* Stage errno to the test's requested input value so we can later
     * detect whether the function set or preserved it. */
    errno = t.in_err == e_EDOM ? EDOM : t.in_err == e_ERANGE ? ERANGE : 0;
    if (t.err == e_0) t.err = t.in_err;
    if (t.maybeerr == e_0) t.maybeerr = t.in_err;
    if(t.func->type == t_func) {
        /* Real function: dispatch on argument signature. */
        switch(t.func->argtype) {
        case at_d: d_res.f = t.func->func.d_d_ptr(d_arg1.f); break;
        case at_s: s_res.f = t.func->func.s_s_ptr(s_arg1.f); break;
        case at_d2: d_res.f = t.func->func.d2_d_ptr(d_arg1.f, d_arg2.f); break;
        case at_s2: s_res.f = t.func->func.s2_s_ptr(s_arg1.f, s_arg2.f); break;
        case at_di: d_res.f = t.func->func.di_d_ptr(d_arg1.f, d_arg2.i[dmsd]); break;
        case at_si: s_res.f = t.func->func.si_s_ptr(s_arg1.f, s_arg2.i); break;
        case at_dip: d_res.f = t.func->func.dip_d_ptr(d_arg1.f, (int*)&intres); break;
        case at_sip: s_res.f = t.func->func.sip_s_ptr(s_arg1.f, (int*)&intres); break;
        case at_ddp: d_res.f = t.func->func.ddp_d_ptr(d_arg1.f, &d_res2.f); break;
        case at_ssp: s_res.f = t.func->func.ssp_s_ptr(s_arg1.f, &s_res2.f); break;
        default:
            printf("unhandled function: %s\n",t.func->name);
            return test_fail;
        }
    } else {
        /* Classification / comparison macro under test. */
        /* printf("macro: name=%s, num=%i, s1.i=0x%08x s1.f=%f\n",t.func->name, t.func->macro_name, s_arg1.i, (double)s_arg1.f); */
        switch(t.func->macro_name) {
        case m_isfinite: intres = isfinite(d_arg1.f); break;
        case m_isinf: intres = isinf(d_arg1.f); break;
        case m_isnan: intres = isnan(d_arg1.f); break;
        case m_isnormal: intres = isnormal(d_arg1.f); break;
        case m_signbit: intres = signbit(d_arg1.f); break;
        case m_fpclassify: intres = fpclassify(d_arg1.f); break;
        case m_isgreater: intres = isgreater(d_arg1.f, d_arg2.f); break;
        case m_isgreaterequal: intres = isgreaterequal(d_arg1.f, d_arg2.f); break;
        case m_isless: intres = isless(d_arg1.f, d_arg2.f); break;
        case m_islessequal: intres = islessequal(d_arg1.f, d_arg2.f); break;
        case m_islessgreater: intres = islessgreater(d_arg1.f, d_arg2.f); break;
        case m_isunordered: intres = isunordered(d_arg1.f, d_arg2.f); break;
        case m_isfinitef: intres = isfinite(s_arg1.f); break;
        case m_isinff: intres = isinf(s_arg1.f); break;
        case m_isnanf: intres = isnan(s_arg1.f); break;
        case m_isnormalf: intres = isnormal(s_arg1.f); break;
        case m_signbitf: intres = signbit(s_arg1.f); break;
        case m_fpclassifyf: intres = fpclassify(s_arg1.f); break;
        case m_isgreaterf: intres = isgreater(s_arg1.f, s_arg2.f); break;
        case m_isgreaterequalf: intres = isgreaterequal(s_arg1.f, s_arg2.f); break;
        case m_islessf: intres = isless(s_arg1.f, s_arg2.f); break;
        case m_islessequalf: intres = islessequal(s_arg1.f, s_arg2.f); break;
        case m_islessgreaterf: intres = islessgreater(s_arg1.f, s_arg2.f); break;
        case m_isunorderedf: intres = isunordered(s_arg1.f, s_arg2.f); break;
        default:
            printf("unhandled macro: %s\n",t.func->name);
            return test_fail;
        }
    }

    /*
     * Decline the test if the deferred decline flag was set above.
     */
    if (deferred_decline)
        return test_decline;

    /* printf("intres=%i\n",intres); */

    /* Clear the fail text (indicating a pass unless we change it) */
    failp[0] = '\0';

    /* Check the IEEE status bits (except INX, which we disregard).
     * We don't bother with this for complex numbers, because the
     * complex functions are hard to get exactly right and we don't
     * have to anyway (C99 annex G is only informative). */
    if (!(is_complex_argtype(t.func->argtype) ||
          is_complex_rettype(t.func->rettype))) {
        status = fetestexcept(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW);
        /* OR-ing in maybestatus and ~statusmask on BOTH sides makes those
         * bits compare equal, i.e. optional/masked flags never fail. */
        if ((status|t.maybestatus|~statusmask) !=
            (t.status|t.maybestatus|~statusmask)) {
            if (quiet) failtext[0]='x';
            else {
                failp += sprintf(failp,
                                 " wrongstatus=%s%s%s%s%s",
                                 (status & FE_INVALID ? "i" : ""),
                                 (status & FE_DIVBYZERO ? "z" : ""),
                                 (status & FE_OVERFLOW ? "o" : ""),
                                 (status & FE_UNDERFLOW ? "u" : ""),
                                 (status ? "" : "OK"));
            }
        }
    }

    /* Check the result */
    {
        unsigned resultr[2], resulti[2];
        unsigned tresultr[3], tresulti[3], wres;

        /* Collect the computed result into resultr[], and the expected
         * result words into tresultr[]; wres is the word count. */
        switch(t.func->rettype) {
        case rt_d:
        case rt_d2:
            tresultr[0] = t.resultr[0];
            tresultr[1] = t.resultr[1];
            resultr[0] = d_res.i[dmsd];
            resultr[1] = d_res.i[dlsd];
            wres = 2;
            break;
        case rt_i:
            tresultr[0] = t.resultr[0];
            resultr[0] = intres;
            wres = 1;
            break;
        case rt_s:
        case rt_s2:
            tresultr[0] = t.resultr[0];
            resultr[0] = s_res.i;
            wres = 1;
            break;
        default:
            puts("unhandled rettype in runtest");
            wres = 0;
        }
        if(t.resultc != rc_none) {
            /* Result-class check: the test only specifies "zero",
             * "infinity", "nan" or "finite", not an exact value.
             * NOTE(review): resulti[] is read below but is only ever
             * written for complex return types, which this path does
             * not populate — looks like an uninitialized read for real
             * functions; confirm against the full file before fixing. */
            int err = 0;
            switch(t.resultc) {
            case rc_zero:
                if(resultr[0] != 0 || resulti[0] != 0 ||
                   (wres==2 && (resultr[1] != 0 || resulti[1] != 0))) {
                    err = 1;
                }
                break;
            case rc_infinity:
                if(wres==1) {
                    if(!((resultr[0]&0x7fffffff)==0x7f800000 ||
                         (resulti[0]&0x7fffffff)==0x7f800000)) {
                        err = 1;
                    }
                } else {
                    if(!(((resultr[0]&0x7fffffff)==0x7ff00000 && resultr[1]==0) ||
                         ((resulti[0]&0x7fffffff)==0x7ff00000 && resulti[1]==0))) {
                        err = 1;
                    }
                }
                break;
            case rc_nan:
                if(wres==1) {
                    if(!((resultr[0]&0x7fffffff)>0x7f800000 ||
                         (resulti[0]&0x7fffffff)>0x7f800000)) {
                        err = 1;
                    }
                } else {
                    canon_dNaN(resultr);
                    canon_dNaN(resulti);
                    if(!(((resultr[0]&0x7fffffff)>0x7ff00000 && resultr[1]==1) ||
                         ((resulti[0]&0x7fffffff)>0x7ff00000 && resulti[1]==1))) {
                        err = 1;
                    }
                }
                break;
            case rc_finite:
                if(wres==1) {
                    if(!((resultr[0]&0x7fffffff)<0x7f800000 ||
                         (resulti[0]&0x7fffffff)<0x7f800000)) {
                        err = 1;
                    }
                } else {
                    if(!((resultr[0]&0x7fffffff)<0x7ff00000 ||
                         (resulti[0]&0x7fffffff)<0x7ff00000)) {
                        err = 1;
                    }
                }
                break;
            default:
                break;
            }
            if(err) {
                print_error(t.func->rettype,resultr,"wrongresultr",&failp);
                print_error(t.func->rettype,resulti,"wrongresulti",&failp);
            }
        } else if (t.nresult > wres) {
            /*
             * The test case data has provided the result to more
             * than double precision. Instead of testing exact
             * equality, we test against our maximum error
             * tolerance.
             */
            int rshift, ishift;
            long long ulpsr, ulpsi, ulptolerance;

            /* Extra-precision word, shifted up into ULP units. */
            tresultr[wres] = t.resultr[wres] << (32-EXTRABITS);
            tresulti[wres] = t.resulti[wres] << (32-EXTRABITS);
            if(strict) {
                ulptolerance = 4096;   /* one ulp */
            } else {
                ulptolerance = t.func->tolerance;
            }
            rshift = ishift = 0;
            if (ulptolerance & ABSLOWERBOUND) {
                /*
                 * Hack for the lgamma functions, which have an
                 * error behaviour that can't conveniently be
                 * characterised in pure ULPs. Really, we want to
                 * say that the error in lgamma is "at most N ULPs,
                 * or at most an absolute error of X, whichever is
                 * larger", for appropriately chosen N,X. But since
                 * these two functions are the only cases where it
                 * arises, I haven't bothered to do it in a nice way
                 * in the function table above.
                 *
                 * (The difficult cases arise with negative input
                 * values such that |gamma(x)| is very near to 1; in
                 * this situation implementations tend to separately
                 * compute lgamma(|x|) and the log of the correction
                 * term from the Euler reflection formula, and
                 * subtract - which catastrophically loses
                 * significance.)
                 *
                 * As far as I can tell, nobody cares about this:
                 * GNU libm doesn't get those cases right either,
                 * and OpenCL explicitly doesn't state a ULP error
                 * limit for lgamma. So my guess is that this is
                 * simply considered acceptable error behaviour for
                 * this particular function, and hence I feel free
                 * to allow for it here.
                 */
                ulptolerance &= ~ABSLOWERBOUND;
                if (t.op1r[0] & 0x80000000) {
                    if (t.func->rettype == rt_d)
                        rshift = 0x400 - ((tresultr[0] >> 20) & 0x7ff);
                    else if (t.func->rettype == rt_s)
                        rshift = 0x80 - ((tresultr[0] >> 23) & 0xff);
                    if (rshift < 0)
                        rshift = 0;
                }
            }
            if (ulptolerance & PLUSMINUSPIO2) {
                ulptolerance &= ~PLUSMINUSPIO2;
                /*
                 * Hack for range reduction, which can reduce
                 * borderline cases in the wrong direction, i.e.
                 * return a value just outside one end of the interval
                 * [-pi/4,+pi/4] when it could have returned a value
                 * just inside the other end by subtracting an
                 * adjacent multiple of pi/2.
                 *
                 * We tolerate this, up to a point, because the
                 * trigonometric functions making use of the output of
                 * rred can cope and because making the range reducer
                 * do the exactly right thing in every case would be
                 * more expensive.
                 */
                if (wres == 1) {
                    /* Upper bound of overshoot derived in rredf.h */
                    if ((resultr[0]&0x7FFFFFFF) <= 0x3f494b02 &&
                        (resultr[0]&0x7FFFFFFF) > 0x3f490fda &&
                        (resultr[0]&0x80000000) != (tresultr[0]&0x80000000)) {
                        unsigned long long val;
                        val = tresultr[0];
                        val = (val << 32) | tresultr[1];
                        /*
                         * Compute the alternative permitted result by
                         * subtracting from the sum of the extended
                         * single-precision bit patterns of +pi/4 and
                         * -pi/4. This is a horrible hack which only
                         * works because we can be confident that
                         * numbers in this range all have the same
                         * exponent!
                         */
                        val = 0xfe921fb54442d184ULL - val;
                        tresultr[0] = val >> 32;
                        tresultr[1] = (val >> (32-EXTRABITS)) << (32-EXTRABITS);
                        /*
                         * Also, expect a correspondingly different
                         * value of res2 as a result of this change.
                         * The adjustment depends on whether we just
                         * flipped the result from + to - or vice
                         * versa.
                         */
                        if (resultr[0] & 0x80000000) {
                            res2_adjust = +1;
                        } else {
                            res2_adjust = -1;
                        }
                    }
                }
            }
            ulpsr = calc_error(resultr, tresultr, rshift, t.func->rettype);
            if(is_complex_rettype(t.func->rettype)) {
                ulpsi = calc_error(resulti, tresulti, ishift, t.func->rettype);
            } else {
                ulpsi = 0;
            }
            /* Non-NULL rr/ri flag an out-of-tolerance real/imag part. */
            unsigned *rr = (ulpsr > ulptolerance || ulpsr < -ulptolerance) ? resultr : NULL;
            unsigned *ri = (ulpsi > ulptolerance || ulpsi < -ulptolerance) ? resulti : NULL;
            /* printf("tolerance=%i, ulpsr=%i, ulpsi=%i, rr=%p, ri=%p\n",ulptolerance,ulpsr,ulpsi,rr,ri); */
            if (rr || ri) {
                if (quiet) failtext[0]='x';
                else {
                    print_error(t.func->rettype,rr,"wrongresultr",&failp);
                    print_error(t.func->rettype,ri,"wrongresulti",&failp);
                    print_ulps(t.func->rettype,rr ? ulpsr : 0, ri ? ulpsi : 0,&failp);
                }
            }
        } else {
            if(is_complex_rettype(t.func->rettype))
                /*
                 * Complex functions are not fully supported,
                 * this is unreachable, but prevents warnings.
                 */
                abort();
            /*
             * The test case data has provided the result in
             * exactly the output precision. Therefore we must
             * complain about _any_ violation.
             */
            switch(t.func->rettype) {
            case rt_dc:
                canon_dNaN(tresulti);
                canon_dNaN(resulti);
                if (fo) {
                    dnormzero(tresulti);
                    dnormzero(resulti);
                }
                /* deliberate fall-through */
            case rt_d:
                canon_dNaN(tresultr);
                canon_dNaN(resultr);
                if (fo) {
                    dnormzero(tresultr);
                    dnormzero(resultr);
                }
                break;
            case rt_sc:
                canon_sNaN(tresulti);
                canon_sNaN(resulti);
                if (fo) {
                    snormzero(tresulti);
                    snormzero(resulti);
                }
                /* deliberate fall-through */
            case rt_s:
                canon_sNaN(tresultr);
                canon_sNaN(resultr);
                if (fo) {
                    snormzero(tresultr);
                    snormzero(resultr);
                }
                break;
            default:
                break;
            }
            if(is_complex_rettype(t.func->rettype)) {
                unsigned *rr, *ri;
                if(resultr[0] != tresultr[0] ||
                   (wres > 1 && resultr[1] != tresultr[1])) {
                    rr = resultr;
                } else {
                    rr = NULL;
                }
                if(resulti[0] != tresulti[0] ||
                   (wres > 1 && resulti[1] != tresulti[1])) {
                    ri = resulti;
                } else {
                    ri = NULL;
                }
                if(rr || ri) {
                    if (quiet) failtext[0]='x';
                    print_error(t.func->rettype,rr,"wrongresultr",&failp);
                    print_error(t.func->rettype,ri,"wrongresulti",&failp);
                }
            } else if (resultr[0] != tresultr[0] ||
                       (wres > 1 && resultr[1] != tresultr[1])) {
                if (quiet) failtext[0]='x';
                print_error(t.func->rettype,resultr,"wrongresult",&failp);
            }
        }

        /*
         * Now test res2, for those functions (frexp, modf, rred)
         * which use it.
         */
        if (t.func->func.ptr == &frexp || t.func->func.ptr == &frexpf ||
            t.func->macro_name == m_rred || t.func->macro_name == m_rredf) {
            /* Integer secondary result: frexp's exponent, rred's quadrant. */
            unsigned tres2 = t.res2[0];
            if (res2_adjust) {
                /* Fix for range reduction, propagated from further up */
                tres2 = (tres2 + res2_adjust) & 3;
            }
            if (tres2 != intres) {
                if (quiet) failtext[0]='x';
                else {
                    failp += sprintf(failp, " wrongres2=%08x", intres);
                }
            }
        } else if (t.func->func.ptr == &modf || t.func->func.ptr == &modff) {
            /* Floating secondary result: modf's integer part. */
            tresultr[0] = t.res2[0];
            tresultr[1] = t.res2[1];
            if (is_double_rettype(t.func->rettype)) {
                canon_dNaN(tresultr);
                resultr[0] = d_res2.i[dmsd];
                resultr[1] = d_res2.i[dlsd];
                canon_dNaN(resultr);
                if (fo) {
                    dnormzero(tresultr);
                    dnormzero(resultr);
                }
            } else {
                canon_sNaN(tresultr);
                resultr[0] = s_res2.i;
                resultr[1] = s_res2.i;
                canon_sNaN(resultr);
                if (fo) {
                    snormzero(tresultr);
                    snormzero(resultr);
                }
            }
            if (resultr[0] != tresultr[0] ||
                (wres > 1 && resultr[1] != tresultr[1])) {
                if (quiet) failtext[0]='x';
                else {
                    if (is_double_rettype(t.func->rettype))
                        failp += sprintf(failp, " wrongres2=%08x.%08x",
                                         resultr[0], resultr[1]);
                    else
                        failp += sprintf(failp, " wrongres2=%08x",
                                         resultr[0]);
                }
            }
        }
    }

    /* Check errno */
    err = (errno == EDOM ? e_EDOM : errno == ERANGE ? e_ERANGE : e_0);
    if (err != t.err && err != t.maybeerr) {
        if (quiet) failtext[0]='x';
        else {
            failp += sprintf(failp, " wrongerrno=%s expecterrno=%s ",
                             errnos[err], errnos[t.err]);
        }
    }

    return *failtext ? test_fail : test_pass;
}

/* Global pass/fail/decline counters, accumulated across all test files. */
int passed, failed, declined;

/*
 * Read test vectors line by line from fp (display name 'name'),
 * parse and run each one, and update the global counters.  Each
 * vector is re-run once per errno-in value in the half-open range
 * [in_err, in_err_limit) established by parsetest().
 */
void runtests(char *name, FILE *fp) {
    char testbuf[512], linebuf[512];
    int lineno = 1;
    testdetail test;

    test.valid = 0;

    if (verbose) printf("runtests: %s\n", name);
    while (fgets(testbuf, sizeof(testbuf), fp)) {
        int res, print_errno;
        /* Strip the line terminator before parsing/reporting. */
        testbuf[strcspn(testbuf, "\r\n")] = '\0';
        strcpy(linebuf, testbuf);       /* keep a pristine copy for messages */
        test = parsetest(testbuf, test);
        print_errno = 0;
        while (test.in_err < test.in_err_limit) {
            res = runtest(test);
            if (res == test_pass) {
                if (verbose)
                    printf("%s:%d: pass\n", name, lineno);
                ++passed;
            } else if (res == test_decline) {
                if (verbose)
                    printf("%s:%d: declined\n", name, lineno);
                ++declined;
            } else if (res == test_fail) {
                if (!quiet)
                    printf("%s:%d: FAIL%s: %s%s%s%s\n", name, lineno,
                           test.random ? " (random)" : "",
                           linebuf,
                           print_errno ? " errno_in=" : "",
                           print_errno ? errnos[test.in_err] : "",
                           failtext);
                ++failed;
            } else if (res == test_invalid) {
                printf("%s:%d: malformed: %s\n", name, lineno, linebuf);
                ++failed;
            }
            test.in_err++;
            print_errno = 1;            /* subsequent reruns name the errno */
        }
        lineno++;
    }
}

/*
 * Entry point: parse command-line options (-fo, -nostatus, -quiet,
 * -strict, -v/-verbose), autodetect double-precision word order,
 * then run every named test file (or stdin if none).  Exit status is
 * 0 only if at least one test ran and none failed.
 */
int main(int ac, char **av) {
    char **files;
    int i, nfiles = 0;
    dbl d;

#ifdef MICROLIB
    /*
     * Invent argc and argv ourselves.
     */
    char *argv[256];
    char args[256];
    {
        int sargs[2];
        char *p;
        ac = 0;
        /* Semihosting SYS_GET_CMDLINE: fetch the command line into args[]. */
        sargs[0]=(int)args;
        sargs[1]=(int)sizeof(args);
        if (!__semihost(0x15, sargs)) {
            args[sizeof(args)-1] = '\0';   /* just in case */
            p = args;
            /* Split on blanks/tabs into argv[]. */
            while (1) {
                while (*p == ' ' || *p == '\t') p++;
                if (!*p) break;
                argv[ac++] = p;
                while (*p && *p != ' ' && *p != '\t') p++;
                if (*p) *p++ = '\0';
            }
        }
        av = argv;
    }
#endif

    /* Sort tfuncs so that find()/bsearch-style lookups work. */
    qsort(tfuncs, sizeof(tfuncs)/sizeof(test_func), sizeof(test_func),
          &compare_tfuncs);

    /*
     * Autodetect the `double' endianness.
     */
    dmsd = 0;
    d.f = 1.0;                          /* 0x3ff00000 / 0x00000000 */
    if (d.i[dmsd] == 0) {
        dmsd = 1;
    }
    /*
     * Now dmsd denotes what the compiler thinks we're at. Let's
     * check that it agrees with what the runtime thinks.
     */
    d.i[0] = d.i[1] = 0x11111111;/* a random +ve number */
    d.f /= d.f;                         /* must now be one */
    if (d.i[dmsd] == 0) {
        fprintf(stderr, "YIKES! Compiler and runtime disagree on endianness"
                " of `double'. Bailing out\n");
        return 1;
    }
    dlsd = !dmsd;

    /* default is terse */
    verbose = 0;
    fo = 0;
    strict = 0;

    files = (char **)malloc((ac+1) * sizeof(char *));
    if (!files) {
        fprintf(stderr, "initial malloc failed!\n");
        return 1;
    }
#ifdef NOCMDLINE
    files[nfiles++] = "testfile";
#endif

    while (--ac) {
        char *p = *++av;
        if (*p == '-') {
            /* Option table: must stay sorted to match find()'s lookup. */
            static char *options[] = {
                "-fo",
#if 0
                "-noinexact",
                "-noround",
#endif
                "-nostatus",
                "-quiet",
                "-strict",
                "-v",
                "-verbose",
            };
            enum {
                op_fo,
#if 0
                op_noinexact,
                op_noround,
#endif
                op_nostatus,
                op_quiet,
                op_strict,
                op_v,
                op_verbose,
            };
            switch (find(p, options, sizeof(options))) {
            case op_quiet:
                quiet = 1;
                break;
#if 0
            case op_noinexact:
                statusmask &= 0x0F;     /* remove bit 4 */
                break;
            case op_noround:
                doround = 0;
                break;
#endif
            case op_nostatus:           /* no status word => noinx,noround */
                statusmask = 0;
                doround = 0;
                break;
            case op_v:
            case op_verbose:
                verbose = 1;
                break;
            case op_fo:
                fo = 1;
                break;
            case op_strict:             /* tolerance is 1 ulp */
                strict = 1;
                break;
            default:
                fprintf(stderr, "unrecognised option: %s\n", p);
                break;
            }
        } else {
            files[nfiles++] = p;
        }
    }

    passed = failed = declined = 0;

    if (nfiles) {
        for (i = 0; i < nfiles; i++) {
            FILE *fp = fopen(files[i], "r");
            if (!fp) {
                fprintf(stderr, "Couldn't open %s\n", files[i]);
            } else
                runtests(files[i], fp);
        }
    } else
        runtests("(stdin)", stdin);

    printf("Completed. Passed %d, failed %d (total %d",
           passed, failed, passed+failed);
    if (declined)
        printf(" plus %d declined", declined);
    printf(")\n");
    if (failed || passed == 0)
        return 1;
    printf("** TEST PASSED OK **\n");
    return 0;
}

/* Stub installed for functions absent from the library under test:
 * record a failure rather than crashing. */
void undef_func() {
    failed++;
    puts("ERROR: undefined function called");
}
diff --git a/contrib/arm-optimized-routines/math/test/rtest/dotest.c b/contrib/arm-optimized-routines/math/test/rtest/dotest.c
index 6be79e1df0d1..5b3e9b4f18e4 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/dotest.c
+++ b/contrib/arm-optimized-routines/math/test/rtest/dotest.c
@@ -1,2167 +1,2167 @@
/*
 * dotest.c - actually generate mathlib test cases
 *
 * Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* NOTE(review): the six system #include directives below lost their
 * <...> header names during text extraction — presumably things like
 * <assert.h>, <stdio.h>, <stdlib.h>, <string.h>, <mpfr.h> and <mpc.h>
 * given the code that follows; restore from the upstream file. */
#include
#include
#include
#include
#include
#include
#include "semi.h"
#include "intern.h"
#include "random.h"

#define MPFR_PREC 96 /* good enough for float or double + a few extra bits */

extern int lib_fo, lib_no_arith, ntests;

/*
 * Prototypes.
*/ static void cases_biased(uint32 *, uint32, uint32); static void cases_biased_positive(uint32 *, uint32, uint32); static void cases_biased_float(uint32 *, uint32, uint32); static void cases_uniform(uint32 *, uint32, uint32); static void cases_uniform_positive(uint32 *, uint32, uint32); static void cases_uniform_float(uint32 *, uint32, uint32); static void cases_uniform_float_positive(uint32 *, uint32, uint32); static void log_cases(uint32 *, uint32, uint32); static void log_cases_float(uint32 *, uint32, uint32); static void log1p_cases(uint32 *, uint32, uint32); static void log1p_cases_float(uint32 *, uint32, uint32); static void minmax_cases(uint32 *, uint32, uint32); static void minmax_cases_float(uint32 *, uint32, uint32); static void atan2_cases(uint32 *, uint32, uint32); static void atan2_cases_float(uint32 *, uint32, uint32); static void pow_cases(uint32 *, uint32, uint32); static void pow_cases_float(uint32 *, uint32, uint32); static void rred_cases(uint32 *, uint32, uint32); static void rred_cases_float(uint32 *, uint32, uint32); static void cases_semi1(uint32 *, uint32, uint32); static void cases_semi1_float(uint32 *, uint32, uint32); static void cases_semi2(uint32 *, uint32, uint32); static void cases_semi2_float(uint32 *, uint32, uint32); static void cases_ldexp(uint32 *, uint32, uint32); static void cases_ldexp_float(uint32 *, uint32, uint32); static void complex_cases_uniform(uint32 *, uint32, uint32); static void complex_cases_uniform_float(uint32 *, uint32, uint32); static void complex_cases_biased(uint32 *, uint32, uint32); static void complex_cases_biased_float(uint32 *, uint32, uint32); static void complex_log_cases(uint32 *, uint32, uint32); static void complex_log_cases_float(uint32 *, uint32, uint32); static void complex_pow_cases(uint32 *, uint32, uint32); static void complex_pow_cases_float(uint32 *, uint32, uint32); static void complex_arithmetic_cases(uint32 *, uint32, uint32); static void complex_arithmetic_cases_float(uint32 *, uint32, 
uint32);
static uint32 doubletop(int x, int scale);
static uint32 floatval(int x, int scale);

/*
 * Convert back and forth between IEEE bit patterns and the
 * mpfr_t/mpc_t types.
 */

/* Load the IEEE double whose big-endian bit-pattern words are h (sign,
 * exponent, top 20 mantissa bits) and l (low 32 mantissa bits) into the
 * mpfr_t x, preserving NaN, infinity and signed-zero cases exactly. */
static void set_mpfr_d(mpfr_t x, uint32 h, uint32 l)
{
    uint64_t hl = ((uint64_t)h << 32) | l;
    uint32 exp = (hl >> 52) & 0x7ff;
    int64_t mantissa = hl & (((uint64_t)1 << 52) - 1);
    int sign = (hl >> 63) ? -1 : +1;

    if (exp == 0x7ff) {
        /* Maximum exponent: infinity (zero mantissa) or NaN. */
        if (mantissa == 0)
            mpfr_set_inf(x, sign);
        else
            mpfr_set_nan(x);
    } else if (exp == 0 && mantissa == 0) {
        /* Signed zero. */
        mpfr_set_ui(x, 0, GMP_RNDN);
        mpfr_setsign(x, x, sign < 0, GMP_RNDN);
    } else {
        /* Normal numbers gain the implicit leading bit; denormals
         * instead bump the exponent so the scale factor is right. */
        if (exp != 0)
            mantissa |= ((uint64_t)1 << 52);
        else
            exp++;
        mpfr_set_sj_2exp(x, mantissa * sign, (int)exp - 0x3ff - 52,
                         GMP_RNDN);
    }
}

/* Single-precision analogue of set_mpfr_d: load the IEEE float with
 * bit pattern f into the mpfr_t x. */
static void set_mpfr_f(mpfr_t x, uint32 f)
{
    uint32 exp = (f >> 23) & 0xff;
    int32 mantissa = f & ((1 << 23) - 1);
    int sign = (f >> 31) ? -1 : +1;

    if (exp == 0xff) {
        /* Maximum exponent: infinity or NaN. */
        if (mantissa == 0)
            mpfr_set_inf(x, sign);
        else
            mpfr_set_nan(x);
    } else if (exp == 0 && mantissa == 0) {
        /* Signed zero. */
        mpfr_set_ui(x, 0, GMP_RNDN);
        mpfr_setsign(x, x, sign < 0, GMP_RNDN);
    } else {
        if (exp != 0)
            mantissa |= (1 << 23);
        else
            exp++;
        mpfr_set_sj_2exp(x, mantissa * sign, (int)exp - 0x7f - 23,
                         GMP_RNDN);
    }
}

/* Build the mpc_t z from double bit patterns: real part (rh,rl),
 * imaginary part (ih,il). */
static void set_mpc_d(mpc_t z, uint32 rh, uint32 rl, uint32 ih, uint32 il)
{
    mpfr_t x, y;
    mpfr_init2(x, MPFR_PREC);
    mpfr_init2(y, MPFR_PREC);
    set_mpfr_d(x, rh, rl);
    set_mpfr_d(y, ih, il);
    mpc_set_fr_fr(z, x, y, MPC_RNDNN);
    mpfr_clear(x);
    mpfr_clear(y);
}

/* Build the mpc_t z from float bit patterns: real part r, imaginary
 * part i. */
static void set_mpc_f(mpc_t z, uint32 r, uint32 i)
{
    mpfr_t x, y;
    mpfr_init2(x, MPFR_PREC);
    mpfr_init2(y, MPFR_PREC);
    set_mpfr_f(x, r);
    set_mpfr_f(y, i);
    mpc_set_fr_fr(z, x, y, MPC_RNDNN);
    mpfr_clear(x);
    mpfr_clear(y);
}

/* Convert the mpfr_t x to an IEEE double bit pattern rounded toward
 * zero: *h and *l receive the high/low words, and *extra receives 32
 * further mantissa bits below the double's precision (used by the
 * test generator to express results to extra precision).  NaN is
 * canonicalised to the default quiet NaN pattern. */
static void get_mpfr_d(const mpfr_t x, uint32 *h, uint32 *l, uint32 *extra)
{
    uint32_t sign, expfield, mantfield;
    mpfr_t significand;
    int exp;

    if (mpfr_nan_p(x)) {
        *h = 0x7ff80000;
        *l = 0;
        *extra = 0;
        return;
    }

    sign = mpfr_signbit(x) ? 0x80000000U : 0;

    if (mpfr_inf_p(x)) {
        *h = 0x7ff00000 | sign;
        *l = 0;
        *extra = 0;
        return;
    }

    if (mpfr_zero_p(x)) {
        *h = 0x00000000 | sign;
        *l = 0;
        *extra = 0;
        return;
    }

    mpfr_init2(significand, MPFR_PREC);
    mpfr_set(significand, x, GMP_RNDN);
    exp = mpfr_get_exp(significand);
    mpfr_set_exp(significand, 0);

    /* Now significand is in [1/2,1), and significand * 2^exp == x.
     * So the IEEE exponent corresponding to exp==0 is 0x3fe. */
    if (exp > 0x400) {
        /* overflow to infinity anyway */
        *h = 0x7ff00000 | sign;
        *l = 0;
        *extra = 0;
        mpfr_clear(significand);
        return;
    }
    if (exp <= -0x3fe || mpfr_zero_p(x))
        exp = -0x3fd;                  /* denormalise */
    expfield = exp + 0x3fd; /* offset to cancel leading mantissa bit */

    /* Peel off the mantissa 32 bits at a time: first the top 21 bits
     * (leading bit cancels into the exponent field), then two further
     * 32-bit chunks for *l and *extra. */
    mpfr_div_2si(significand, x, exp - 21, GMP_RNDN);
    mpfr_abs(significand, significand, GMP_RNDN);
    mantfield = mpfr_get_ui(significand, GMP_RNDZ);
    *h = sign + ((uint64_t)expfield << 20) + mantfield;
    mpfr_sub_ui(significand, significand, mantfield, GMP_RNDN);
    mpfr_mul_2ui(significand, significand, 32, GMP_RNDN);
    mantfield = mpfr_get_ui(significand, GMP_RNDZ);
    *l = mantfield;
    mpfr_sub_ui(significand, significand, mantfield, GMP_RNDN);
    mpfr_mul_2ui(significand, significand, 32, GMP_RNDN);
    mantfield = mpfr_get_ui(significand, GMP_RNDZ);
    *extra = mantfield;

    mpfr_clear(significand);
}

/* Single-precision analogue of get_mpfr_d: convert the mpfr_t x to an
 * IEEE float bit pattern in *f plus 32 extra mantissa bits in
 * *extra. */
static void get_mpfr_f(const mpfr_t x, uint32 *f, uint32 *extra)
{
    uint32_t sign, expfield, mantfield;
    mpfr_t significand;
    int exp;

    if (mpfr_nan_p(x)) {
        *f = 0x7fc00000;
        *extra = 0;
        return;
    }

    sign = mpfr_signbit(x) ? 0x80000000U : 0;

    if (mpfr_inf_p(x)) {
        *f = 0x7f800000 | sign;
        *extra = 0;
        return;
    }

    if (mpfr_zero_p(x)) {
        *f = 0x00000000 | sign;
        *extra = 0;
        return;
    }

    mpfr_init2(significand, MPFR_PREC);
    mpfr_set(significand, x, GMP_RNDN);
    exp = mpfr_get_exp(significand);
    mpfr_set_exp(significand, 0);

    /* Now significand is in [1/2,1), and significand * 2^exp == x.
     * So the IEEE exponent corresponding to exp==0 is 0x7e. */
    if (exp > 0x80) {
        /* overflow to infinity anyway */
        *f = 0x7f800000 | sign;
        *extra = 0;
        mpfr_clear(significand);
        return;
    }
    if (exp <= -0x7e || mpfr_zero_p(x))
        exp = -0x7d;                   /* denormalise */
    expfield = exp + 0x7d; /* offset to cancel leading mantissa bit */

    mpfr_div_2si(significand, x, exp - 24, GMP_RNDN);
    mpfr_abs(significand, significand, GMP_RNDN);
    mantfield = mpfr_get_ui(significand, GMP_RNDZ);
    *f = sign + ((uint64_t)expfield << 23) + mantfield;
    mpfr_sub_ui(significand, significand, mantfield, GMP_RNDN);
    mpfr_mul_2ui(significand, significand, 32, GMP_RNDN);
    mantfield = mpfr_get_ui(significand, GMP_RNDZ);
    *extra = mantfield;

    mpfr_clear(significand);
}

/* Split the mpc_t z into double bit patterns for its real part
 * (*rh,*rl,*rextra) and imaginary part (*ih,*il,*iextra). */
static void get_mpc_d(const mpc_t z, uint32 *rh, uint32 *rl, uint32 *rextra,
                      uint32 *ih, uint32 *il, uint32 *iextra)
{
    mpfr_t x, y;
    mpfr_init2(x, MPFR_PREC);
    mpfr_init2(y, MPFR_PREC);
    mpc_real(x, z, GMP_RNDN);
    mpc_imag(y, z, GMP_RNDN);
    get_mpfr_d(x, rh, rl, rextra);
    get_mpfr_d(y, ih, il, iextra);
    mpfr_clear(x);
    mpfr_clear(y);
}

/* Split the mpc_t z into float bit patterns for its real part
 * (*r,*rextra) and imaginary part (*i,*iextra). */
static void get_mpc_f(const mpc_t z, uint32 *r, uint32 *rextra,
                      uint32 *i, uint32 *iextra)
{
    mpfr_t x, y;
    mpfr_init2(x, MPFR_PREC);
    mpfr_init2(y, MPFR_PREC);
    mpc_real(x, z, GMP_RNDN);
    mpc_imag(y, z, GMP_RNDN);
    get_mpfr_f(x, r, rextra);
    get_mpfr_f(y, i, iextra);
    mpfr_clear(x);
    mpfr_clear(y);
}

/*
 * Implementation of mathlib functions that aren't trivially
 * implementable using an existing mpfr or mpc function.
 */

/* Reference trig range reduction: set ret = x rem (pi/2) and store the
 * quadrant number (quo mod 4) in *quadrant.  Returns the mpfr ternary
 * status from mpfr_remquo. */
int test_rred(mpfr_t ret, const mpfr_t x, int *quadrant)
{
    mpfr_t halfpi;
    long quo;
    int status;

    /*
     * In the worst case of range reduction, we get an input of size
     * around 2^1024, and must find its remainder mod pi, which means
     * we need 1024 bits of pi at least. Plus, the remainder might
     * happen to come out very very small if we're unlucky. How
     * unlucky can we be? Well, conveniently, I once went through and
     * actually worked that out using Paxson's modular minimisation
     * algorithm, and it turns out that the smallest exponent you can
     * get out of a nontrivial[1] double precision range reduction is
     * 0x3c2, i.e. of the order of 2^-61. So we need 1024 bits of pi
     * to get us down to the units digit, another 61 or so bits (say
     * 64) to get down to the highest set bit of the output, and then
     * some bits to make the actual mantissa big enough.
     *
     * [1] of course the output of range reduction can have an
     * arbitrarily small exponent in the trivial case, where the
     * input is so small that it's the identity function. That
     * doesn't count.
     */
    mpfr_init2(halfpi, MPFR_PREC + 1024 + 64);
    mpfr_const_pi(halfpi, GMP_RNDN);
    mpfr_div_ui(halfpi, halfpi, 2, GMP_RNDN);
    status = mpfr_remquo(ret, &quo, x, halfpi, GMP_RNDN);
    *quadrant = quo & 3;
    mpfr_clear(halfpi);
    return status;
}

/* Reference lgamma with the sign output discarded, so it matches the
 * common real->real mpfr function prototype used by the table. */
int test_lgamma(mpfr_t ret, const mpfr_t x, mpfr_rnd_t rnd)
{
    /*
     * mpfr_lgamma takes an extra int * parameter to hold the output
     * sign. We don't bother testing that, so this wrapper throws away
     * the sign and hence fits into the same function prototype as all
     * the other real->real mpfr functions.
     *
     * There is also mpfr_lngamma which has no sign output and hence
     * has the right prototype already, but unfortunately it returns
     * NaN in cases where gamma(x) < 0, so it's no use to us.
     */
    int sign;
    return mpfr_lgamma(ret, &sign, x, rnd);
}

/* Reference complex pow, computed at greatly increased intermediate
 * precision to sidestep an accuracy bug in older MPC (fixed upstream
 * after release 1.0.2). */
int test_cpow(mpc_t ret, const mpc_t x, const mpc_t y, mpc_rnd_t rnd)
{
    /*
     * For complex pow, we must bump up the precision by a huge amount
     * if we want it to get the really difficult cases right. (Not
     * that we expect the library under test to be getting those cases
     * right itself, but we'd at least like the test suite to report
     * them as wrong for the _right reason_.)
     *
     * This works around a bug in mpc_pow(), fixed by r1455 in the MPC
     * svn repository (2014-10-14) and expected to be in any MPC
     * release after 1.0.2 (which was the latest release already made
     * at the time of the fix). So as and when we update to an MPC
     * with the fix in it, we could remove this workaround.
     *
     * For the reasons for choosing this amount of extra precision,
     * see analysis in complex/cpownotes.txt for the rationale for the
     * amount.
     */
    mpc_t xbig, ybig, retbig;
    int status;

    mpc_init2(xbig, 1034 + 53 + 60 + MPFR_PREC);
    mpc_init2(ybig, 1034 + 53 + 60 + MPFR_PREC);
    mpc_init2(retbig, 1034 + 53 + 60 + MPFR_PREC);

    mpc_set(xbig, x, MPC_RNDNN);
    mpc_set(ybig, y, MPC_RNDNN);
    status = mpc_pow(retbig, xbig, ybig, rnd);
    mpc_set(ret, retbig, rnd);

    mpc_clear(xbig);
    mpc_clear(ybig);
    mpc_clear(retbig);

    return status;
}

/*
 * Identify 'hard' values (NaN, Inf, nonzero denormal) for deciding
 * whether microlib will decline to run a test.
 */
#define is_shard(in) ( \
    (((in)[0] & 0x7F800000) == 0x7F800000 || \
     (((in)[0] & 0x7F800000) == 0 && ((in)[0]&0x7FFFFFFF) != 0)))

#define is_dhard(in) ( \
    (((in)[0] & 0x7FF00000) == 0x7FF00000 || \
     (((in)[0] & 0x7FF00000) == 0 && (((in)[0] & 0xFFFFF) | (in)[1]) != 0)))

/*
 * Identify integers.
 */

/* Return nonzero iff the double with bit-pattern words in[0..1] is a
 * finite value with no fractional part (it equals its own ceiling). */
int is_dinteger(uint32 *in)
{
    uint32 out[3];
    if ((0x7FF00000 & ~in[0]) == 0)
        return 0;                      /* not finite, hence not integer */
    test_ceil(in, out);
    return in[0] == out[0] && in[1] == out[1];
}

/* Single-precision analogue of is_dinteger, using in[0] only. */
int is_sinteger(uint32 *in)
{
    uint32 out[3];
    if ((0x7F800000 & ~in[0]) == 0)
        return 0;                      /* not finite, hence not integer */
    test_ceilf(in, out);
    return in[0] == out[0];
}

/*
 * Identify signalling NaNs.
*/ int is_dsnan(const uint32 *in) { if ((in[0] & 0x7FF00000) != 0x7FF00000) return 0; /* not the inf/nan exponent */ if ((in[0] << 12) == 0 && in[1] == 0) return 0; /* inf */ if (in[0] & 0x00080000) return 0; /* qnan */ return 1; } int is_ssnan(const uint32 *in) { if ((in[0] & 0x7F800000) != 0x7F800000) return 0; /* not the inf/nan exponent */ if ((in[0] << 9) == 0) return 0; /* inf */ if (in[0] & 0x00400000) return 0; /* qnan */ return 1; } int is_snan(const uint32 *in, int size) { return size == 2 ? is_dsnan(in) : is_ssnan(in); } /* * Wrapper functions called to fix up unusual results after the main * test function has run. */ void universal_wrapper(wrapperctx *ctx) { /* * Any SNaN input gives rise to a QNaN output. */ int op; for (op = 0; op < wrapper_get_nops(ctx); op++) { int size = wrapper_get_size(ctx, op); if (!wrapper_is_complex(ctx, op) && is_snan(wrapper_get_ieee(ctx, op), size)) { wrapper_set_nan(ctx); } } } Testable functions[] = { /* * Trig functions: sin, cos, tan. We test the core function * between -16 and +16: we assume that range reduction exists * and will be used for larger arguments, and we'll test that * separately. Also we only go down to 2^-27 in magnitude, * because below that sin(x)=tan(x)=x and cos(x)=1 as far as * double precision can tell, which is boring. 
*/ {"sin", (funcptr)mpfr_sin, args1, {NULL}, cases_uniform, 0x3e400000, 0x40300000}, {"sinf", (funcptr)mpfr_sin, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x41800000}, {"cos", (funcptr)mpfr_cos, args1, {NULL}, cases_uniform, 0x3e400000, 0x40300000}, {"cosf", (funcptr)mpfr_cos, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x41800000}, {"tan", (funcptr)mpfr_tan, args1, {NULL}, cases_uniform, 0x3e400000, 0x40300000}, {"tanf", (funcptr)mpfr_tan, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x41800000}, {"sincosf_sinf", (funcptr)mpfr_sin, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x41800000}, {"sincosf_cosf", (funcptr)mpfr_cos, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x41800000}, /* * Inverse trig: asin, acos. Between 1 and -1, of course. acos * goes down to 2^-54, asin to 2^-27. */ {"asin", (funcptr)mpfr_asin, args1, {NULL}, cases_uniform, 0x3e400000, 0x3fefffff}, {"asinf", (funcptr)mpfr_asin, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x3f7fffff}, {"acos", (funcptr)mpfr_acos, args1, {NULL}, cases_uniform, 0x3c900000, 0x3fefffff}, {"acosf", (funcptr)mpfr_acos, args1f, {NULL}, cases_uniform_float, 0x33800000, 0x3f7fffff}, /* * Inverse trig: atan. atan is stable (in double prec) with * argument magnitude past 2^53, so we'll test up to there. * atan(x) is boringly just x below 2^-27. */ {"atan", (funcptr)mpfr_atan, args1, {NULL}, cases_uniform, 0x3e400000, 0x43400000}, {"atanf", (funcptr)mpfr_atan, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x4b800000}, /* * atan2. Interesting cases arise when the exponents of the * arguments differ by at most about 50. */ {"atan2", (funcptr)mpfr_atan2, args2, {NULL}, atan2_cases, 0}, {"atan2f", (funcptr)mpfr_atan2, args2f, {NULL}, atan2_cases_float, 0}, /* * The exponentials: exp, sinh, cosh. They overflow at around * 710. exp and sinh are boring below 2^-54, cosh below 2^-27. 
*/ {"exp", (funcptr)mpfr_exp, args1, {NULL}, cases_uniform, 0x3c900000, 0x40878000}, {"expf", (funcptr)mpfr_exp, args1f, {NULL}, cases_uniform_float, 0x33800000, 0x42dc0000}, {"sinh", (funcptr)mpfr_sinh, args1, {NULL}, cases_uniform, 0x3c900000, 0x40878000}, {"sinhf", (funcptr)mpfr_sinh, args1f, {NULL}, cases_uniform_float, 0x33800000, 0x42dc0000}, {"cosh", (funcptr)mpfr_cosh, args1, {NULL}, cases_uniform, 0x3e400000, 0x40878000}, {"coshf", (funcptr)mpfr_cosh, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x42dc0000}, /* * tanh is stable past around 20. It's boring below 2^-27. */ {"tanh", (funcptr)mpfr_tanh, args1, {NULL}, cases_uniform, 0x3e400000, 0x40340000}, {"tanhf", (funcptr)mpfr_tanh, args1f, {NULL}, cases_uniform, 0x39800000, 0x41100000}, /* * log must be tested only on positive numbers, but can cover * the whole range of positive nonzero finite numbers. It never * gets boring. */ {"log", (funcptr)mpfr_log, args1, {NULL}, log_cases, 0}, {"logf", (funcptr)mpfr_log, args1f, {NULL}, log_cases_float, 0}, {"log10", (funcptr)mpfr_log10, args1, {NULL}, log_cases, 0}, {"log10f", (funcptr)mpfr_log10, args1f, {NULL}, log_cases_float, 0}, /* * pow. */ {"pow", (funcptr)mpfr_pow, args2, {NULL}, pow_cases, 0}, {"powf", (funcptr)mpfr_pow, args2f, {NULL}, pow_cases_float, 0}, /* * Trig range reduction. We are able to test this for all * finite values, but will only bother for things between 2^-3 * and 2^+52. */ {"rred", (funcptr)test_rred, rred, {NULL}, rred_cases, 0}, {"rredf", (funcptr)test_rred, rredf, {NULL}, rred_cases_float, 0}, /* * Square and cube root. 
*/
    {"sqrt", (funcptr)mpfr_sqrt, args1, {NULL}, log_cases, 0},
    {"sqrtf", (funcptr)mpfr_sqrt, args1f, {NULL}, log_cases_float, 0},
    {"cbrt", (funcptr)mpfr_cbrt, args1, {NULL}, log_cases, 0},
    {"cbrtf", (funcptr)mpfr_cbrt, args1f, {NULL}, log_cases_float, 0},
    /* hypot reuses the atan2 generators: both want two operands whose
     * exponents are within shouting distance of each other. */
    {"hypot", (funcptr)mpfr_hypot, args2, {NULL}, atan2_cases, 0},
    {"hypotf", (funcptr)mpfr_hypot, args2f, {NULL}, atan2_cases_float, 0},
    /*
     * Seminumerical functions.
     */
    {"ceil", (funcptr)test_ceil, semi1, {NULL}, cases_semi1},
    {"ceilf", (funcptr)test_ceilf, semi1f, {NULL}, cases_semi1_float},
    {"floor", (funcptr)test_floor, semi1, {NULL}, cases_semi1},
    {"floorf", (funcptr)test_floorf, semi1f, {NULL}, cases_semi1_float},
    {"fmod", (funcptr)test_fmod, semi2, {NULL}, cases_semi2},
    {"fmodf", (funcptr)test_fmodf, semi2f, {NULL}, cases_semi2_float},
    {"ldexp", (funcptr)test_ldexp, t_ldexp, {NULL}, cases_ldexp},
    {"ldexpf", (funcptr)test_ldexpf, t_ldexpf, {NULL}, cases_ldexp_float},
    {"frexp", (funcptr)test_frexp, t_frexp, {NULL}, cases_semi1},
    {"frexpf", (funcptr)test_frexpf, t_frexpf, {NULL}, cases_semi1_float},
    {"modf", (funcptr)test_modf, t_modf, {NULL}, cases_semi1},
    {"modff", (funcptr)test_modff, t_modff, {NULL}, cases_semi1_float},
    /*
     * Classification and more semi-numericals
     */
    {"copysign", (funcptr)test_copysign, semi2, {NULL}, cases_semi2},
    {"copysignf", (funcptr)test_copysignf, semi2f, {NULL}, cases_semi2_float},
    /* Classification rows range over every bit pattern, NaNs and
     * infinities included (bounds 0 .. 0x7fffffff). */
    {"isfinite", (funcptr)test_isfinite, classify, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isfinitef", (funcptr)test_isfinitef, classifyf, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"isinf", (funcptr)test_isinf, classify, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isinff", (funcptr)test_isinff, classifyf, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"isnan", (funcptr)test_isnan, classify, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isnanf", (funcptr)test_isnanf, classifyf, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"isnormal", (funcptr)test_isnormal, classify, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isnormalf", (funcptr)test_isnormalf, classifyf, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"signbit", (funcptr)test_signbit, classify, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"signbitf", (funcptr)test_signbitf, classifyf, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"fpclassify", (funcptr)test_fpclassify, classify, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"fpclassifyf", (funcptr)test_fpclassifyf, classifyf, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    /*
     * Comparisons
     */
    {"isgreater", (funcptr)test_isgreater, compare, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isgreaterequal", (funcptr)test_isgreaterequal, compare, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isless", (funcptr)test_isless, compare, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"islessequal", (funcptr)test_islessequal, compare, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"islessgreater", (funcptr)test_islessgreater, compare, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isunordered", (funcptr)test_isunordered, compare, {NULL}, cases_uniform, 0, 0x7fffffff},
    {"isgreaterf", (funcptr)test_isgreaterf, comparef, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"isgreaterequalf", (funcptr)test_isgreaterequalf, comparef, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"islessf", (funcptr)test_islessf, comparef, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"islessequalf", (funcptr)test_islessequalf, comparef, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"islessgreaterf", (funcptr)test_islessgreaterf, comparef, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    {"isunorderedf", (funcptr)test_isunorderedf, comparef, {NULL}, cases_uniform_float, 0, 0x7fffffff},
    /*
     * Inverse Hyperbolic functions
     */
    {"atanh", (funcptr)mpfr_atanh, args1, {NULL}, cases_uniform, 0x3e400000, 0x3fefffff},
    {"asinh", (funcptr)mpfr_asinh, args1, {NULL}, cases_uniform, 0x3e400000, 0x3fefffff},
    {"acosh", (funcptr)mpfr_acosh, args1, {NULL}, cases_uniform_positive, 0x3ff00000, 0x7fefffff},
    {"atanhf", (funcptr)mpfr_atanh, args1f, {NULL}, cases_uniform_float,
0x32000000, 0x3f7fffff},
    {"asinhf", (funcptr)mpfr_asinh, args1f, {NULL}, cases_uniform_float, 0x32000000, 0x3f7fffff},
    {"acoshf", (funcptr)mpfr_acosh, args1f, {NULL}, cases_uniform_float_positive, 0x3f800000, 0x7f800000},
    /*
     * Everything else (sitting in a section down here at the bottom
     * because historically they were not tested because we didn't
     * have reference implementations for them)
     */
    /* Complex functions are checked against MPC; the complex_* case
     * generators fill both real and imaginary parts. */
    {"csin", (funcptr)mpc_sin, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"csinf", (funcptr)mpc_sin, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"ccos", (funcptr)mpc_cos, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"ccosf", (funcptr)mpc_cos, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"ctan", (funcptr)mpc_tan, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"ctanf", (funcptr)mpc_tan, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"casin", (funcptr)mpc_asin, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"casinf", (funcptr)mpc_asin, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"cacos", (funcptr)mpc_acos, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"cacosf", (funcptr)mpc_acos, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"catan", (funcptr)mpc_atan, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"catanf", (funcptr)mpc_atan, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"csinh", (funcptr)mpc_sinh, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"csinhf", (funcptr)mpc_sinh, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"ccosh", (funcptr)mpc_cosh, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"ccoshf", (funcptr)mpc_cosh, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"ctanh", (funcptr)mpc_tanh, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"ctanhf", (funcptr)mpc_tanh, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"casinh", (funcptr)mpc_asinh, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"casinhf", (funcptr)mpc_asinh, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"cacosh", (funcptr)mpc_acosh, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"cacoshf", (funcptr)mpc_acosh, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"catanh", (funcptr)mpc_atanh, args1c, {NULL}, complex_cases_uniform, 0x3f000000, 0x40300000},
    {"catanhf", (funcptr)mpc_atanh, args1fc, {NULL}, complex_cases_uniform_float, 0x38000000, 0x41800000},
    {"cexp", (funcptr)mpc_exp, args1c, {NULL}, complex_cases_uniform, 0x3c900000, 0x40862000},
    {"cpow", (funcptr)test_cpow, args2c, {NULL}, complex_pow_cases, 0x3fc00000, 0x40000000},
    {"clog", (funcptr)mpc_log, args1c, {NULL}, complex_log_cases, 0, 0},
    {"csqrt", (funcptr)mpc_sqrt, args1c, {NULL}, complex_log_cases, 0, 0},
    {"cexpf", (funcptr)mpc_exp, args1fc, {NULL}, complex_cases_uniform_float, 0x24800000, 0x42b00000},
    {"cpowf", (funcptr)test_cpow, args2fc, {NULL}, complex_pow_cases_float, 0x3e000000, 0x41000000},
    {"clogf", (funcptr)mpc_log, args1fc, {NULL}, complex_log_cases_float, 0, 0},
    {"csqrtf", (funcptr)mpc_sqrt, args1fc, {NULL}, complex_log_cases_float, 0, 0},
    /* Basic complex arithmetic, full finite range. */
    {"cdiv", (funcptr)mpc_div, args2c, {NULL}, complex_arithmetic_cases, 0, 0},
    {"cmul", (funcptr)mpc_mul, args2c, {NULL}, complex_arithmetic_cases, 0, 0},
    {"cadd", (funcptr)mpc_add, args2c, {NULL}, complex_arithmetic_cases, 0, 0},
    {"csub", (funcptr)mpc_sub, args2c, {NULL}, complex_arithmetic_cases, 0, 0},
    {"cdivf", (funcptr)mpc_div, args2fc, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"cmulf", (funcptr)mpc_mul, args2fc, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"caddf", (funcptr)mpc_add, args2fc, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"csubf", (funcptr)mpc_sub, args2fc, {NULL}, complex_arithmetic_cases_float, 0, 0},
    /* Complex-in, real-out functions (args1cr/args1fcr). */
    {"cabsf", (funcptr)mpc_abs, args1fcr, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"cabs", (funcptr)mpc_abs, args1cr, {NULL}, complex_arithmetic_cases, 0, 0},
    {"cargf", (funcptr)mpc_arg, args1fcr, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"carg", (funcptr)mpc_arg, args1cr, {NULL}, complex_arithmetic_cases, 0, 0},
    {"cimagf", (funcptr)mpc_imag, args1fcr, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"cimag", (funcptr)mpc_imag, args1cr, {NULL}, complex_arithmetic_cases, 0, 0},
    {"conjf", (funcptr)mpc_conj, args1fc, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"conj", (funcptr)mpc_conj, args1c, {NULL}, complex_arithmetic_cases, 0, 0},
    {"cprojf", (funcptr)mpc_proj, args1fc, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"cproj", (funcptr)mpc_proj, args1c, {NULL}, complex_arithmetic_cases, 0, 0},
    {"crealf", (funcptr)mpc_real, args1fcr, {NULL}, complex_arithmetic_cases_float, 0, 0},
    {"creal", (funcptr)mpc_real, args1cr, {NULL}, complex_arithmetic_cases, 0, 0},
    /* Error functions use the biased generators, which presumably
     * concentrate cases towards the low bound -- confirm against the
     * cases_biased implementation. */
    {"erfcf", (funcptr)mpfr_erfc, args1f, {NULL}, cases_biased_float, 0x1e800000, 0x41000000},
    {"erfc", (funcptr)mpfr_erfc, args1, {NULL}, cases_biased, 0x3bd00000, 0x403c0000},
    {"erff", (funcptr)mpfr_erf, args1f, {NULL}, cases_biased_float, 0x03800000, 0x40700000},
    {"erf", (funcptr)mpfr_erf, args1, {NULL}, cases_biased, 0x00800000, 0x40200000},
    {"exp2f", (funcptr)mpfr_exp2, args1f, {NULL}, cases_uniform_float, 0x33800000, 0x43c00000},
    {"exp2", (funcptr)mpfr_exp2, args1, {NULL}, cases_uniform, 0x3ca00000, 0x40a00000},
    {"expm1f", (funcptr)mpfr_expm1, args1f, {NULL}, cases_uniform_float, 0x33000000, 0x43800000},
    {"expm1", (funcptr)mpfr_expm1, args1, {NULL}, cases_uniform, 0x3c900000, 0x409c0000},
    {"fmaxf", (funcptr)mpfr_max, args2f, {NULL}, minmax_cases_float, 0, 0x7f7fffff},
    {"fmax", (funcptr)mpfr_max, args2, {NULL}, minmax_cases, 0, 0x7fefffff},
    {"fminf", (funcptr)mpfr_min, args2f, {NULL}, minmax_cases_float, 0, 0x7f7fffff},
    {"fmin",
(funcptr)mpfr_min, args2, {NULL}, minmax_cases, 0, 0x7fefffff}, {"lgammaf", (funcptr)test_lgamma, args1f, {NULL}, cases_uniform_float, 0x01800000, 0x7f800000}, {"lgamma", (funcptr)test_lgamma, args1, {NULL}, cases_uniform, 0x00100000, 0x7ff00000}, {"log1pf", (funcptr)mpfr_log1p, args1f, {NULL}, log1p_cases_float, 0, 0}, {"log1p", (funcptr)mpfr_log1p, args1, {NULL}, log1p_cases, 0, 0}, {"log2f", (funcptr)mpfr_log2, args1f, {NULL}, log_cases_float, 0, 0}, {"log2", (funcptr)mpfr_log2, args1, {NULL}, log_cases, 0, 0}, {"tgammaf", (funcptr)mpfr_gamma, args1f, {NULL}, cases_uniform_float, 0x2f800000, 0x43000000}, {"tgamma", (funcptr)mpfr_gamma, args1, {NULL}, cases_uniform, 0x3c000000, 0x40800000}, }; const int nfunctions = ( sizeof(functions)/sizeof(*functions) ); #define random_sign ( random_upto(1) ? 0x80000000 : 0 ) static int iszero(uint32 *x) { return !((x[0] & 0x7FFFFFFF) || x[1]); } static void complex_log_cases(uint32 *out, uint32 param1, uint32 param2) { cases_uniform(out,0x00100000,0x7fefffff); cases_uniform(out+2,0x00100000,0x7fefffff); } static void complex_log_cases_float(uint32 *out, uint32 param1, uint32 param2) { cases_uniform_float(out,0x00800000,0x7f7fffff); cases_uniform_float(out+2,0x00800000,0x7f7fffff); } static void complex_cases_biased(uint32 *out, uint32 lowbound, uint32 highbound) { cases_biased(out,lowbound,highbound); cases_biased(out+2,lowbound,highbound); } static void complex_cases_biased_float(uint32 *out, uint32 lowbound, uint32 highbound) { cases_biased_float(out,lowbound,highbound); cases_biased_float(out+2,lowbound,highbound); } static void complex_cases_uniform(uint32 *out, uint32 lowbound, uint32 highbound) { cases_uniform(out,lowbound,highbound); cases_uniform(out+2,lowbound,highbound); } static void complex_cases_uniform_float(uint32 *out, uint32 lowbound, uint32 highbound) { cases_uniform_float(out,lowbound,highbound); cases_uniform(out+2,lowbound,highbound); } static void complex_pow_cases(uint32 *out, uint32 lowbound, uint32 
highbound) {
    /*
     * Generating non-overflowing cases for complex pow:
     *
     * Our base has both parts within the range [1/2,2], and hence
     * its magnitude is within [1/2,2*sqrt(2)]. The magnitude of its
     * logarithm in base 2 is therefore at most the magnitude of
     * (log2(2*sqrt(2)) + i*pi/log(2)), or in other words
     * hypot(3/2,pi/log(2)) = 4.77. So the magnitude of the exponent
     * input must be at most our output magnitude limit (as a power
     * of two) divided by that.
     *
     * I also set the output magnitude limit a bit low, because we
     * don't guarantee (and neither does glibc) to prevent internal
     * overflow in cases where the output _magnitude_ overflows but
     * scaling it back down by cos and sin of the argument brings it
     * back in range.
     */
    /* Base: both parts in [1/2,2) (double top-word bounds). */
    cases_uniform(out,0x3fe00000, 0x40000000);
    cases_uniform(out+2,0x3fe00000, 0x40000000);
    /* Exponent: both parts bounded as derived above. */
    cases_uniform(out+4,0x3f800000, 0x40600000);
    cases_uniform(out+6,0x3f800000, 0x40600000);
}

static void complex_pow_cases_float(uint32 *out, uint32 lowbound,
                                    uint32 highbound) {
    /*
     * Reasoning as above, though of course the detailed numbers are
     * all different.
     */
    cases_uniform_float(out,0x3f000000, 0x40000000);
    cases_uniform_float(out+2,0x3f000000, 0x40000000);
    cases_uniform_float(out+4,0x3d600000, 0x41900000);
    cases_uniform_float(out+6,0x3d600000, 0x41900000);
}

/* Full-range finite operands for complex arithmetic: both parts of
 * two complex numbers, eight words in all. Bounds arguments are
 * ignored. */
static void complex_arithmetic_cases(uint32 *out, uint32 lowbound,
                                     uint32 highbound) {
    cases_uniform(out,0,0x7fefffff);
    cases_uniform(out+2,0,0x7fefffff);
    cases_uniform(out+4,0,0x7fefffff);
    cases_uniform(out+6,0,0x7fefffff);
}

/* Single-precision variant of the above. */
static void complex_arithmetic_cases_float(uint32 *out, uint32 lowbound,
                                           uint32 highbound) {
    cases_uniform_float(out,0,0x7f7fffff);
    cases_uniform_float(out+2,0,0x7f7fffff);
    cases_uniform_float(out+4,0,0x7f7fffff);
    cases_uniform_float(out+6,0,0x7f7fffff);
}

/*
 * Included from fplib test suite, in a compact self-contained
 * form.
*/ void float32_case(uint32 *ret) { int n, bits; uint32 f; static int premax, preptr; static uint32 *specifics = NULL; if (!ret) { if (specifics) free(specifics); specifics = NULL; premax = preptr = 0; return; } if (!specifics) { int exps[] = { -127, -126, -125, -24, -4, -3, -2, -1, 0, 1, 2, 3, 4, 24, 29, 30, 31, 32, 61, 62, 63, 64, 126, 127, 128 }; int sign, eptr; uint32 se, j; /* * We want a cross product of: * - each of two sign bits (2) * - each of the above (unbiased) exponents (25) * - the following list of fraction parts: * * zero (1) * * all bits (1) * * one-bit-set (23) * * one-bit-clear (23) * * one-bit-and-above (20: 3 are duplicates) * * one-bit-and-below (20: 3 are duplicates) * (total 88) * (total 4400) */ specifics = malloc(4400 * sizeof(*specifics)); preptr = 0; for (sign = 0; sign <= 1; sign++) { for (eptr = 0; eptr < sizeof(exps)/sizeof(*exps); eptr++) { se = (sign ? 0x80000000 : 0) | ((exps[eptr]+127) << 23); /* * Zero. */ specifics[preptr++] = se | 0; /* * All bits. */ specifics[preptr++] = se | 0x7FFFFF; /* * One-bit-set. */ for (j = 1; j && j <= 0x400000; j <<= 1) specifics[preptr++] = se | j; /* * One-bit-clear. */ for (j = 1; j && j <= 0x400000; j <<= 1) specifics[preptr++] = se | (0x7FFFFF ^ j); /* * One-bit-and-everything-below. */ for (j = 2; j && j <= 0x100000; j <<= 1) specifics[preptr++] = se | (2*j-1); /* * One-bit-and-everything-above. */ for (j = 4; j && j <= 0x200000; j <<= 1) specifics[preptr++] = se | (0x7FFFFF ^ (j-1)); /* * Done. */ } } assert(preptr == 4400); premax = preptr; } /* * Decide whether to return a pre or a random case. */ n = random32() % (premax+1); if (n < preptr) { /* * Return pre[n]. */ uint32 t; t = specifics[n]; specifics[n] = specifics[preptr-1]; specifics[preptr-1] = t; /* (not really needed) */ preptr--; *ret = t; } else { /* * Random case. 
* Sign and exponent: * - FIXME * Significand: * - with prob 1/5, a totally random bit pattern * - with prob 1/5, all 1s down to some point and then random * - with prob 1/5, all 1s up to some point and then random * - with prob 1/5, all 0s down to some point and then random * - with prob 1/5, all 0s up to some point and then random */ n = random32() % 5; f = random32(); /* some random bits */ bits = random32() % 22 + 1; /* 1-22 */ switch (n) { case 0: break; /* leave f alone */ case 1: f |= (1< 0x3FF, the range is [-0x432/(e-0x3FF),+0x400/(e-0x3FF)] * * For e == 0x3FE or e == 0x3FF, the range gets infinite at one * end or the other, so we have to be cleverer: pick a number n * of useful bits in the mantissa (1 thru 52, so 1 must imply * 0x3ff00000.00000001 whereas 52 is anything at least as big * as 0x3ff80000.00000000; for e == 0x3fe, 1 necessarily means * 0x3fefffff.ffffffff and 52 is anything at most as big as * 0x3fe80000.00000000). Then, as it happens, a sensible * maximum power is 2^(63-n) for e == 0x3fe, and 2^(62-n) for * e == 0x3ff. * * We inevitably get some overflows in approximating the log * curves by these nasty step functions, but that's all right - * we do want _some_ overflows to be tested. * * Having got that, then, it's just a matter of inventing a * probability distribution for all of this. */ int e, n; uint32 dmin, dmax; const uint32 pmin = 0x3e100000; /* * Generate exponents in a slightly biased fashion. */ e = (random_upto(1) ? /* is exponent small or big? */ 0x3FE - random_upto_biased(0x431,2) : /* small */ 0x3FF + random_upto_biased(0x3FF,2)); /* big */ /* * Now split into cases. */ if (e < 0x3FE || e > 0x3FF) { uint32 imin, imax; if (e < 0x3FE) imin = 0x40000 / (0x3FE - e), imax = 0x43200 / (0x3FE - e); else imin = 0x43200 / (e - 0x3FF), imax = 0x40000 / (e - 0x3FF); /* Power range runs from -imin to imax. Now convert to doubles */ dmin = doubletop(imin, -8); dmax = doubletop(imax, -8); /* Compute the number of mantissa bits. 
*/
	n = (e > 0 ? 53 : 52+e);
    } else {
	/* Critical exponents. Generate a top bit index. */
	n = 52 - random_upto_biased(51, 4);
	if (e == 0x3FE)
	    dmax = 63 - n;
	else
	    dmax = 62 - n;
	dmax = (dmax << 20) + 0x3FF00000;
	dmin = dmax;
    }
    /* Generate a mantissa. */
    if (n <= 32) {
	/* All useful bits fit in the low word; force the top useful
	 * bit set so the operand has exactly n mantissa bits. */
	out[0] = 0;
	out[1] = random_upto((1 << (n-1)) - 1) + (1 << (n-1));
    } else if (n == 33) {
	out[0] = 1;
	out[1] = random_upto(0xFFFFFFFF);
    } else if (n > 33) {
	out[0] = random_upto((1 << (n-33)) - 1) + (1 << (n-33));
	out[1] = random_upto(0xFFFFFFFF);
    }
    /* Negate the mantissa if e == 0x3FE. */
    if (e == 0x3FE) {
	/* Two's-complement negate across the two words. */
	out[1] = -out[1];
	out[0] = -out[0];
	if (out[1]) out[0]--;
    }
    /* Put the exponent on. */
    out[0] &= 0xFFFFF;
    out[0] |= ((e > 0 ? e : 0) << 20);
    /* Generate a power. Powers don't go below 2^-30. */
    if (random_upto(1)) {
	/* Positive power */
	out[2] = dmax - random_upto_biased(dmax-pmin, 10);
    } else {
	/* Negative power */
	out[2] = (dmin - random_upto_biased(dmin-pmin, 10)) | 0x80000000;
    }
    out[3] = random_upto(0xFFFFFFFF);
}

/*
 * Generate a base/power pair for powf, written directly as IEEE bit
 * patterns: out[0] is presumably the base x and out[2] the power y
 * (out[1] and out[3] are the unused double low words) -- confirm
 * against the args2f consumer.
 */
static void pow_cases_float(uint32 *out, uint32 param1, uint32 param2) {
    /*
     * Pick an exponent e (-0x16 to +0xFE) for x, and here's the
     * range of numbers we can use as y:
     *
     * For e < 0x7E, the range is [-0x80/(0x7E-e),+0x95/(0x7E-e)]
     * For e > 0x7F, the range is [-0x95/(e-0x7F),+0x80/(e-0x7F)]
     *
     * For e == 0x7E or e == 0x7F, the range gets infinite at one
     * end or the other, so we have to be cleverer: pick a number n
     * of useful bits in the mantissa (1 thru 23, so 1 must imply
     * 0x3f800001 whereas 23 is anything at least as big as
     * 0x3fc00000; for e == 0x7e, 1 necessarily means 0x3f7fffff
     * and 23 is anything at most as big as 0x3f400000). Then, as
     * it happens, a sensible maximum power is 2^(31-n) for e ==
     * 0x7e, and 2^(30-n) for e == 0x7f.
     *
     * We inevitably get some overflows in approximating the log
     * curves by these nasty step functions, but that's all right -
     * we do want _some_ overflows to be tested.
     *
     * Having got that, then, it's just a matter of inventing a
     * probability distribution for all of this.
     */
    int e, n;
    uint32 dmin, dmax;
    const uint32 pmin = 0x38000000;
    /*
     * Generate exponents in a slightly biased fashion.
     */
    e = (random_upto(1) ?	       /* is exponent small or big? */
	 0x7E - random_upto_biased(0x94,2) :   /* small */
	 0x7F + random_upto_biased(0x7f,2));   /* big */
    /*
     * Now split into cases.
     */
    if (e < 0x7E || e > 0x7F) {
	uint32 imin, imax;
	if (e < 0x7E)
	    imin = 0x8000 / (0x7e - e), imax = 0x9500 / (0x7e - e);
	else
	    imin = 0x9500 / (e - 0x7f), imax = 0x8000 / (e - 0x7f);
	/* Power range runs from -imin to imax. Now convert to doubles */
	dmin = floatval(imin, -8);
	dmax = floatval(imax, -8);
	/* Compute the number of mantissa bits. */
	n = (e > 0 ? 24 : 23+e);
    } else {
	/* Critical exponents. Generate a top bit index. */
	n = 23 - random_upto_biased(22, 4);
	if (e == 0x7E)
	    dmax = 31 - n;
	else
	    dmax = 30 - n;
	dmax = (dmax << 23) + 0x3F800000;
	dmin = dmax;
    }
    /* Generate a mantissa. */
    out[0] = random_upto((1 << (n-1)) - 1) + (1 << (n-1));
    out[1] = 0;
    /* Negate the mantissa if e == 0x7E. */
    if (e == 0x7E) {
	out[0] = -out[0];
    }
    /* Put the exponent on. */
    out[0] &= 0x7FFFFF;
    out[0] |= ((e > 0 ? e : 0) << 23);
    /* Generate a power. Powers don't go below 2^-15.
*/
    if (random_upto(1)) {
	/* Positive power */
	out[2] = dmax - random_upto_biased(dmax-pmin, 10);
    } else {
	/* Negative power */
	out[2] = (dmin - random_upto_biased(dmin-pmin, 10)) | 0x80000000;
    }
    out[3] = 0;
}

/*
 * Predict whether the library under test is expected to decline this
 * case, and if not, bump the global expected-test counter (ntests).
 * lib_fo and lib_no_arith are presumably command-line flags for the
 * library configuration -- confirm against the option parsing; the
 * is_dhard/is_shard predicates decide whether an operand or result is
 * "hard" for such a library.
 */
void vet_for_decline(Testable *fn, uint32 *args, uint32 *result,
                     int got_errno_in)
{
    int declined = 0;
    /* First switch: vet every input operand according to the
     * function's argument layout. */
    switch (fn->type) {
      case args1:
      case rred:
      case semi1:
      case t_frexp:
      case t_modf:
      case classify:
      case t_ldexp:
	declined |= lib_fo && is_dhard(args+0);
	break;
      case args1f:
      case rredf:
      case semi1f:
      case t_frexpf:
      case t_modff:
      case classifyf:
	declined |= lib_fo && is_shard(args+0);
	break;
      case args2:
      case semi2:
      case args1c:
      case args1cr:
      case compare:
	declined |= lib_fo && is_dhard(args+0);
	declined |= lib_fo && is_dhard(args+2);
	break;
      case args2f:
      case semi2f:
      case t_ldexpf:
      case comparef:
      case args1fc:
      case args1fcr:
	declined |= lib_fo && is_shard(args+0);
	declined |= lib_fo && is_shard(args+2);
	break;
      case args2c:
	declined |= lib_fo && is_dhard(args+0);
	declined |= lib_fo && is_dhard(args+2);
	declined |= lib_fo && is_dhard(args+4);
	declined |= lib_fo && is_dhard(args+6);
	break;
      case args2fc:
	declined |= lib_fo && is_shard(args+0);
	declined |= lib_fo && is_shard(args+2);
	declined |= lib_fo && is_shard(args+4);
	declined |= lib_fo && is_shard(args+6);
	break;
    }
    /* Second switch: vet the result words, whose layout also depends
     * on the function type. */
    switch (fn->type) {
      case args1:		       /* return an extra-precise result */
      case args2:
      case rred:
      case semi1:		       /* return a double result */
      case semi2:
      case t_ldexp:
      case t_frexp:		       /* return double * int */
      case args1cr:
	declined |= lib_fo && is_dhard(result);
	break;
      case args1f:
      case args2f:
      case rredf:
      case semi1f:
      case semi2f:
      case t_ldexpf:
      case args1fcr:
	declined |= lib_fo && is_shard(result);
	break;
      case t_modf:		       /* return double * double */
	declined |= lib_fo && is_dhard(result+0);
	declined |= lib_fo && is_dhard(result+2);
	break;
      case t_modff:		       /* return float * float */
	declined |= lib_fo && is_shard(result+2);
	/* fall through */
      case t_frexpf:		       /* return float * int */
	declined |= lib_fo && is_shard(result+0);
	break;
      case args1c:
      case args2c:
	declined |= lib_fo && is_dhard(result+0);
	declined |= lib_fo && is_dhard(result+4);
	break;
      case args1fc:
      case args2fc:
	declined |= lib_fo && is_shard(result+0);
	declined |= lib_fo && is_shard(result+4);
	break;
    }
    /* Expect basic arithmetic tests to be declined if the command
     * line said that would happen */
    declined |= (lib_no_arith &&
		 (fn->func == (funcptr)mpc_add ||
		  fn->func == (funcptr)mpc_sub ||
		  fn->func == (funcptr)mpc_mul ||
		  fn->func == (funcptr)mpc_div));
    if (!declined) {
	/* An errno-bearing case counts as one expected test, others
	 * as three -- presumably result, errno and status lines;
	 * confirm against the output comparison logic. */
	if (got_errno_in)
	    ntests++;
	else
	    ntests += 3;
    }
}

/*
 * Run one test case against the MPFR/MPC reference implementation
 * and print the expected-result line for it.
 */
void docase(Testable *fn, uint32 *args)
{
    uint32 result[8];	    /* real part in first 4, imaginary part in last 4 */
    char *errstr = NULL;
    mpfr_t a, b, r;
    mpc_t ac, bc, rc;
    int rejected, printextra;
    wrapperctx ctx;

    mpfr_init2(a, MPFR_PREC);
    mpfr_init2(b, MPFR_PREC);
    mpfr_init2(r, MPFR_PREC);
    mpc_init2(ac, MPFR_PREC);
    mpc_init2(bc, MPFR_PREC);
    mpc_init2(rc, MPFR_PREC);

    printf("func=%s", fn->name);
    rejected = 0; /* FIXME */
    /* Print the operands in the layout appropriate to the function
     * type: two hex words per double, one per float. */
    switch (fn->type) {
      case args1:
      case rred:
      case semi1:
      case t_frexp:
      case t_modf:
      case classify:
	printf(" op1=%08x.%08x", args[0], args[1]);
	break;
      case args1f:
      case rredf:
      case semi1f:
      case t_frexpf:
      case t_modff:
      case classifyf:
	printf(" op1=%08x", args[0]);
	break;
      case args2:
      case semi2:
      case compare:
	printf(" op1=%08x.%08x", args[0], args[1]);
	printf(" op2=%08x.%08x", args[2], args[3]);
	break;
      case args2f:
      case semi2f:
      case t_ldexpf:
      case comparef:
	printf(" op1=%08x", args[0]);
	printf(" op2=%08x", args[2]);
	break;
      case t_ldexp:
	printf(" op1=%08x.%08x", args[0], args[1]);
	printf(" op2=%08x", args[2]);
	break;
      case args1c:
      case args1cr:
	printf(" op1r=%08x.%08x", args[0], args[1]);
	printf(" op1i=%08x.%08x", args[2], args[3]);
	break;
      case args2c:
	printf(" op1r=%08x.%08x", args[0], args[1]);
	printf(" op1i=%08x.%08x", args[2], args[3]);
	printf(" op2r=%08x.%08x", args[4], args[5]);
	printf(" op2i=%08x.%08x", args[6], args[7]);
	break;
      case args1fc:
      case args1fcr:
	printf(" op1r=%08x",
args[0]); printf(" op1i=%08x", args[2]); break; case args2fc: printf(" op1r=%08x", args[0]); printf(" op1i=%08x", args[2]); printf(" op2r=%08x", args[4]); printf(" op2i=%08x", args[6]); break; default: fprintf(stderr, "internal inconsistency?!\n"); abort(); } if (rejected == 2) { printf(" - test case rejected\n"); goto cleanup; } wrapper_init(&ctx); if (rejected == 0) { switch (fn->type) { case args1: set_mpfr_d(a, args[0], args[1]); wrapper_op_real(&ctx, a, 2, args); ((testfunc1)(fn->func))(r, a, GMP_RNDN); get_mpfr_d(r, &result[0], &result[1], &result[2]); wrapper_result_real(&ctx, r, 2, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_d(r, &result[0], &result[1], &result[2]); break; case args1cr: set_mpc_d(ac, args[0], args[1], args[2], args[3]); wrapper_op_complex(&ctx, ac, 2, args); ((testfunc1cr)(fn->func))(r, ac, GMP_RNDN); get_mpfr_d(r, &result[0], &result[1], &result[2]); wrapper_result_real(&ctx, r, 2, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_d(r, &result[0], &result[1], &result[2]); break; case args1f: set_mpfr_f(a, args[0]); wrapper_op_real(&ctx, a, 1, args); ((testfunc1)(fn->func))(r, a, GMP_RNDN); get_mpfr_f(r, &result[0], &result[1]); wrapper_result_real(&ctx, r, 1, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_f(r, &result[0], &result[1]); break; case args1fcr: set_mpc_f(ac, args[0], args[2]); wrapper_op_complex(&ctx, ac, 1, args); ((testfunc1cr)(fn->func))(r, ac, GMP_RNDN); get_mpfr_f(r, &result[0], &result[1]); wrapper_result_real(&ctx, r, 1, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_f(r, &result[0], &result[1]); break; case args2: set_mpfr_d(a, args[0], args[1]); wrapper_op_real(&ctx, a, 2, args); set_mpfr_d(b, args[2], args[3]); wrapper_op_real(&ctx, b, 2, args+2); ((testfunc2)(fn->func))(r, a, b, GMP_RNDN); get_mpfr_d(r, &result[0], &result[1], &result[2]); wrapper_result_real(&ctx, r, 2, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_d(r, &result[0], &result[1], &result[2]); break; case args2f: 
set_mpfr_f(a, args[0]); wrapper_op_real(&ctx, a, 1, args); set_mpfr_f(b, args[2]); wrapper_op_real(&ctx, b, 1, args+2); ((testfunc2)(fn->func))(r, a, b, GMP_RNDN); get_mpfr_f(r, &result[0], &result[1]); wrapper_result_real(&ctx, r, 1, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_f(r, &result[0], &result[1]); break; case rred: set_mpfr_d(a, args[0], args[1]); wrapper_op_real(&ctx, a, 2, args); ((testrred)(fn->func))(r, a, (int *)&result[3]); get_mpfr_d(r, &result[0], &result[1], &result[2]); wrapper_result_real(&ctx, r, 2, result); /* We never need to mess about with the integer auxiliary * output. */ if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_d(r, &result[0], &result[1], &result[2]); break; case rredf: set_mpfr_f(a, args[0]); wrapper_op_real(&ctx, a, 1, args); ((testrred)(fn->func))(r, a, (int *)&result[3]); get_mpfr_f(r, &result[0], &result[1]); wrapper_result_real(&ctx, r, 1, result); /* We never need to mess about with the integer auxiliary * output. */ if (wrapper_run(&ctx, fn->wrappers)) get_mpfr_f(r, &result[0], &result[1]); break; case semi1: case semi1f: errstr = ((testsemi1)(fn->func))(args, result); break; case semi2: case compare: errstr = ((testsemi2)(fn->func))(args, args+2, result); break; case semi2f: case comparef: case t_ldexpf: errstr = ((testsemi2f)(fn->func))(args, args+2, result); break; case t_ldexp: errstr = ((testldexp)(fn->func))(args, args+2, result); break; case t_frexp: errstr = ((testfrexp)(fn->func))(args, result, result+2); break; case t_frexpf: errstr = ((testfrexp)(fn->func))(args, result, result+2); break; case t_modf: errstr = ((testmodf)(fn->func))(args, result, result+2); break; case t_modff: errstr = ((testmodf)(fn->func))(args, result, result+2); break; case classify: errstr = ((testclassify)(fn->func))(args, &result[0]); break; case classifyf: errstr = ((testclassifyf)(fn->func))(args, &result[0]); break; case args1c: set_mpc_d(ac, args[0], args[1], args[2], args[3]); wrapper_op_complex(&ctx, ac, 2, args); 
((testfunc1c)(fn->func))(rc, ac, MPC_RNDNN); get_mpc_d(rc, &result[0], &result[1], &result[2], &result[4], &result[5], &result[6]); wrapper_result_complex(&ctx, rc, 2, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpc_d(rc, &result[0], &result[1], &result[2], &result[4], &result[5], &result[6]); break; case args2c: set_mpc_d(ac, args[0], args[1], args[2], args[3]); wrapper_op_complex(&ctx, ac, 2, args); set_mpc_d(bc, args[4], args[5], args[6], args[7]); wrapper_op_complex(&ctx, bc, 2, args+4); ((testfunc2c)(fn->func))(rc, ac, bc, MPC_RNDNN); get_mpc_d(rc, &result[0], &result[1], &result[2], &result[4], &result[5], &result[6]); wrapper_result_complex(&ctx, rc, 2, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpc_d(rc, &result[0], &result[1], &result[2], &result[4], &result[5], &result[6]); break; case args1fc: set_mpc_f(ac, args[0], args[2]); wrapper_op_complex(&ctx, ac, 1, args); ((testfunc1c)(fn->func))(rc, ac, MPC_RNDNN); get_mpc_f(rc, &result[0], &result[1], &result[4], &result[5]); wrapper_result_complex(&ctx, rc, 1, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpc_f(rc, &result[0], &result[1], &result[4], &result[5]); break; case args2fc: set_mpc_f(ac, args[0], args[2]); wrapper_op_complex(&ctx, ac, 1, args); set_mpc_f(bc, args[4], args[6]); wrapper_op_complex(&ctx, bc, 1, args+4); ((testfunc2c)(fn->func))(rc, ac, bc, MPC_RNDNN); get_mpc_f(rc, &result[0], &result[1], &result[4], &result[5]); wrapper_result_complex(&ctx, rc, 1, result); if (wrapper_run(&ctx, fn->wrappers)) get_mpc_f(rc, &result[0], &result[1], &result[4], &result[5]); break; default: fprintf(stderr, "internal inconsistency?!\n"); abort(); } } switch (fn->type) { case args1: /* return an extra-precise result */ case args2: case args1cr: case rred: printextra = 1; if (rejected == 0) { errstr = NULL; if (!mpfr_zero_p(a)) { if ((result[0] & 0x7FFFFFFF) == 0 && result[1] == 0) { /* * If the output is +0 or -0 apart from the extra * precision in result[2], then there's a tricky * 
judgment call about what we require in the * output. If we output the extra bits and set * errstr="?underflow" then mathtest will tolerate * the function under test rounding down to zero * _or_ up to the minimum denormal; whereas if we * suppress the extra bits and set * errstr="underflow", then mathtest will enforce * that the function really does underflow to zero. * * But where to draw the line? It seems clear to * me that numbers along the lines of * 00000000.00000000.7ff should be treated * similarly to 00000000.00000000.801, but on the * other hand, we must surely be prepared to * enforce a genuine underflow-to-zero in _some_ * case where the true mathematical output is * nonzero but absurdly tiny. * * I think a reasonable place to draw the * distinction is at 00000000.00000000.400, i.e. * one quarter of the minimum positive denormal. * If a value less than that rounds up to the * minimum denormal, that must mean the function * under test has managed to make an error of an * entire factor of two, and that's something we * should fix. Above that, you can misround within * the limits of your accuracy bound if you have * to. */ if (result[2] < 0x40000000) { /* Total underflow (ERANGE + UFL) is required, * and we suppress the extra bits to make * mathtest enforce that the output is really * zero. */ errstr = "underflow"; printextra = 0; } else { /* Total underflow is not required, but if the * function rounds down to zero anyway, then * we should be prepared to tolerate it. */ errstr = "?underflow"; } } else if (!(result[0] & 0x7ff00000)) { /* * If the output is denormal, we usually expect a * UFL exception, warning the user of partial * underflow. The exception is if the denormal * being returned is just one of the input values, * unchanged even in principle. I bodgily handle * this by just special-casing the functions in * question below. 
*/ if (!strcmp(fn->name, "fmax") || !strcmp(fn->name, "fmin") || !strcmp(fn->name, "creal") || !strcmp(fn->name, "cimag")) { /* no error expected */ } else { errstr = "u"; } } else if ((result[0] & 0x7FFFFFFF) > 0x7FEFFFFF) { /* * Infinite results are usually due to overflow, * but one exception is lgamma of a negative * integer. */ if (!strcmp(fn->name, "lgamma") && (args[0] & 0x80000000) != 0 && /* negative */ is_dinteger(args)) { errstr = "ERANGE status=z"; } else { errstr = "overflow"; } printextra = 0; } } else { /* lgamma(0) is also a pole. */ if (!strcmp(fn->name, "lgamma")) { errstr = "ERANGE status=z"; printextra = 0; } } } if (!printextra || (rejected && !(rejected==1 && result[2]!=0))) { printf(" result=%08x.%08x", result[0], result[1]); } else { printf(" result=%08x.%08x.%03x", result[0], result[1], (result[2] >> 20) & 0xFFF); } if (fn->type == rred) { printf(" res2=%08x", result[3]); } break; case args1f: case args2f: case args1fcr: case rredf: printextra = 1; if (rejected == 0) { errstr = NULL; if (!mpfr_zero_p(a)) { if ((result[0] & 0x7FFFFFFF) == 0) { /* * Decide whether to print the extra bits based on * just how close to zero the number is. See the * big comment in the double-precision case for * discussion. */ if (result[1] < 0x40000000) { errstr = "underflow"; printextra = 0; } else { errstr = "?underflow"; } } else if (!(result[0] & 0x7f800000)) { /* * Functions which do not report partial overflow * are listed here as special cases. (See the * corresponding double case above for a fuller * comment.) */ if (!strcmp(fn->name, "fmaxf") || !strcmp(fn->name, "fminf") || !strcmp(fn->name, "crealf") || !strcmp(fn->name, "cimagf")) { /* no error expected */ } else { errstr = "u"; } } else if ((result[0] & 0x7FFFFFFF) > 0x7F7FFFFF) { /* * Infinite results are usually due to overflow, * but one exception is lgamma of a negative * integer. 
*/ if (!strcmp(fn->name, "lgammaf") && (args[0] & 0x80000000) != 0 && /* negative */ is_sinteger(args)) { errstr = "ERANGE status=z"; } else { errstr = "overflow"; } printextra = 0; } } else { /* lgamma(0) is also a pole. */ if (!strcmp(fn->name, "lgammaf")) { errstr = "ERANGE status=z"; printextra = 0; } } } if (!printextra || (rejected && !(rejected==1 && result[1]!=0))) { printf(" result=%08x", result[0]); } else { printf(" result=%08x.%03x", result[0], (result[1] >> 20) & 0xFFF); } if (fn->type == rredf) { printf(" res2=%08x", result[3]); } break; case semi1: /* return a double result */ case semi2: case t_ldexp: printf(" result=%08x.%08x", result[0], result[1]); break; case semi1f: case semi2f: case t_ldexpf: printf(" result=%08x", result[0]); break; case t_frexp: /* return double * int */ printf(" result=%08x.%08x res2=%08x", result[0], result[1], result[2]); break; case t_modf: /* return double * double */ printf(" result=%08x.%08x res2=%08x.%08x", result[0], result[1], result[2], result[3]); break; case t_modff: /* return float * float */ /* fall through */ case t_frexpf: /* return float * int */ printf(" result=%08x res2=%08x", result[0], result[2]); break; case classify: case classifyf: case compare: case comparef: printf(" result=%x", result[0]); break; case args1c: case args2c: if (0/* errstr */) { printf(" resultr=%08x.%08x", result[0], result[1]); printf(" resulti=%08x.%08x", result[4], result[5]); } else { printf(" resultr=%08x.%08x.%03x", result[0], result[1], (result[2] >> 20) & 0xFFF); printf(" resulti=%08x.%08x.%03x", result[4], result[5], (result[6] >> 20) & 0xFFF); } /* Underflow behaviour doesn't seem to be specified for complex arithmetic */ errstr = "?underflow"; break; case args1fc: case args2fc: if (0/* errstr */) { printf(" resultr=%08x", result[0]); printf(" resulti=%08x", result[4]); } else { printf(" resultr=%08x.%03x", result[0], (result[1] >> 20) & 0xFFF); printf(" resulti=%08x.%03x", result[4], (result[5] >> 20) & 0xFFF); } /* 
Underflow behaviour doesn't seem to be specified for complex arithmetic */ errstr = "?underflow"; break; } if (errstr && *(errstr+1) == '\0') { printf(" errno=0 status=%c",*errstr); } else if (errstr && *errstr == '?') { printf(" maybeerror=%s", errstr+1); } else if (errstr && errstr[0] == 'E') { printf(" errno=%s", errstr); } else { printf(" error=%s", errstr && *errstr ? errstr : "0"); } printf("\n"); vet_for_decline(fn, args, result, 0); cleanup: mpfr_clear(a); mpfr_clear(b); mpfr_clear(r); mpc_clear(ac); mpc_clear(bc); mpc_clear(rc); } void gencases(Testable *fn, int number) { int i; uint32 args[8]; float32_case(NULL); float64_case(NULL); printf("random=on\n"); /* signal to runtests.pl that the following tests are randomly generated */ for (i = 0; i < number; i++) { /* generate test point */ fn->cases(args, fn->caseparam1, fn->caseparam2); docase(fn, args); } printf("random=off\n"); } static uint32 doubletop(int x, int scale) { int e = 0x412 + scale; while (!(x & 0x100000)) x <<= 1, e--; return (e << 20) + x; } static uint32 floatval(int x, int scale) { int e = 0x95 + scale; while (!(x & 0x800000)) x <<= 1, e--; return (e << 23) + x; } diff --git a/contrib/arm-optimized-routines/math/test/rtest/intern.h b/contrib/arm-optimized-routines/math/test/rtest/intern.h index 12a9c749e18e..3ebd7ddaf85d 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/intern.h +++ b/contrib/arm-optimized-routines/math/test/rtest/intern.h @@ -1,91 +1,91 @@ /* * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_intern_h #define mathtest_intern_h #include #include #include "types.h" #include "wrappers.h" /* Generic function pointer. */ typedef void (*funcptr)(void); /* Pointers to test function types. 
*/ typedef int (*testfunc1)(mpfr_t, mpfr_t, mpfr_rnd_t); typedef int (*testfunc2)(mpfr_t, mpfr_t, mpfr_t, mpfr_rnd_t); typedef int (*testrred)(mpfr_t, mpfr_t, int *); typedef char * (*testsemi1)(uint32 *, uint32 *); typedef char * (*testsemi2)(uint32 *, uint32 *, uint32 *); typedef char * (*testsemi2f)(uint32 *, uint32 *, uint32 *); typedef char * (*testldexp)(uint32 *, uint32 *, uint32 *); typedef char * (*testfrexp)(uint32 *, uint32 *, uint32 *); typedef char * (*testmodf)(uint32 *, uint32 *, uint32 *); typedef char * (*testclassify)(uint32 *, uint32 *); typedef char * (*testclassifyf)(uint32 *, uint32 *); typedef int (*testfunc1c)(mpc_t, mpc_t, mpc_rnd_t); typedef int (*testfunc2c)(mpc_t, mpc_t, mpc_t, mpc_rnd_t); typedef int (*testfunc1cr)(mpfr_t, mpc_t, mpfr_rnd_t); /* Pointer to a function that generates random test cases. */ typedef void (*casegen)(uint32 *, uint32, uint32); /* * List of testable functions, their types, and their testable range. */ enum { args1, /* afloat-based, one argument */ args1f, /* same as args1 but in single prec */ args2, /* afloat-based, two arguments */ args2f, /* same as args2 but in single prec */ rred, /* afloat-based, one arg, aux return */ rredf, /* same as rred but in single prec */ semi1, /* seminumerical, one argument */ semi1f, /* seminumerical, 1 arg, float */ semi2, /* seminumerical, two arguments */ semi2f, /* seminumerical, 2 args, floats */ t_ldexp, /* dbl * int -> dbl */ t_ldexpf, /* sgl * int -> sgl */ t_frexp, /* dbl -> dbl * int */ t_frexpf, /* sgl -> sgl * int */ t_modf, /* dbl -> dbl * dbl */ t_modff, /* sgl -> sgl * sgl */ classify, /* classify double: dbl -> int */ classifyf, /* classify float: flt -> int */ compare, /* compare doubles, returns int */ comparef, /* compare floats, returns int */ args1c, /* acomplex-base, one argument */ args2c, args1fc, args2fc, args1cr, /* dbl-complex -> complex */ args1fcr /* sgl-complex -> complex */ }; typedef struct __testable Testable; struct __testable { char *name; 
funcptr func; int type; wrapperfunc wrappers[MAXWRAPPERS]; casegen cases; /* complex functions use the same casegen for both real and complex args */ uint32 caseparam1, caseparam2; }; extern Testable functions[]; extern const int nfunctions; extern void init_pi(void); int nargs_(Testable* f); #endif diff --git a/contrib/arm-optimized-routines/math/test/rtest/main.c b/contrib/arm-optimized-routines/math/test/rtest/main.c index 0d8ead891320..3d533c946f79 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/main.c +++ b/contrib/arm-optimized-routines/math/test/rtest/main.c @@ -1,334 +1,334 @@ /* * main.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include #include "intern.h" void gencases(Testable *fn, int number); void docase(Testable *fn, uint32 *args); void vet_for_decline(Testable *fn, uint32 *args, uint32 *result, int got_errno_in); void seed_random(uint32 seed); int check_declines = 0; int lib_fo = 0; int lib_no_arith = 0; int ntests = 0; int nargs_(Testable* f) { switch((f)->type) { case args2: case args2f: case semi2: case semi2f: case t_ldexp: case t_ldexpf: case args1c: case args1fc: case args1cr: case args1fcr: case compare: case comparef: return 2; case args2c: case args2fc: return 4; default: return 1; } } static int isdouble(Testable *f) { switch (f->type) { case args1: case rred: case semi1: case t_frexp: case t_modf: case classify: case t_ldexp: case args2: case semi2: case args1c: case args1cr: case compare: case args2c: return 1; case args1f: case rredf: case semi1f: case t_frexpf: case t_modff: case classifyf: case args2f: case semi2f: case t_ldexpf: case comparef: case args1fc: case args1fcr: case args2fc: return 0; default: assert(0 && "Bad function type"); } } Testable *find_function(const char *func) { int i; for (i = 0; i < nfunctions; i++) { if (func && !strcmp(func, functions[i].name)) { 
return &functions[i]; } } return NULL; } void get_operand(const char *str, Testable *f, uint32 *word0, uint32 *word1) { struct special { unsigned dblword0, dblword1, sglword; const char *name; } specials[] = { {0x00000000,0x00000000,0x00000000,"0"}, {0x3FF00000,0x00000000,0x3f800000,"1"}, {0x7FF00000,0x00000000,0x7f800000,"inf"}, {0x7FF80000,0x00000001,0x7fc00000,"qnan"}, {0x7FF00000,0x00000001,0x7f800001,"snan"}, {0x3ff921fb,0x54442d18,0x3fc90fdb,"pi2"}, {0x400921fb,0x54442d18,0x40490fdb,"pi"}, {0x3fe921fb,0x54442d18,0x3f490fdb,"pi4"}, {0x4002d97c,0x7f3321d2,0x4016cbe4,"3pi4"}, }; int i; for (i = 0; i < (int)(sizeof(specials)/sizeof(*specials)); i++) { if (!strcmp(str, specials[i].name) || ((str[0] == '-' || str[0] == '+') && !strcmp(str+1, specials[i].name))) { assert(f); if (isdouble(f)) { *word0 = specials[i].dblword0; *word1 = specials[i].dblword1; } else { *word0 = specials[i].sglword; *word1 = 0; } if (str[0] == '-') *word0 |= 0x80000000U; return; } } sscanf(str, "%"I32"x.%"I32"x", word0, word1); } void dofile(FILE *fp, int translating) { char buf[1024], sparebuf[1024], *p; /* * Command syntax is: * * - "seed " sets a random seed * * - "test " generates random test lines * * - " op1=foo [op2=bar]" generates a specific test * - "func= op1=foo [op2=bar]" does the same * - "func= op1=foo result=bar" will just output the line as-is * * - a semicolon or a blank line is ignored */ while (fgets(buf, sizeof(buf), fp)) { buf[strcspn(buf, "\r\n")] = '\0'; strcpy(sparebuf, buf); p = buf; while (*p && isspace(*p)) p++; if (!*p || *p == ';') { /* Comment or blank line. Only print if `translating' is set. 
*/ if (translating) printf("%s\n", buf); continue; } if (!strncmp(buf, "seed ", 5)) { seed_random(atoi(buf+5)); } else if (!strncmp(buf, "random=", 7)) { /* * Copy 'random=on' / 'random=off' lines unconditionally * to the output, so that random test failures can be * accumulated into a recent-failures-list file and * still identified as random-in-origin when re-run the * next day. */ printf("%s\n", buf); } else if (!strncmp(buf, "test ", 5)) { char *p = buf+5; char *q; int ntests, i; q = p; while (*p && !isspace(*p)) p++; if (*p) *p++ = '\0'; while (*p && isspace(*p)) p++; if (*p) ntests = atoi(p); else ntests = 100; /* *shrug* */ for (i = 0; i < nfunctions; i++) { if (!strcmp(q, functions[i].name)) { gencases(&functions[i], ntests); break; } } if (i == nfunctions) { fprintf(stderr, "unknown test `%s'\n", q); } } else { /* * Parse a specific test line. */ uint32 ops[8], result[8]; int got_op = 0; /* &1 for got_op1, &4 for got_op3 etc. */ Testable *f = 0; char *q, *r; int got_result = 0, got_errno_in = 0; for (q = strtok(p, " \t"); q; q = strtok(NULL, " \t")) { r = strchr(q, '='); if (!r) { f = find_function(q); } else { *r++ = '\0'; if (!strcmp(q, "func")) f = find_function(r); else if (!strcmp(q, "op1") || !strcmp(q, "op1r")) { get_operand(r, f, &ops[0], &ops[1]); got_op |= 1; } else if (!strcmp(q, "op2") || !strcmp(q, "op1i")) { get_operand(r, f, &ops[2], &ops[3]); got_op |= 2; } else if (!strcmp(q, "op2r")) { get_operand(r, f, &ops[4], &ops[5]); got_op |= 4; } else if (!strcmp(q, "op2i")) { get_operand(r, f, &ops[6], &ops[7]); got_op |= 8; } else if (!strcmp(q, "result") || !strcmp(q, "resultr")) { get_operand(r, f, &result[0], &result[1]); got_result |= 1; } else if (!strcmp(q, "resulti")) { get_operand(r, f, &result[4], &result[5]); got_result |= 2; } else if (!strcmp(q, "res2")) { get_operand(r, f, &result[2], &result[3]); got_result |= 4; } else if (!strcmp(q, "errno_in")) { got_errno_in = 1; } } } /* * Test cases already set up by the input are not * 
reprocessed by default, unlike the fplib tests. (This * is mostly for historical reasons, because we used to * use a very slow and incomplete internal reference * implementation; now our ref impl is MPFR/MPC it * probably wouldn't be such a bad idea, though we'd still * have to make sure all the special cases came out * right.) If translating==2 (corresponding to the -T * command-line option) then we regenerate everything * regardless. */ if (got_result && translating < 2) { if (f) vet_for_decline(f, ops, result, got_errno_in); puts(sparebuf); continue; } if (f && got_op==(1< 1 && 1==sscanf(*(argv+1),"%u",&seed)) { seed_random(seed); argv++; /* next in argv is seed value, so skip */ --argc; } else if (!strcmp(p, "-fo")) { lib_fo = 1; } else if (!strcmp(p, "-noarith")) { lib_no_arith = 1; } else { fprintf(stderr, "rtest: ignoring unrecognised option '%s'\n", p); errs = 1; } } else { files = 1; if (!errs) { fp = fopen(p, "r"); if (fp) { dofile(fp, translating); fclose(fp); } else { perror(p); errs = 1; } } } } /* * If no filename arguments, use stdin. */ if (!files && !errs) { dofile(stdin, translating); } if (check_declines) { fprintf(stderr, "Tests expected to run: %d\n", ntests); fflush(stderr); } return errs; } diff --git a/contrib/arm-optimized-routines/math/test/rtest/random.c b/contrib/arm-optimized-routines/math/test/rtest/random.c index 56123966b8c4..1de32580b733 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/random.c +++ b/contrib/arm-optimized-routines/math/test/rtest/random.c @@ -1,99 +1,99 @@ /* * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" #include "random.h" static uint32 seedbuf[55]; static int seedptr; void seed_random(uint32 seed) { int i; seedptr = 0; for (i = 0; i < 55; i++) { seed = seed % 44488 * 48271 - seed / 44488 * 3399; seedbuf[i] = seed - 1; } } uint32 base_random(void) { seedptr %= 55; seedbuf[seedptr] += seedbuf[(seedptr+31)%55]; return seedbuf[seedptr++]; } uint32 random32(void) { uint32 a, b, b1, b2; a = base_random(); b = base_random(); for (b1 = 0x80000000, b2 = 1; b1 > b2; b1 >>= 1, b2 <<= 1) { uint32 b3 = b1 | b2; if ((b & b3) != 0 && (b & b3) != b3) b ^= b3; } return a ^ b; } /* * random_upto: generate a uniformly randomised number in the range * 0,...,limit-1. (Precondition: limit > 0.) * * random_upto_biased: generate a number in the same range, but with * the probability skewed towards the high end by means of taking the * maximum of 8*bias+1 samples from the uniform distribution on the * same range. (I don't know why bias is given in that curious way - * historical reasons, I expect.) * * For speed, I separate the implementation of random_upto into the * two stages of (a) generate a bitmask which reduces a 32-bit random * number to within a factor of two of the right range, (b) repeatedly * generate numbers in that range until one is small enough. Splitting * it up like that means that random_upto_biased can do (a) only once * even when it does (b) lots of times. 
*/ static uint32 random_upto_makemask(uint32 limit) { uint32 mask = 0xFFFFFFFF; int i; for (i = 16; i > 0; i >>= 1) if ((limit & (mask >> i)) == limit) mask >>= i; return mask; } static uint32 random_upto_internal(uint32 limit, uint32 mask) { uint32 ret; do { ret = random32() & mask; } while (ret > limit); return ret; } uint32 random_upto(uint32 limit) { uint32 mask = random_upto_makemask(limit); return random_upto_internal(limit, mask); } uint32 random_upto_biased(uint32 limit, int bias) { uint32 mask = random_upto_makemask(limit); uint32 ret = random_upto_internal(limit, mask); while (bias--) { uint32 tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; tmp = random_upto_internal(limit, mask); if (tmp < ret) ret = tmp; } return ret; } diff --git a/contrib/arm-optimized-routines/math/test/rtest/random.h b/contrib/arm-optimized-routines/math/test/rtest/random.h index b4b22df82a3d..0b477d72b234 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/random.h +++ b/contrib/arm-optimized-routines/math/test/rtest/random.h @@ -1,12 +1,12 @@ /* * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" uint32 random32(void); uint32 random_upto(uint32 limit); uint32 random_upto_biased(uint32 limit, int bias); diff --git a/contrib/arm-optimized-routines/math/test/rtest/semi.c b/contrib/arm-optimized-routines/math/test/rtest/semi.c index c9f0daf76508..70a7844a48d6 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/semi.c +++ b/contrib/arm-optimized-routines/math/test/rtest/semi.c @@ -1,905 +1,905 @@ /* * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include "semi.h" static void test_rint(uint32 *in, uint32 *out, int isfloor, int isceil) { int sign = in[0] & 0x80000000; int roundup = (isfloor && sign) || (isceil && !sign); uint32 xh, xl, roundword; int ex = (in[0] >> 20) & 0x7FF; /* exponent */ int i; if ((ex > 0x3ff + 52 - 1) || /* things this big can't be fractional */ ((in[0] & 0x7FFFFFFF) == 0 && in[1] == 0)) { /* zero */ /* NaN, Inf, a large integer, or zero: just return the input */ out[0] = in[0]; out[1] = in[1]; return; } /* * Special case: ex < 0x3ff, ie our number is in (0,1). Return * 1 or 0 according to roundup. */ if (ex < 0x3ff) { out[0] = sign | (roundup ? 0x3FF00000 : 0); out[1] = 0; return; } /* * We're not short of time here, so we'll do this the hideously * inefficient way. Shift bit by bit so that the units place is * somewhere predictable, round, and shift back again. 
*/ xh = in[0]; xl = in[1]; roundword = 0; for (i = ex; i < 0x3ff + 52; i++) { if (roundword & 1) roundword |= 2; /* preserve sticky bit */ roundword = (roundword >> 1) | ((xl & 1) << 31); xl = (xl >> 1) | ((xh & 1) << 31); xh = xh >> 1; } if (roundword && roundup) { xl++; xh += (xl==0); } for (i = ex; i < 0x3ff + 52; i++) { xh = (xh << 1) | ((xl >> 31) & 1); xl = (xl & 0x7FFFFFFF) << 1; } out[0] = xh; out[1] = xl; } char *test_ceil(uint32 *in, uint32 *out) { test_rint(in, out, 0, 1); return NULL; } char *test_floor(uint32 *in, uint32 *out) { test_rint(in, out, 1, 0); return NULL; } static void test_rintf(uint32 *in, uint32 *out, int isfloor, int isceil) { int sign = *in & 0x80000000; int roundup = (isfloor && sign) || (isceil && !sign); uint32 x, roundword; int ex = (*in >> 23) & 0xFF; /* exponent */ int i; if ((ex > 0x7f + 23 - 1) || /* things this big can't be fractional */ (*in & 0x7FFFFFFF) == 0) { /* zero */ /* NaN, Inf, a large integer, or zero: just return the input */ *out = *in; return; } /* * Special case: ex < 0x7f, ie our number is in (0,1). Return * 1 or 0 according to roundup. */ if (ex < 0x7f) { *out = sign | (roundup ? 0x3F800000 : 0); return; } /* * We're not short of time here, so we'll do this the hideously * inefficient way. Shift bit by bit so that the units place is * somewhere predictable, round, and shift back again. 
*/ x = *in; roundword = 0; for (i = ex; i < 0x7F + 23; i++) { if (roundword & 1) roundword |= 2; /* preserve sticky bit */ roundword = (roundword >> 1) | ((x & 1) << 31); x = x >> 1; } if (roundword && roundup) { x++; } for (i = ex; i < 0x7F + 23; i++) { x = x << 1; } *out = x; } char *test_ceilf(uint32 *in, uint32 *out) { test_rintf(in, out, 0, 1); return NULL; } char *test_floorf(uint32 *in, uint32 *out) { test_rintf(in, out, 1, 0); return NULL; } char *test_fmod(uint32 *a, uint32 *b, uint32 *out) { int sign; int32 aex, bex; uint32 am[2], bm[2]; if (((a[0] & 0x7FFFFFFF) << 1) + !!a[1] > 0xFFE00000 || ((b[0] & 0x7FFFFFFF) << 1) + !!b[1] > 0xFFE00000) { /* a or b is NaN: return QNaN, optionally with IVO */ uint32 an, bn; out[0] = 0x7ff80000; out[1] = 1; an = ((a[0] & 0x7FFFFFFF) << 1) + !!a[1]; bn = ((b[0] & 0x7FFFFFFF) << 1) + !!b[1]; if ((an > 0xFFE00000 && an < 0xFFF00000) || (bn > 0xFFE00000 && bn < 0xFFF00000)) return "i"; /* at least one SNaN: IVO */ else return NULL; /* no SNaNs, but at least 1 QNaN */ } if ((b[0] & 0x7FFFFFFF) == 0 && b[1] == 0) { /* b==0: EDOM */ out[0] = 0x7ff80000; out[1] = 1; return "EDOM status=i"; } if ((a[0] & 0x7FF00000) == 0x7FF00000) { /* a==Inf: EDOM */ out[0] = 0x7ff80000; out[1] = 1; return "EDOM status=i"; } if ((b[0] & 0x7FF00000) == 0x7FF00000) { /* b==Inf: return a */ out[0] = a[0]; out[1] = a[1]; return NULL; } if ((a[0] & 0x7FFFFFFF) == 0 && a[1] == 0) { /* a==0: return a */ out[0] = a[0]; out[1] = a[1]; return NULL; } /* * OK. That's the special cases cleared out of the way. Now we * have finite (though not necessarily normal) a and b. 
*/ sign = a[0] & 0x80000000; /* we discard sign of b */ test_frexp(a, am, (uint32 *)&aex); test_frexp(b, bm, (uint32 *)&bex); am[0] &= 0xFFFFF, am[0] |= 0x100000; bm[0] &= 0xFFFFF, bm[0] |= 0x100000; while (aex >= bex) { if (am[0] > bm[0] || (am[0] == bm[0] && am[1] >= bm[1])) { am[1] -= bm[1]; am[0] = am[0] - bm[0] - (am[1] > ~bm[1]); } if (aex > bex) { am[0] = (am[0] << 1) | ((am[1] & 0x80000000) >> 31); am[1] <<= 1; aex--; } else break; } /* * Renormalise final result; this can be cunningly done by * passing a denormal to ldexp. */ aex += 0x3fd; am[0] |= sign; test_ldexp(am, (uint32 *)&aex, out); return NULL; /* FIXME */ } char *test_fmodf(uint32 *a, uint32 *b, uint32 *out) { int sign; int32 aex, bex; uint32 am, bm; if ((*a & 0x7FFFFFFF) > 0x7F800000 || (*b & 0x7FFFFFFF) > 0x7F800000) { /* a or b is NaN: return QNaN, optionally with IVO */ uint32 an, bn; *out = 0x7fc00001; an = *a & 0x7FFFFFFF; bn = *b & 0x7FFFFFFF; if ((an > 0x7f800000 && an < 0x7fc00000) || (bn > 0x7f800000 && bn < 0x7fc00000)) return "i"; /* at least one SNaN: IVO */ else return NULL; /* no SNaNs, but at least 1 QNaN */ } if ((*b & 0x7FFFFFFF) == 0) { /* b==0: EDOM */ *out = 0x7fc00001; return "EDOM status=i"; } if ((*a & 0x7F800000) == 0x7F800000) { /* a==Inf: EDOM */ *out = 0x7fc00001; return "EDOM status=i"; } if ((*b & 0x7F800000) == 0x7F800000) { /* b==Inf: return a */ *out = *a; return NULL; } if ((*a & 0x7FFFFFFF) == 0) { /* a==0: return a */ *out = *a; return NULL; } /* * OK. That's the special cases cleared out of the way. Now we * have finite (though not necessarily normal) a and b. */ sign = a[0] & 0x80000000; /* we discard sign of b */ test_frexpf(a, &am, (uint32 *)&aex); test_frexpf(b, &bm, (uint32 *)&bex); am &= 0x7FFFFF, am |= 0x800000; bm &= 0x7FFFFF, bm |= 0x800000; while (aex >= bex) { if (am >= bm) { am -= bm; } if (aex > bex) { am <<= 1; aex--; } else break; } /* * Renormalise final result; this can be cunningly done by * passing a denormal to ldexp. 
*/ aex += 0x7d; am |= sign; test_ldexpf(&am, (uint32 *)&aex, out); return NULL; /* FIXME */ } char *test_ldexp(uint32 *x, uint32 *np, uint32 *out) { int n = *np; int32 n2; uint32 y[2]; int ex = (x[0] >> 20) & 0x7FF; /* exponent */ int sign = x[0] & 0x80000000; if (ex == 0x7FF) { /* inf/NaN; just return x */ out[0] = x[0]; out[1] = x[1]; return NULL; } if ((x[0] & 0x7FFFFFFF) == 0 && x[1] == 0) { /* zero: return x */ out[0] = x[0]; out[1] = x[1]; return NULL; } test_frexp(x, y, (uint32 *)&n2); ex = n + n2; if (ex > 0x400) { /* overflow */ out[0] = sign | 0x7FF00000; out[1] = 0; return "overflow"; } /* * Underflow. 2^-1074 is 00000000.00000001; so if ex == -1074 * then we have something [2^-1075,2^-1074). Under round-to- * nearest-even, this whole interval rounds up to 2^-1074, * except for the bottom endpoint which rounds to even and is * an underflow condition. * * So, ex < -1074 is definite underflow, and ex == -1074 is * underflow iff all mantissa bits are zero. */ if (ex < -1074 || (ex == -1074 && (y[0] & 0xFFFFF) == 0 && y[1] == 0)) { out[0] = sign; /* underflow: correctly signed zero */ out[1] = 0; return "underflow"; } /* * No overflow or underflow; should be nice and simple, unless * we have to denormalise and round the result. */ if (ex < -1021) { /* denormalise and round */ uint32 roundword; y[0] &= 0x000FFFFF; y[0] |= 0x00100000; /* set leading bit */ roundword = 0; while (ex < -1021) { if (roundword & 1) roundword |= 2; /* preserve sticky bit */ roundword = (roundword >> 1) | ((y[1] & 1) << 31); y[1] = (y[1] >> 1) | ((y[0] & 1) << 31); y[0] = y[0] >> 1; ex++; } if (roundword > 0x80000000 || /* round up */ (roundword == 0x80000000 && (y[1] & 1))) { /* round up to even */ y[1]++; y[0] += (y[1] == 0); } out[0] = sign | y[0]; out[1] = y[1]; /* Proper ERANGE underflow was handled earlier, but we still * expect an IEEE Underflow exception if this partially * underflowed result is not exact. 
*/ if (roundword) return "u"; return NULL; /* underflow was handled earlier */ } else { out[0] = y[0] + (ex << 20); out[1] = y[1]; return NULL; } } char *test_ldexpf(uint32 *x, uint32 *np, uint32 *out) { int n = *np; int32 n2; uint32 y; int ex = (*x >> 23) & 0xFF; /* exponent */ int sign = *x & 0x80000000; if (ex == 0xFF) { /* inf/NaN; just return x */ *out = *x; return NULL; } if ((*x & 0x7FFFFFFF) == 0) { /* zero: return x */ *out = *x; return NULL; } test_frexpf(x, &y, (uint32 *)&n2); ex = n + n2; if (ex > 0x80) { /* overflow */ *out = sign | 0x7F800000; return "overflow"; } /* * Underflow. 2^-149 is 00000001; so if ex == -149 then we have * something [2^-150,2^-149). Under round-to- nearest-even, * this whole interval rounds up to 2^-149, except for the * bottom endpoint which rounds to even and is an underflow * condition. * * So, ex < -149 is definite underflow, and ex == -149 is * underflow iff all mantissa bits are zero. */ if (ex < -149 || (ex == -149 && (y & 0x7FFFFF) == 0)) { *out = sign; /* underflow: correctly signed zero */ return "underflow"; } /* * No overflow or underflow; should be nice and simple, unless * we have to denormalise and round the result. */ if (ex < -125) { /* denormalise and round */ uint32 roundword; y &= 0x007FFFFF; y |= 0x00800000; /* set leading bit */ roundword = 0; while (ex < -125) { if (roundword & 1) roundword |= 2; /* preserve sticky bit */ roundword = (roundword >> 1) | ((y & 1) << 31); y = y >> 1; ex++; } if (roundword > 0x80000000 || /* round up */ (roundword == 0x80000000 && (y & 1))) { /* round up to even */ y++; } *out = sign | y; /* Proper ERANGE underflow was handled earlier, but we still * expect an IEEE Underflow exception if this partially * underflowed result is not exact. 
*/ if (roundword) return "u"; return NULL; /* underflow was handled earlier */ } else { *out = y + (ex << 23); return NULL; } } char *test_frexp(uint32 *x, uint32 *out, uint32 *nout) { int ex = (x[0] >> 20) & 0x7FF; /* exponent */ if (ex == 0x7FF) { /* inf/NaN; return x/0 */ out[0] = x[0]; out[1] = x[1]; nout[0] = 0; return NULL; } if (ex == 0) { /* denormals/zeros */ int sign; uint32 xh, xl; if ((x[0] & 0x7FFFFFFF) == 0 && x[1] == 0) { /* zero: return x/0 */ out[0] = x[0]; out[1] = x[1]; nout[0] = 0; return NULL; } sign = x[0] & 0x80000000; xh = x[0] & 0x7FFFFFFF; xl = x[1]; ex = 1; while (!(xh & 0x100000)) { ex--; xh = (xh << 1) | ((xl >> 31) & 1); xl = (xl & 0x7FFFFFFF) << 1; } out[0] = sign | 0x3FE00000 | (xh & 0xFFFFF); out[1] = xl; nout[0] = ex - 0x3FE; return NULL; } out[0] = 0x3FE00000 | (x[0] & 0x800FFFFF); out[1] = x[1]; nout[0] = ex - 0x3FE; return NULL; /* ordinary number; no error */ } char *test_frexpf(uint32 *x, uint32 *out, uint32 *nout) { int ex = (*x >> 23) & 0xFF; /* exponent */ if (ex == 0xFF) { /* inf/NaN; return x/0 */ *out = *x; nout[0] = 0; return NULL; } if (ex == 0) { /* denormals/zeros */ int sign; uint32 xv; if ((*x & 0x7FFFFFFF) == 0) { /* zero: return x/0 */ *out = *x; nout[0] = 0; return NULL; } sign = *x & 0x80000000; xv = *x & 0x7FFFFFFF; ex = 1; while (!(xv & 0x800000)) { ex--; xv = xv << 1; } *out = sign | 0x3F000000 | (xv & 0x7FFFFF); nout[0] = ex - 0x7E; return NULL; } *out = 0x3F000000 | (*x & 0x807FFFFF); nout[0] = ex - 0x7E; return NULL; /* ordinary number; no error */ } char *test_modf(uint32 *x, uint32 *fout, uint32 *iout) { int ex = (x[0] >> 20) & 0x7FF; /* exponent */ int sign = x[0] & 0x80000000; uint32 fh, fl; if (((x[0] & 0x7FFFFFFF) | (!!x[1])) > 0x7FF00000) { /* * NaN input: return the same in _both_ outputs. 
*/ fout[0] = iout[0] = x[0]; fout[1] = iout[1] = x[1]; return NULL; } test_rint(x, iout, 0, 0); fh = x[0] - iout[0]; fl = x[1] - iout[1]; if (!fh && !fl) { /* no fraction part */ fout[0] = sign; fout[1] = 0; return NULL; } if (!(iout[0] & 0x7FFFFFFF) && !iout[1]) { /* no integer part */ fout[0] = x[0]; fout[1] = x[1]; return NULL; } while (!(fh & 0x100000)) { ex--; fh = (fh << 1) | ((fl >> 31) & 1); fl = (fl & 0x7FFFFFFF) << 1; } fout[0] = sign | (ex << 20) | (fh & 0xFFFFF); fout[1] = fl; return NULL; } char *test_modff(uint32 *x, uint32 *fout, uint32 *iout) { int ex = (*x >> 23) & 0xFF; /* exponent */ int sign = *x & 0x80000000; uint32 f; if ((*x & 0x7FFFFFFF) > 0x7F800000) { /* * NaN input: return the same in _both_ outputs. */ *fout = *iout = *x; return NULL; } test_rintf(x, iout, 0, 0); f = *x - *iout; if (!f) { /* no fraction part */ *fout = sign; return NULL; } if (!(*iout & 0x7FFFFFFF)) { /* no integer part */ *fout = *x; return NULL; } while (!(f & 0x800000)) { ex--; f = f << 1; } *fout = sign | (ex << 23) | (f & 0x7FFFFF); return NULL; } char *test_copysign(uint32 *x, uint32 *y, uint32 *out) { int ysign = y[0] & 0x80000000; int xhigh = x[0] & 0x7fffffff; out[0] = ysign | xhigh; out[1] = x[1]; /* There can be no error */ return NULL; } char *test_copysignf(uint32 *x, uint32 *y, uint32 *out) { int ysign = y[0] & 0x80000000; int xhigh = x[0] & 0x7fffffff; out[0] = ysign | xhigh; /* There can be no error */ return NULL; } char *test_isfinite(uint32 *x, uint32 *out) { int xhigh = x[0]; /* Being finite means that the exponent is not 0x7ff */ if ((xhigh & 0x7ff00000) == 0x7ff00000) out[0] = 0; else out[0] = 1; return NULL; } char *test_isfinitef(uint32 *x, uint32 *out) { /* Being finite means that the exponent is not 0xff */ if ((x[0] & 0x7f800000) == 0x7f800000) out[0] = 0; else out[0] = 1; return NULL; } char *test_isinff(uint32 *x, uint32 *out) { /* Being infinite means that our bottom 30 bits equate to 0x7f800000 */ if ((x[0] & 0x7fffffff) == 0x7f800000) 
out[0] = 1; else out[0] = 0; return NULL; } char *test_isinf(uint32 *x, uint32 *out) { int xhigh = x[0]; int xlow = x[1]; /* Being infinite means that our fraction is zero and exponent is 0x7ff */ if (((xhigh & 0x7fffffff) == 0x7ff00000) && (xlow == 0)) out[0] = 1; else out[0] = 0; return NULL; } char *test_isnanf(uint32 *x, uint32 *out) { /* Being NaN means that our exponent is 0xff and non-0 fraction */ int exponent = x[0] & 0x7f800000; int fraction = x[0] & 0x007fffff; if ((exponent == 0x7f800000) && (fraction != 0)) out[0] = 1; else out[0] = 0; return NULL; } char *test_isnan(uint32 *x, uint32 *out) { /* Being NaN means that our exponent is 0x7ff and non-0 fraction */ int exponent = x[0] & 0x7ff00000; int fractionhigh = x[0] & 0x000fffff; if ((exponent == 0x7ff00000) && ((fractionhigh != 0) || x[1] != 0)) out[0] = 1; else out[0] = 0; return NULL; } char *test_isnormalf(uint32 *x, uint32 *out) { /* Being normal means exponent is not 0 and is not 0xff */ int exponent = x[0] & 0x7f800000; if (exponent == 0x7f800000) out[0] = 0; else if (exponent == 0) out[0] = 0; else out[0] = 1; return NULL; } char *test_isnormal(uint32 *x, uint32 *out) { /* Being normal means exponent is not 0 and is not 0x7ff */ int exponent = x[0] & 0x7ff00000; if (exponent == 0x7ff00000) out[0] = 0; else if (exponent == 0) out[0] = 0; else out[0] = 1; return NULL; } char *test_signbitf(uint32 *x, uint32 *out) { /* Sign bit is bit 31 */ out[0] = (x[0] >> 31) & 1; return NULL; } char *test_signbit(uint32 *x, uint32 *out) { /* Sign bit is bit 31 */ out[0] = (x[0] >> 31) & 1; return NULL; } char *test_fpclassify(uint32 *x, uint32 *out) { int exponent = (x[0] & 0x7ff00000) >> 20; int fraction = (x[0] & 0x000fffff) | x[1]; if ((exponent == 0x00) && (fraction == 0)) out[0] = 0; else if ((exponent == 0x00) && (fraction != 0)) out[0] = 4; else if ((exponent == 0x7ff) && (fraction == 0)) out[0] = 3; else if ((exponent == 0x7ff) && (fraction != 0)) out[0] = 7; else out[0] = 5; return NULL; } char 
*test_fpclassifyf(uint32 *x, uint32 *out) { int exponent = (x[0] & 0x7f800000) >> 23; int fraction = x[0] & 0x007fffff; if ((exponent == 0x000) && (fraction == 0)) out[0] = 0; else if ((exponent == 0x000) && (fraction != 0)) out[0] = 4; else if ((exponent == 0xff) && (fraction == 0)) out[0] = 3; else if ((exponent == 0xff) && (fraction != 0)) out[0] = 7; else out[0] = 5; return NULL; } /* * Internal function that compares doubles in x & y and returns -3, -2, -1, 0, * 1 if they compare to be signaling, unordered, less than, equal or greater * than. */ static int fpcmp4(uint32 *x, uint32 *y) { int result = 0; /* * Sort out whether results are ordered or not to begin with * NaNs have exponent 0x7ff, and non-zero fraction. Signaling NaNs take * higher priority than quiet ones. */ if ((x[0] & 0x7fffffff) >= 0x7ff80000) result = -2; else if ((x[0] & 0x7fffffff) > 0x7ff00000) result = -3; else if (((x[0] & 0x7fffffff) == 0x7ff00000) && (x[1] != 0)) result = -3; if ((y[0] & 0x7fffffff) >= 0x7ff80000 && result != -3) result = -2; else if ((y[0] & 0x7fffffff) > 0x7ff00000) result = -3; else if (((y[0] & 0x7fffffff) == 0x7ff00000) && (y[1] != 0)) result = -3; if (result != 0) return result; /* * The two forms of zero are equal */ if (((x[0] & 0x7fffffff) == 0) && x[1] == 0 && ((y[0] & 0x7fffffff) == 0) && y[1] == 0) return 0; /* * If x and y have different signs we can tell that they're not equal * If x is +ve we have x > y return 1 - otherwise y is +ve return -1 */ if ((x[0] >> 31) != (y[0] >> 31)) return ((x[0] >> 31) == 0) - ((y[0] >> 31) == 0); /* * Now we have both signs the same, let's do an initial compare of the * values. * * Whoever designed IEEE754's floating point formats is very clever and * earns my undying admiration. Once you remove the sign-bit, the * floating point numbers can be ordered using the standard <, ==, > * operators will treating the fp-numbers as integers with that bit- * pattern. 
*/ if ((x[0] & 0x7fffffff) < (y[0] & 0x7fffffff)) result = -1; else if ((x[0] & 0x7fffffff) > (y[0] & 0x7fffffff)) result = 1; else if (x[1] < y[1]) result = -1; else if (x[1] > y[1]) result = 1; else result = 0; /* * Now we return the result - is x is positive (and therefore so is y) we * return the plain result - otherwise we negate it and return. */ if ((x[0] >> 31) == 0) return result; else return -result; } /* * Internal function that compares floats in x & y and returns -3, -2, -1, 0, * 1 if they compare to be signaling, unordered, less than, equal or greater * than. */ static int fpcmp4f(uint32 *x, uint32 *y) { int result = 0; /* * Sort out whether results are ordered or not to begin with * NaNs have exponent 0xff, and non-zero fraction - we have to handle all * signaling cases over the quiet ones */ if ((x[0] & 0x7fffffff) >= 0x7fc00000) result = -2; else if ((x[0] & 0x7fffffff) > 0x7f800000) result = -3; if ((y[0] & 0x7fffffff) >= 0x7fc00000 && result != -3) result = -2; else if ((y[0] & 0x7fffffff) > 0x7f800000) result = -3; if (result != 0) return result; /* * The two forms of zero are equal */ if (((x[0] & 0x7fffffff) == 0) && ((y[0] & 0x7fffffff) == 0)) return 0; /* * If x and y have different signs we can tell that they're not equal * If x is +ve we have x > y return 1 - otherwise y is +ve return -1 */ if ((x[0] >> 31) != (y[0] >> 31)) return ((x[0] >> 31) == 0) - ((y[0] >> 31) == 0); /* * Now we have both signs the same, let's do an initial compare of the * values. * * Whoever designed IEEE754's floating point formats is very clever and * earns my undying admiration. Once you remove the sign-bit, the * floating point numbers can be ordered using the standard <, ==, > * operators will treating the fp-numbers as integers with that bit- * pattern. 
*/ if ((x[0] & 0x7fffffff) < (y[0] & 0x7fffffff)) result = -1; else if ((x[0] & 0x7fffffff) > (y[0] & 0x7fffffff)) result = 1; else result = 0; /* * Now we return the result - is x is positive (and therefore so is y) we * return the plain result - otherwise we negate it and return. */ if ((x[0] >> 31) == 0) return result; else return -result; } char *test_isgreater(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4(x, y); *out = (result == 1); return result == -3 ? "i" : NULL; } char *test_isgreaterequal(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4(x, y); *out = (result >= 0); return result == -3 ? "i" : NULL; } char *test_isless(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4(x, y); *out = (result == -1); return result == -3 ? "i" : NULL; } char *test_islessequal(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4(x, y); *out = (result == -1) || (result == 0); return result == -3 ? "i" : NULL; } char *test_islessgreater(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4(x, y); *out = (result == -1) || (result == 1); return result == -3 ? "i" : NULL; } char *test_isunordered(uint32 *x, uint32 *y, uint32 *out) { int normal = 0; int result = fpcmp4(x, y); test_isnormal(x, out); normal |= *out; test_isnormal(y, out); normal |= *out; *out = (result == -2) || (result == -3); return result == -3 ? "i" : NULL; } char *test_isgreaterf(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4f(x, y); *out = (result == 1); return result == -3 ? "i" : NULL; } char *test_isgreaterequalf(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4f(x, y); *out = (result >= 0); return result == -3 ? "i" : NULL; } char *test_islessf(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4f(x, y); *out = (result == -1); return result == -3 ? "i" : NULL; } char *test_islessequalf(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4f(x, y); *out = (result == -1) || (result == 0); return result == -3 ? 
"i" : NULL; } char *test_islessgreaterf(uint32 *x, uint32 *y, uint32 *out) { int result = fpcmp4f(x, y); *out = (result == -1) || (result == 1); return result == -3 ? "i" : NULL; } char *test_isunorderedf(uint32 *x, uint32 *y, uint32 *out) { int normal = 0; int result = fpcmp4f(x, y); test_isnormalf(x, out); normal |= *out; test_isnormalf(y, out); normal |= *out; *out = (result == -2) || (result == -3); return result == -3 ? "i" : NULL; } diff --git a/contrib/arm-optimized-routines/math/test/rtest/semi.h b/contrib/arm-optimized-routines/math/test/rtest/semi.h index 17dc4158fb51..7a1444e55d28 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/semi.h +++ b/contrib/arm-optimized-routines/math/test/rtest/semi.h @@ -1,53 +1,53 @@ /* * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef test_semi_h #define test_semi_h #include "types.h" char *test_ceil(uint32 *in, uint32 *out); char *test_floor(uint32 *in, uint32 *out); char *test_fmod(uint32 *a, uint32 *b, uint32 *out); char *test_ldexp(uint32 *x, uint32 *n, uint32 *out); char *test_frexp(uint32 *x, uint32 *out, uint32 *nout); char *test_modf(uint32 *x, uint32 *iout, uint32 *fout); char *test_ceilf(uint32 *in, uint32 *out); char *test_floorf(uint32 *in, uint32 *out); char *test_fmodf(uint32 *a, uint32 *b, uint32 *out); char *test_ldexpf(uint32 *x, uint32 *n, uint32 *out); char *test_frexpf(uint32 *x, uint32 *out, uint32 *nout); char *test_modff(uint32 *x, uint32 *iout, uint32 *fout); char *test_copysign(uint32 *x, uint32 *y, uint32 *out); char *test_copysignf(uint32 *x, uint32 *y, uint32 *out); char *test_isfinite(uint32 *x, uint32 *out); char *test_isfinitef(uint32 *x, uint32 *out); char *test_isinf(uint32 *x, uint32 *out); char *test_isinff(uint32 *x, uint32 *out); char *test_isnan(uint32 *x, uint32 *out); char *test_isnanf(uint32 *x, uint32 *out); char *test_isnormal(uint32 *x, 
uint32 *out); char *test_isnormalf(uint32 *x, uint32 *out); char *test_signbit(uint32 *x, uint32 *out); char *test_signbitf(uint32 *x, uint32 *out); char *test_fpclassify(uint32 *x, uint32 *out); char *test_fpclassifyf(uint32 *x, uint32 *out); char *test_isgreater(uint32 *x, uint32 *y, uint32 *out); char *test_isgreaterequal(uint32 *x, uint32 *y, uint32 *out); char *test_isless(uint32 *x, uint32 *y, uint32 *out); char *test_islessequal(uint32 *x, uint32 *y, uint32 *out); char *test_islessgreater(uint32 *x, uint32 *y, uint32 *out); char *test_isunordered(uint32 *x, uint32 *y, uint32 *out); char *test_isgreaterf(uint32 *x, uint32 *y, uint32 *out); char *test_isgreaterequalf(uint32 *x, uint32 *y, uint32 *out); char *test_islessf(uint32 *x, uint32 *y, uint32 *out); char *test_islessequalf(uint32 *x, uint32 *y, uint32 *out); char *test_islessgreaterf(uint32 *x, uint32 *y, uint32 *out); char *test_isunorderedf(uint32 *x, uint32 *y, uint32 *out); #endif diff --git a/contrib/arm-optimized-routines/math/test/rtest/types.h b/contrib/arm-optimized-routines/math/test/rtest/types.h index 53cd557fa4cf..e15b4e06a0d4 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/types.h +++ b/contrib/arm-optimized-routines/math/test/rtest/types.h @@ -1,25 +1,25 @@ /* * types.h * * Copyright (c) 2005-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_types_h #define mathtest_types_h #include #if UINT_MAX == 4294967295 typedef unsigned int uint32; typedef int int32; #define I32 "" #elif ULONG_MAX == 4294967295 typedef unsigned long uint32; typedef long int32; #define I32 "l" #else #error Could not find an unsigned 32-bit integer type #endif #endif diff --git a/contrib/arm-optimized-routines/math/test/rtest/wrappers.c b/contrib/arm-optimized-routines/math/test/rtest/wrappers.c index de45ac5768d0..441017192ab4 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/wrappers.c +++ b/contrib/arm-optimized-routines/math/test/rtest/wrappers.c @@ -1,261 +1,261 @@ /* * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "intern.h" void wrapper_init(wrapperctx *ctx) { int i; ctx->nops = ctx->nresults = 0; for (i = 0; i < 2; i++) { ctx->mpfr_ops[i] = NULL; ctx->mpc_ops[i] = NULL; ctx->ieee_ops[i] = NULL; } ctx->mpfr_result = NULL; ctx->mpc_result = NULL; ctx->ieee_result = NULL; ctx->need_regen = 0; } void wrapper_op_real(wrapperctx *ctx, const mpfr_t r, int size, const uint32 *ieee) { assert(ctx->nops < 2); ctx->mpfr_ops[ctx->nops] = r; ctx->ieee_ops[ctx->nops] = ieee; ctx->size_ops[ctx->nops] = size; ctx->nops++; } void wrapper_op_complex(wrapperctx *ctx, const mpc_t c, int size, const uint32 *ieee) { assert(ctx->nops < 2); ctx->mpc_ops[ctx->nops] = c; ctx->ieee_ops[ctx->nops] = ieee; ctx->size_ops[ctx->nops] = size; ctx->nops++; } void wrapper_result_real(wrapperctx *ctx, mpfr_t r, int size, uint32 *ieee) { assert(ctx->nresults < 1); ctx->mpfr_result = r; ctx->ieee_result = ieee; ctx->size_result = size; ctx->nresults++; } void wrapper_result_complex(wrapperctx *ctx, mpc_t c, int size, uint32 *ieee) { 
assert(ctx->nresults < 1); ctx->mpc_result = c; ctx->ieee_result = ieee; ctx->size_result = size; ctx->nresults++; } int wrapper_run(wrapperctx *ctx, wrapperfunc wrappers[MAXWRAPPERS]) { int i; for (i = 0; i < MAXWRAPPERS && wrappers[i]; i++) wrappers[i](ctx); universal_wrapper(ctx); return ctx->need_regen; } mpfr_srcptr wrapper_get_mpfr(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpfr_result); return ctx->mpfr_result; } else { assert(ctx->mpfr_ops[op]); return ctx->mpfr_ops[op]; } } const uint32 *wrapper_get_ieee(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpfr_result); return ctx->ieee_result; } else { assert(ctx->mpfr_ops[op]); return ctx->ieee_ops[op]; } } int wrapper_get_nops(wrapperctx *ctx) { return ctx->nops; } int wrapper_get_size(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpfr_result || ctx->mpc_result); return ctx->size_result; } else { assert(ctx->mpfr_ops[op] || ctx->mpc_ops[op]); return ctx->size_ops[op]; } } int wrapper_is_complex(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpfr_result || ctx->mpc_result); return ctx->mpc_result != NULL; } else { assert(ctx->mpfr_ops[op] || ctx->mpc_ops[op]); return ctx->mpc_ops[op] != NULL; } } mpc_srcptr wrapper_get_mpc(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpc_result); return ctx->mpc_result; } else { assert(ctx->mpc_ops[op]); return ctx->mpc_ops[op]; } } mpfr_srcptr wrapper_get_mpfr_r(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpc_result); return mpc_realref(ctx->mpc_result); } else { assert(ctx->mpc_ops[op]); return mpc_realref(ctx->mpc_ops[op]); } } mpfr_srcptr wrapper_get_mpfr_i(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpc_result); return mpc_imagref(ctx->mpc_result); } else { assert(ctx->mpc_ops[op]); return mpc_imagref(ctx->mpc_ops[op]); } } const uint32 *wrapper_get_ieee_r(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpc_result); return ctx->ieee_result; } else { assert(ctx->mpc_ops[op]); return ctx->ieee_ops[op]; } } const 
uint32 *wrapper_get_ieee_i(wrapperctx *ctx, int op) { if (op < 0) { assert(ctx->mpc_result); return ctx->ieee_result + 4; } else { assert(ctx->mpc_ops[op]); return ctx->ieee_ops[op] + 2; } } void wrapper_set_sign(wrapperctx *ctx, uint32 sign) { assert(ctx->mpfr_result); ctx->ieee_result[0] |= (sign & 0x80000000U); } void wrapper_set_sign_r(wrapperctx *ctx, uint32 sign) { assert(ctx->mpc_result); ctx->ieee_result[0] |= (sign & 0x80000000U); } void wrapper_set_sign_i(wrapperctx *ctx, uint32 sign) { assert(ctx->mpc_result); ctx->ieee_result[4] |= (sign & 0x80000000U); } void wrapper_set_nan(wrapperctx *ctx) { assert(ctx->mpfr_result); mpfr_set_nan(ctx->mpfr_result); ctx->need_regen = 1; } void wrapper_set_nan_r(wrapperctx *ctx) { assert(ctx->mpc_result); mpfr_set_nan(mpc_realref(ctx->mpc_result)); /* FIXME: better way? */ ctx->need_regen = 1; } void wrapper_set_nan_i(wrapperctx *ctx) { assert(ctx->mpc_result); mpfr_set_nan(mpc_imagref(ctx->mpc_result)); /* FIXME: better way? */ ctx->need_regen = 1; } void wrapper_set_int(wrapperctx *ctx, int val) { assert(ctx->mpfr_result); mpfr_set_si(ctx->mpfr_result, val, GMP_RNDN); ctx->need_regen = 1; } void wrapper_set_int_r(wrapperctx *ctx, int val) { assert(ctx->mpc_result); mpfr_set_si(mpc_realref(ctx->mpc_result), val, GMP_RNDN); ctx->need_regen = 1; } void wrapper_set_int_i(wrapperctx *ctx, int val) { assert(ctx->mpc_result); mpfr_set_si(mpc_realref(ctx->mpc_result), val, GMP_RNDN); ctx->need_regen = 1; } void wrapper_set_mpfr(wrapperctx *ctx, const mpfr_t val) { assert(ctx->mpfr_result); mpfr_set(ctx->mpfr_result, val, GMP_RNDN); ctx->need_regen = 1; } void wrapper_set_mpfr_r(wrapperctx *ctx, const mpfr_t val) { assert(ctx->mpc_result); mpfr_set(mpc_realref(ctx->mpc_result), val, GMP_RNDN); ctx->need_regen = 1; } void wrapper_set_mpfr_i(wrapperctx *ctx, const mpfr_t val) { assert(ctx->mpc_result); mpfr_set(mpc_realref(ctx->mpc_result), val, GMP_RNDN); ctx->need_regen = 1; } diff --git 
a/contrib/arm-optimized-routines/math/test/rtest/wrappers.h b/contrib/arm-optimized-routines/math/test/rtest/wrappers.h index 7b09c85a59f1..0a8a58777d8a 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/wrappers.h +++ b/contrib/arm-optimized-routines/math/test/rtest/wrappers.h @@ -1,114 +1,114 @@ /* * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ typedef struct { /* Structure type should be considered opaque outside wrappers.c, * though we have to define it here so its size is known. */ int nops; int nresults; mpfr_srcptr mpfr_ops[2]; mpfr_ptr mpfr_result; mpc_srcptr mpc_ops[2]; mpc_ptr mpc_result; const uint32 *ieee_ops[2]; uint32 *ieee_result; int size_ops[2]; int size_result; int need_regen; } wrapperctx; typedef void (*wrapperfunc)(wrapperctx *ctx); #define MAXWRAPPERS 3 /* * Functions for the test harness to call. * * When the test harness executes a test function, it should * initialise a wrapperctx with wrapper_init, then provide all the * operands and results in both mpfr/mpc and IEEE (+ extrabits) * formats via wrapper_op_* and wrapper_result_*. Then it should run * the function's wrappers using wrapper_run(), and if that returns * true then the primary result has been rewritten in mpfr/mpc format * and it should therefore retranslate into IEEE. * * 'size' in all prototypes below represents an FP type by giving the * number of 32-bit words it requires, so 1=float and 2=double. Input * operands will be that many words (or that many for both their real * and imag parts); outputs will have one extra word for 'extrabits'. * * This system only applies at all to reference functions using * mpfr/mpc. The seminumerical functions we implement in pure IEEE * form are expected to handle all their own special cases correctly. */ void wrapper_init(wrapperctx *ctx); /* Real operand. 
*/ void wrapper_op_real(wrapperctx *ctx, const mpfr_t r, int size, const uint32 *ieee); /* Complex operand. Real part starts at ieee[0], the imag part at ieee[2]. */ void wrapper_op_complex(wrapperctx *ctx, const mpc_t c, int size, const uint32 *ieee); /* Real result. ieee contains size+1 words, as discussed above. */ void wrapper_result_real(wrapperctx *ctx, mpfr_t r, int size, uint32 *ieee); /* Complex result. ieee contains size+1 words of real part starting at * ieee[0], and another size+1 of imag part starting at ieee[4]. */ void wrapper_result_complex(wrapperctx *ctx, mpc_t c, int size, uint32 *ieee); int wrapper_run(wrapperctx *ctx, wrapperfunc wrappers[MAXWRAPPERS]); /* * Functions for wrappers to call. 'op' indicates which operand is * being requested: 0,1 means first and second, and -1 means the * result. */ mpfr_srcptr wrapper_get_mpfr(wrapperctx *ctx, int op); const uint32 *wrapper_get_ieee(wrapperctx *ctx, int op); mpc_srcptr wrapper_get_mpc(wrapperctx *ctx, int op); mpfr_srcptr wrapper_get_mpfr_r(wrapperctx *ctx, int op); mpfr_srcptr wrapper_get_mpfr_i(wrapperctx *ctx, int op); const uint32 *wrapper_get_ieee_r(wrapperctx *ctx, int op); const uint32 *wrapper_get_ieee_i(wrapperctx *ctx, int op); /* Query operand count + types */ int wrapper_get_nops(wrapperctx *ctx); int wrapper_get_size(wrapperctx *ctx, int op); int wrapper_is_complex(wrapperctx *ctx, int op); /* Change just the sign of the result. Only the top bit of 'sign' is used. */ void wrapper_set_sign(wrapperctx *ctx, uint32 sign); void wrapper_set_sign_r(wrapperctx *ctx, uint32 sign); void wrapper_set_sign_i(wrapperctx *ctx, uint32 sign); /* Set a result to NaN. */ void wrapper_set_nan(wrapperctx *ctx); void wrapper_set_nan_r(wrapperctx *ctx); void wrapper_set_nan_i(wrapperctx *ctx); /* Set a result to an integer value (converted to the appropriate * float format). 
*/ void wrapper_set_int(wrapperctx *ctx, int val); void wrapper_set_int_r(wrapperctx *ctx, int val); void wrapper_set_int_i(wrapperctx *ctx, int val); /* Set a result to a new MPFR float. */ void wrapper_set_mpfr(wrapperctx *ctx, const mpfr_t val); void wrapper_set_mpfr_r(wrapperctx *ctx, const mpfr_t val); void wrapper_set_mpfr_i(wrapperctx *ctx, const mpfr_t val); /* * A universal wrapper called for _all_ functions, that doesn't have * to be specified individually everywhere. */ void universal_wrapper(wrapperctx *ctx); diff --git a/contrib/arm-optimized-routines/math/test/runulp.sh b/contrib/arm-optimized-routines/math/test/runulp.sh index 0190d9ab27fb..b4000f6ea01b 100755 --- a/contrib/arm-optimized-routines/math/test/runulp.sh +++ b/contrib/arm-optimized-routines/math/test/runulp.sh @@ -1,315 +1,324 @@ #!/bin/bash # ULP error check script. # -# Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2019-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu # cd to bin directory. 
cd "${0%/*}" rmodes='n u d z' #rmodes=n flags="${ULPFLAGS:--q}" emu="$@" FAIL=0 PASS=0 t() { [ $r = "n" ] && Lt=$L || Lt=$Ldir $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) } check() { $emu ./ulp -f -q "$@" >/dev/null } Ldir=0.5 for r in $rmodes do L=0.01 t exp 0 0xffff000000000000 10000 t exp 0x1p-6 0x1p6 40000 t exp -0x1p-6 -0x1p6 40000 t exp 633.3 733.3 10000 t exp -633.3 -777.3 10000 L=0.01 t exp2 0 0xffff000000000000 10000 t exp2 0x1p-6 0x1p6 40000 t exp2 -0x1p-6 -0x1p6 40000 t exp2 633.3 733.3 10000 t exp2 -633.3 -777.3 10000 L=0.02 t log 0 0xffff000000000000 10000 t log 0x1p-4 0x1p4 40000 t log 0 inf 40000 L=0.05 t log2 0 0xffff000000000000 10000 t log2 0x1p-4 0x1p4 40000 t log2 0 inf 40000 L=0.05 t pow 0.5 2.0 x 0 inf 20000 t pow -0.5 -2.0 x 0 inf 20000 t pow 0.5 2.0 x -0 -inf 20000 t pow -0.5 -2.0 x -0 -inf 20000 t pow 0.5 2.0 x 0x1p-10 0x1p10 40000 t pow 0.5 2.0 x -0x1p-10 -0x1p10 40000 t pow 0 inf x 0.5 2.0 80000 t pow 0 inf x -0.5 -2.0 80000 t pow 0x1.fp-1 0x1.08p0 x 0x1p8 0x1p17 80000 t pow 0x1.fp-1 0x1.08p0 x -0x1p8 -0x1p17 80000 t pow 0 0x1p-1000 x 0 1.0 50000 t pow 0x1p1000 inf x 0 1.0 50000 t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 t erf 0x1p-1022 0x1p-26 40000 t erf -0x1p-1022 -0x1p-26 40000 t erf 0x1p-26 0x1p3 40000 t erf -0x1p-26 -0x1p3 40000 t erf 0 inf 40000 Ldir=0.5 L=0.01 t expf 0 0xffff0000 10000 t expf 0x1p-14 0x1p8 50000 t expf -0x1p-14 -0x1p8 50000 L=0.01 t exp2f 0 0xffff0000 10000 t exp2f 0x1p-14 0x1p8 50000 t exp2f -0x1p-14 -0x1p8 50000 L=0.32 t logf 0 0xffff0000 10000 t logf 0x1p-4 0x1p4 50000 t logf 0 inf 50000 L=0.26 t log2f 0 0xffff0000 10000 t log2f 0x1p-4 0x1p4 50000 t log2f 0 inf 50000 L=0.06 t sinf 0 0xffff0000 10000 t sinf 0x1p-14 0x1p54 50000 t sinf -0x1p-14 -0x1p54 50000 L=0.06 t cosf 0 0xffff0000 10000 t cosf 
0x1p-14 0x1p54 50000 t cosf -0x1p-14 -0x1p54 50000 L=0.06 t sincosf_sinf 0 0xffff0000 10000 t sincosf_sinf 0x1p-14 0x1p54 50000 t sincosf_sinf -0x1p-14 -0x1p54 50000 L=0.06 t sincosf_cosf 0 0xffff0000 10000 t sincosf_cosf 0x1p-14 0x1p54 50000 t sincosf_cosf -0x1p-14 -0x1p54 50000 L=0.4 t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 L=0.6 Ldir=0.9 t erff 0 0xffff0000 10000 t erff 0x1p-127 0x1p-26 40000 t erff -0x1p-127 -0x1p-26 40000 t erff 0x1p-26 0x1p3 40000 t erff -0x1p-26 -0x1p3 40000 t erff 0 inf 40000 Ldir=0.5 done # vector functions Ldir=0.5 r='n' -flags="${ULPFLAGS:--q} -f" +flags="${ULPFLAGS:--q}" runs= check __s_exp 1 && runs=1 runv= check __v_exp 1 && runv=1 runvn= check __vn_exp 1 && runvn=1 range_exp=' 0 0xffff000000000000 10000 0x1p-6 0x1p6 400000 -0x1p-6 -0x1p6 400000 633.3 733.3 10000 -633.3 -777.3 10000 ' range_log=' 0 0xffff000000000000 10000 0x1p-4 0x1p4 400000 0 inf 400000 ' range_pow=' 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000 ' range_sin=' 0 0xffff000000000000 10000 0x1p-4 0x1p4 400000 -0x1p-23 0x1p23 400000 ' range_cos="$range_sin" range_expf=' 0 0xffff0000 10000 0x1p-14 0x1p8 500000 -0x1p-14 -0x1p8 500000 ' range_expf_1u="$range_expf" range_exp2f="$range_expf" range_exp2f_1u="$range_expf" range_logf=' 0 0xffff0000 10000 0x1p-4 0x1p4 500000 ' range_sinf=' 0 0xffff0000 10000 0x1p-4 0x1p4 300000 -0x1p-9 -0x1p9 300000 ' range_cosf="$range_sinf" range_powf=' 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 0x1.ep-1 
0x1.1p0 x -0x1p8 -0x1p14 50000 ' # error limits L_exp=1.9 L_log=1.2 L_pow=0.05 L_sin=3.0 L_cos=3.0 L_expf=1.49 L_expf_1u=0.4 L_exp2f=1.49 L_exp2f_1u=0.4 L_logf=2.9 L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R +while read G F R D do [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" while read X do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - t $F $X + disable_fenv="" + if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then + # If library was built with SIMD exceptions + # disabled, disable fenv checking in ulp + # tool. Otherwise, fenv checking may still be + # disabled by adding -f to the end of the run + # line. + disable_fenv="-f" + fi + t $D $disable_fenv $F $X done << EOF $range EOF done << EOF # group symbol run exp __s_exp $runs exp __v_exp $runv exp __vn_exp $runvn exp _ZGVnN2v_exp $runvn log __s_log $runs log __v_log $runv log __vn_log $runvn log _ZGVnN2v_log $runvn -pow __s_pow $runs -pow __v_pow $runv -pow __vn_pow $runvn -pow _ZGVnN2vv_pow $runvn +pow __s_pow $runs -f +pow __v_pow $runv -f +pow __vn_pow $runvn -f +pow _ZGVnN2vv_pow $runvn -f sin __s_sin $runs sin __v_sin $runv sin __vn_sin $runvn sin _ZGVnN2v_sin $runvn cos __s_cos $runs cos __v_cos $runv cos __vn_cos $runvn cos _ZGVnN2v_cos $runvn expf __s_expf $runs expf __v_expf $runv expf __vn_expf $runvn expf _ZGVnN4v_expf $runvn -expf_1u __s_expf_1u $runs -expf_1u __v_expf_1u $runv -expf_1u __vn_expf_1u $runvn +expf_1u __s_expf_1u $runs -f +expf_1u __v_expf_1u $runv -f +expf_1u __vn_expf_1u $runvn -f exp2f __s_exp2f $runs exp2f __v_exp2f $runv exp2f __vn_exp2f $runvn exp2f _ZGVnN4v_exp2f $runvn -exp2f_1u __s_exp2f_1u $runs -exp2f_1u __v_exp2f_1u $runv -exp2f_1u __vn_exp2f_1u $runvn +exp2f_1u __s_exp2f_1u $runs -f +exp2f_1u __v_exp2f_1u $runv -f +exp2f_1u __vn_exp2f_1u $runvn -f logf __s_logf $runs logf __v_logf $runv logf __vn_logf $runvn logf _ZGVnN4v_logf $runvn sinf __s_sinf $runs sinf __v_sinf $runv sinf 
__vn_sinf $runvn sinf _ZGVnN4v_sinf $runvn cosf __s_cosf $runs cosf __v_cosf $runv cosf __vn_cosf $runvn cosf _ZGVnN4v_cosf $runvn -powf __s_powf $runs -powf __v_powf $runv -powf __vn_powf $runvn -powf _ZGVnN4vv_powf $runvn +powf __s_powf $runs -f +powf __v_powf $runv -f +powf __vn_powf $runvn -f +powf _ZGVnN4vv_powf $runvn -f EOF [ 0 -eq $FAIL ] || { echo "FAILED $FAIL PASSED $PASS" exit 1 } diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst index 79160443f099..7ea0d45795a3 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst @@ -1,25 +1,25 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 func=cosf op1=7f800001 result=7fc00001 errno=0 status=i func=cosf op1=ff800001 result=7fc00001 errno=0 status=i func=cosf op1=7f800000 result=7fc00001 errno=EDOM status=i func=cosf op1=ff800000 result=7fc00001 errno=EDOM status=i func=cosf op1=00000000 result=3f800000 errno=0 func=cosf op1=80000000 result=3f800000 errno=0 ; SDCOMP-26094: check cosf in the cases for which the range reducer ; returns values furthest beyond its nominal upper bound of pi/4. 
func=cosf op1=46427f1b result=3f34dc5c.565 error=0 func=cosf op1=4647e568 result=3f34dc33.c1f error=0 func=cosf op1=46428bac result=bf34dbf2.8e3 error=0 func=cosf op1=4647f1f9 result=bf34dbc9.f9b error=0 func=cosf op1=4647fe8a result=3f34db60.313 error=0 func=cosf op1=45d8d7f1 result=bf35006a.7fd error=0 func=cosf op1=45d371a4 result=3f350056.39b error=0 func=cosf op1=45ce0b57 result=bf350041.f38 error=0 func=cosf op1=45d35882 result=bf34ffec.868 error=0 func=cosf op1=45cdf235 result=3f34ffd8.404 error=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst index 7fa4d1868c0e..12384cef0dd9 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst @@ -1,17 +1,17 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0 func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0 func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0 func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst 
b/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst index d05b7b1119c4..28f8fa37f5aa 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst @@ -1,17 +1,17 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 func=erff op1=7f800001 result=7fc00001 errno=0 status=i func=erff op1=ff800001 result=7fc00001 errno=0 status=i func=erff op1=7f800000 result=3f800000 errno=0 func=erff op1=ff800000 result=bf800000 errno=0 func=erff op1=00000000 result=00000000 errno=ERANGE func=erff op1=80000000 result=80000000 errno=ERANGE func=erff op1=00000001 result=00000001 errno=0 status=ux func=erff op1=80000001 result=80000001 errno=0 status=ux func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst index 85d556cd1e00..0bb2ef4579cc 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst @@ -1,31 +1,31 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=exp op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=exp op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=exp op1=fff00000.00000000 result=00000000.00000000 errno=0 func=exp op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox func=exp op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux func=exp op1=00000000.00000000 result=3ff00000.00000000 errno=0 func=exp op1=80000000.00000000 result=3ff00000.00000000 errno=0 func=exp op1=00000000.00000001 result=3ff00000.00000000 errno=0 func=exp op1=80000000.00000001 result=3ff00000.00000000 errno=0 func=exp op1=3c900000.00000000 result=3ff00000.00000000.400 errno=0 func=exp op1=bc900000.00000000 result=3fefffff.ffffffff.800 errno=0 func=exp op1=3fe00000.00000000 result=3ffa6129.8e1e069b.c97 errno=0 func=exp op1=bfe00000.00000000 result=3fe368b2.fc6f9609.fe8 errno=0 func=exp op1=3ff00000.00000000 result=4005bf0a.8b145769.535 errno=0 func=exp op1=bff00000.00000000 result=3fd78b56.362cef37.c6b errno=0 func=exp op1=40000000.00000000 result=401d8e64.b8d4ddad.cc3 errno=0 func=exp op1=c0000000.00000000 result=3fc152aa.a3bf81cb.9fe errno=0 func=exp op1=3ff12345.6789abcd result=40075955.c34718ed.6e3 errno=0 func=exp op1=40862e42.fefa39ef result=7fefffff.ffffff2a.1b1 errno=0 func=exp op1=40862e42.fefa39f0 result=7ff00000.00000000 errno=ERANGE status=ox func=exp op1=c0874910.d52d3051 result=00000000.00000001 status=ux func=exp op1=c0874910.d52d3052 result=00000000.00000000 errno=ERANGE status=ux func=exp op1=c085d589.f2fe5107 result=00f00000.000000f1.46b errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst 
b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst index fa56c9f8be4b..7069f9010c8c 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst @@ -1,30 +1,30 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=exp2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=exp2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=exp2 op1=fff00000.00000000 result=00000000.00000000 errno=0 func=exp2 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox func=exp2 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux func=exp2 op1=00000000.00000000 result=3ff00000.00000000 errno=0 func=exp2 op1=80000000.00000000 result=3ff00000.00000000 errno=0 func=exp2 op1=00000000.00000001 result=3ff00000.00000000 errno=0 func=exp2 op1=80000000.00000001 result=3ff00000.00000000 errno=0 func=exp2 op1=3ca00000.00000000 result=3ff00000.00000000.58c errno=0 func=exp2 op1=bc900000.00000000 result=3fefffff.ffffffff.a74 errno=0 func=exp2 op1=3fe00000.00000000 result=3ff6a09e.667f3bcc.909 errno=0 func=exp2 op1=bfe00000.00000000 result=3fe6a09e.667f3bcc.909 errno=0 func=exp2 op1=3ff00000.00000000 result=40000000.00000000 errno=0 func=exp2 op1=bff00000.00000000 result=3fe00000.00000000 errno=0 func=exp2 op1=40000000.00000000 result=40100000.00000000 errno=0 func=exp2 op1=c0000000.00000000 result=3fd00000.00000000 errno=0 func=exp2 op1=3ff12345.6789abcd result=4000cef3.c5d12321.663 errno=0 func=exp2 op1=408fffff.ffffffff result=7fefffff.fffffd3a.37a errno=0 func=exp2 op1=40900000.00000000 
result=7ff00000.00000000 errno=ERANGE status=ox func=exp2 op1=c090ca00.00000000 result=00000000.00000000.b50 status=ux func=exp2 op1=c090cc00.00000000 result=00000000.00000000 errno=ERANGE status=ux diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst index 38cfc3f78ac6..6ca2eeab4e12 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst @@ -1,25 +1,25 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 func=exp2f op1=7f800001 result=7fc00001 errno=0 status=i func=exp2f op1=ff800001 result=7fc00001 errno=0 status=i func=exp2f op1=7f800000 result=7f800000 errno=0 func=exp2f op1=7f7fffff result=7f800000 errno=ERANGE status=ox func=exp2f op1=ff800000 result=00000000 errno=0 func=exp2f op1=ff7fffff result=00000000 errno=ERANGE status=ux func=exp2f op1=00000000 result=3f800000 errno=0 func=exp2f op1=80000000 result=3f800000 errno=0 func=exp2f op1=42fa0001 result=7e00002c.5c8 errno=0 func=exp2f op1=42ffffff result=7f7fffa7.470 errno=0 func=exp2f op1=43000000 result=7f800000 errno=ERANGE status=ox func=exp2f op1=43000001 result=7f800000 errno=ERANGE status=ox func=exp2f op1=c2fa0001 result=00ffffa7.470 errno=0 func=exp2f op1=c2fc0000 result=00800000 errno=0 func=exp2f op1=c2fc0001 result=007fffd3.a38 errno=0 status=ux func=exp2f op1=c3150000 result=00000001 errno=0 func=exp2f op1=c3158000 result=00000000.800 errno=ERANGE status=ux func=exp2f op1=c3165432 result=00000000.4bd errno=ERANGE status=ux diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst 
b/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst index ff0f671c2656..89ae8fe78e6c 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst @@ -1,23 +1,23 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 func=expf op1=7f800001 result=7fc00001 errno=0 status=i func=expf op1=ff800001 result=7fc00001 errno=0 status=i func=expf op1=7f800000 result=7f800000 errno=0 func=expf op1=7f7fffff result=7f800000 errno=ERANGE status=ox func=expf op1=ff800000 result=00000000 errno=0 func=expf op1=ff7fffff result=00000000 errno=ERANGE status=ux func=expf op1=00000000 result=3f800000 errno=0 func=expf op1=80000000 result=3f800000 errno=0 func=expf op1=42affff8 result=7ef87ed4.e0c errno=0 func=expf op1=42b00008 result=7ef88698.f67 errno=0 func=expf op1=42cffff8 result=7f800000 errno=ERANGE status=ox func=expf op1=42d00008 result=7f800000 errno=ERANGE status=ox func=expf op1=c2affff8 result=0041eecc.041 errno=0 status=ux func=expf op1=c2b00008 result=0041ecbc.95e errno=0 status=ux func=expf op1=c2cffff8 result=00000000 errno=ERANGE status=ux func=expf op1=c2d00008 result=00000000 errno=ERANGE status=ux diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst index a0aa398cbf73..686ea835645b 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst @@ -1,21 +1,21 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=log op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=log op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=log op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i func=log op1=7fefffff.ffffffff result=40862e42.fefa39ef.354 errno=0 func=log op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i func=log op1=3ff00000.00000000 result=00000000.00000000 errno=0 func=log op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i func=log op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=log op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=log op1=00000000.00000001 result=c0874385.446d71c3.639 errno=0 func=log op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i func=log op1=40000000.00000000 result=3fe62e42.fefa39ef.358 errno=0 func=log op1=3fe00000.00000000 result=bfe62e42.fefa39ef.358 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst index ff1286cbd53e..361bddec374b 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst @@ -1,21 +1,21 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0 func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0 func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0 func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=40000000.00000000 result=3ff00000.00000000 errno=0 func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst index 5832c4f08f1e..5fce051cddba 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst @@ -1,27 +1,27 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 func=log2f op1=7f800001 result=7fc00001 errno=0 status=i func=log2f op1=ff800001 result=7fc00001 errno=0 status=i func=log2f op1=ff810000 result=7fc00001 errno=0 status=i func=log2f op1=7f800000 result=7f800000 errno=0 func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i func=log2f op1=3f800000 result=00000000 errno=0 func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0 func=log2f op1=3f604189 result=be4394c8.395 error=0 func=log2f op1=3f278034 result=bf1caa73.88e error=0 func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0 func=log2f op1=3e61259a result=c00bdb95.650 error=0 func=log2f op1=3f8147ae result=3c6b3267.d6a error=0 func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0 func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0 func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0 func=log2f op1=40070838 result=3f89e055.a0a error=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst index 6e68a36e0f6a..a6d1b9d5c51f 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst @@ -1,69 +1,69 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 func=logf op1=7f800001 result=7fc00001 errno=0 status=i func=logf op1=ff800001 result=7fc00001 errno=0 status=i func=logf op1=ff810000 result=7fc00001 errno=0 status=i func=logf op1=7f800000 result=7f800000 errno=0 func=logf op1=ff800000 result=7fc00001 errno=EDOM status=i func=logf op1=3f800000 result=00000000 errno=0 func=logf op1=00000000 result=ff800000 errno=ERANGE status=z func=logf op1=80000000 result=ff800000 errno=ERANGE status=z func=logf op1=80000001 result=7fc00001 errno=EDOM status=i ; Directed tests for the special-case handling of log of things ; very near 1 func=logf op1=3f781e49 result=bd0016d9.4ae error=0 func=logf op1=3f78e602 result=bce675e5.f31 error=0 func=logf op1=3f844a18 result=3d07030e.ae1 error=0 func=logf op1=3f79b55b result=bccbd88a.6cb error=0 func=logf op1=3f7e2f5f result=bbe92452.74a error=0 func=logf op1=3f7f1c03 result=bb6462c1.c2c error=0 func=logf op1=3f78b213 result=bced23e2.f56 error=0 func=logf op1=3f87d5c0 result=3d735847.b7a error=0 func=logf op1=3f7fa6ad result=bab2c532.12d error=0 func=logf op1=3f87c06a result=3d70d4b6.b5e error=0 func=logf op1=3f79cf30 result=bcc88942.6e9 error=0 func=logf op1=3f794c77 result=bcd94c6f.b1e error=0 func=logf op1=3f835655 result=3cd2d8a0.0bf error=0 func=logf op1=3f81b5c0 result=3c596d08.520 error=0 func=logf op1=3f805e2f result=3b3c18d4.d2b error=0 func=logf op1=3f7aa609 result=bcad0f90.fdb error=0 func=logf op1=3f7a9091 result=bcafcd59.f83 error=0 func=logf op1=3f7a7475 result=bcb36490.a0f error=0 func=logf op1=3f823417 result=3c8bd287.fa6 error=0 func=logf op1=3f7fbcc3 result=ba868bac.14c error=0 func=logf op1=3f805fc9 result=3b3f4a76.169 error=0 func=logf op1=3f833d43 result=3cccbc4f.cb7 error=0 func=logf op1=3f7cb1de result=bc54e91e.6b5 error=0 func=logf op1=3f7f2793 result=bb58c8af.bfc error=0 
func=logf op1=3f7bb8c3 result=bc8a0fc9.93c error=0 func=logf op1=3f81d349 result=3c67fe09.42e error=0 func=logf op1=3f7c254d result=bc788cf4.610 error=0 func=logf op1=3f7f789d result=bb0786d9.6c6 error=0 func=logf op1=3f7ed1f2 result=bb97605f.963 error=0 func=logf op1=3f826067 result=3c96b4af.5e1 error=0 func=logf op1=3f821a68 result=3c8581f9.dac error=0 func=logf op1=3f864e1a result=3d44f368.e66 error=0 func=logf op1=3f7fea3d result=b9ae1f66.b58 error=0 func=logf op1=3f7cf4f5 result=bc43ed76.1c5 error=0 func=logf op1=3f84c223 result=3d15814e.36d error=0 func=logf op1=3f7dae6d result=bc1511d5.0aa error=0 func=logf op1=3f7c0a3c result=bc7f6c0d.758 error=0 func=logf op1=3f858b22 result=3d2da861.f36 error=0 func=logf op1=3f85d7c7 result=3d36d490.ee9 error=0 func=logf op1=3f7f2109 result=bb5f5851.2ed error=0 func=logf op1=3f83809c result=3cdd23f7.6b1 error=0 func=logf op1=3f83d96e result=3cf2b9c8.0b1 error=0 func=logf op1=3f86ca84 result=3d53bee8.53f error=0 func=logf op1=3f83548e result=3cd269c3.39d error=0 func=logf op1=3f7c199c result=bc7b84b6.0da error=0 func=logf op1=3f83133f result=3cc27c0a.9dd error=0 func=logf op1=3f7c97b4 result=bc5b89dd.399 error=0 func=logf op1=3f810bc1 result=3c05553c.011 error=0 func=logf op1=3f7dadb8 result=bc153f7e.fbb error=0 func=logf op1=3f87be56 result=3d709602.538 error=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst index 19665817153d..879d12864afe 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst @@ -1,1418 +1,1418 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00100000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=1fffffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=3bdfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=3be00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=3fe00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=3ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=40000000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=40080000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=40120000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=40180000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=407ff800.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=408ff800.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=43e00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=00000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=00000000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=80000000.00000001 result=7ff00000.00000000 
errno=ERANGE status=z func=pow op1=00000000.00000000 op2=80100000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=9fffffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=bbdfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=bbe00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=bfe00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=bff00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c0000000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c0080000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c0120000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c07f3000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c090ce00.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=00000000.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=00000000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=00000000.00000001 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 
op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=3bdfffff.ffffffff result=3fefffff.ffffffff.d17 errno=0 func=pow op1=00000000.00000001 op2=3be00000.00000000 result=3fefffff.ffffffff.d17 errno=0 func=pow op1=00000000.00000001 op2=3fe00000.00000000 result=1e600000.00000000 errno=0 func=pow op1=00000000.00000001 op2=3ff00000.00000000 result=00000000.00000001 errno=0 status=u func=pow op1=00000000.00000001 op2=40000000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=40080000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=40120000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=40180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=407ff800.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=408ff800.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=00000000.00000001 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00000000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=00000000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=00000000.00000001 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=9fffffff.ffffffff 
result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=bbdfffff.ffffffff result=3ff00000.00000000.174 errno=0 func=pow op1=00000000.00000001 op2=bbe00000.00000000 result=3ff00000.00000000.174 errno=0 func=pow op1=00000000.00000001 op2=bfe00000.00000000 result=61800000.00000000 errno=0 func=pow op1=00000000.00000001 op2=bff00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c0000000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c0080000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c0120000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c07f3000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c090ce00.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00000000.00000001 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=00000000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=00000000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=00100000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=3bdfffff.ffffffff result=3fefffff.ffffffff.d3b errno=0 
func=pow op1=00100000.00000000 op2=3be00000.00000000 result=3fefffff.ffffffff.d3b errno=0 func=pow op1=00100000.00000000 op2=3fe00000.00000000 result=20000000.00000000 errno=0 func=pow op1=00100000.00000000 op2=3ff00000.00000000 result=00100000.00000000 errno=0 func=pow op1=00100000.00000000 op2=40000000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=40080000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=40120000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=40180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=407ff800.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=408ff800.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=00100000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=00100000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=00100000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=00100000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=bbdfffff.ffffffff result=3ff00000.00000000.162 errno=0 func=pow op1=00100000.00000000 op2=bbe00000.00000000 result=3ff00000.00000000.162 errno=0 func=pow op1=00100000.00000000 
op2=bfe00000.00000000 result=5fe00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=bff00000.00000000 result=7fd00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=c0000000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c0080000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c0120000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c07f3000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c090ce00.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=00100000.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=00100000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=00100000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=3bdfffff.ffffffff result=3fefffff.ffffffff.e9d errno=0 func=pow op1=1fffffff.ffffffff op2=3be00000.00000000 result=3fefffff.ffffffff.e9d errno=0 func=pow op1=1fffffff.ffffffff op2=3fe00000.00000000 result=2ff6a09e.667f3bcc.360 errno=0 func=pow op1=1fffffff.ffffffff op2=3ff00000.00000000 result=1fffffff.ffffffff 
errno=0 func=pow op1=1fffffff.ffffffff op2=40000000.00000000 result=000fffff.ffffffff errno=0 status=u func=pow op1=1fffffff.ffffffff op2=40080000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=40120000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=40180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=407ff800.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=408ff800.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=1fffffff.ffffffff op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=1fffffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=bbdfffff.ffffffff result=3ff00000.00000000.0b1 errno=0 func=pow op1=1fffffff.ffffffff op2=bbe00000.00000000 result=3ff00000.00000000.0b1 errno=0 func=pow op1=1fffffff.ffffffff op2=bfe00000.00000000 result=4fe6a09e.667f3bcc.eb0 errno=0 func=pow op1=1fffffff.ffffffff op2=bff00000.00000000 result=5fe00000.00000000.800 errno=0 func=pow op1=1fffffff.ffffffff op2=c0000000.00000000 result=7fd00000.00000001 errno=0 func=pow op1=1fffffff.ffffffff 
op2=c0080000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=c0120000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=c07f3000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=c090ce00.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=1fffffff.ffffffff op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=1fffffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=1fffffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=3bdfffff.ffffffff result=3fefffff.ffffffff.fff errno=0 func=pow op1=3fe00000.00000000 op2=3be00000.00000000 result=3fefffff.ffffffff.fff errno=0 func=pow op1=3fe00000.00000000 op2=3fe00000.00000000 result=3fe6a09e.667f3bcc.908 errno=0 func=pow op1=3fe00000.00000000 op2=3ff00000.00000000 result=3fe00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=40000000.00000000 result=3fd00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=40080000.00000000 result=3fc00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=40120000.00000000 result=3fa6a09e.667f3bcc.908 errno=0 
func=pow op1=3fe00000.00000000 op2=40180000.00000000 result=3f900000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=407ff800.00000000 result=1ff6a09e.667f3bcc.908 errno=0 func=pow op1=3fe00000.00000000 op2=408ff800.00000000 result=00080000.00000000 errno=0 status=u func=pow op1=3fe00000.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fe00000.00000000 op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fe00000.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fe00000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3fe00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=bbdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=bbe00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=bfe00000.00000000 result=3ff6a09e.667f3bcc.908 errno=0 func=pow op1=3fe00000.00000000 op2=bff00000.00000000 result=40000000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=c0000000.00000000 result=40100000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=c0080000.00000000 result=40200000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=c0120000.00000000 result=4036a09e.667f3bcc.908 errno=0 func=pow op1=3fe00000.00000000 op2=c0180000.00000000 result=40500000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=c07f3000.00000000 result=5f200000.00000000 errno=0 func=pow op1=3fe00000.00000000 
op2=c090ce00.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fe00000.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fe00000.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fe00000.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fe00000.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=3fe00000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3fe00000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=3bdfffff.ffffffff result=3fefffff.ffffffff.fff errno=0 func=pow op1=3fef9800.00000000 op2=3be00000.00000000 result=3fefffff.ffffffff.fff errno=0 func=pow op1=3fef9800.00000000 op2=3fe00000.00000000 result=3fefcbd5.7acb4a6e.860 errno=0 func=pow op1=3fef9800.00000000 op2=3ff00000.00000000 result=3fef9800.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=40000000.00000000 result=3fef3152.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=40080000.00000000 result=3feecbf1.b5800000 errno=0 func=pow op1=3fef9800.00000000 op2=40120000.00000000 result=3fee3649.b95eb051.74b errno=0 func=pow op1=3fef9800.00000000 op2=40180000.00000000 result=3feda378.fe2081dd.720 errno=0 func=pow op1=3fef9800.00000000 op2=407ff800.00000000 result=3f57c7a0.fdc7f7ec.294 errno=0 func=pow op1=3fef9800.00000000 op2=408ff800.00000000 result=3ec1abd4.ca4dcd2b.5aa errno=0 func=pow op1=3fef9800.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fef9800.00000000 
op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fef9800.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fef9800.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3fef9800.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=bbdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=bbe00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=bfe00000.00000000 result=3ff01a40.0d91bee3.a6e errno=0 func=pow op1=3fef9800.00000000 op2=bff00000.00000000 result=3ff034ab.2c50040d.2ac errno=0 func=pow op1=3fef9800.00000000 op2=c0000000.00000000 result=3ff06a03.b86753dd.bb6 errno=0 func=pow op1=3fef9800.00000000 op2=c0080000.00000000 result=3ff0a00b.defc06f4.558 errno=0 func=pow op1=3fef9800.00000000 op2=c0120000.00000000 result=3ff0f266.4d09b66a.72f errno=0 func=pow op1=3fef9800.00000000 op2=c0180000.00000000 result=3ff14658.ab6c8d31.ec8 errno=0 func=pow op1=3fef9800.00000000 op2=c07f3000.00000000 result=40825a4f.79ba0328.8d7 errno=0 func=pow op1=3fef9800.00000000 op2=c090ce00.00000000 result=412c5521.b1a8d47f.54d errno=0 func=pow op1=3fef9800.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fef9800.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fef9800.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow 
op1=3fef9800.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=3fef9800.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3fef9800.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3fefffff.ffffffe0 op2=4386128b.68cf9fbc result=003ff70a.f0af9c79.372 errno=0 func=pow op1=3fefffff.ffffffe0 op2=c386128b.68cf9fbc result=7fa0047b.c8f04d90.332 errno=0 func=pow op1=3fefffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=3fe00000.00000000 result=3fefffff.ffffffff.800 errno=0 func=pow op1=3fefffff.ffffffff op2=3ff00000.00000000 result=3fefffff.ffffffff errno=0 func=pow op1=3fefffff.ffffffff op2=40000000.00000000 result=3fefffff.fffffffe errno=0 func=pow op1=3fefffff.ffffffff op2=40080000.00000000 result=3fefffff.fffffffd errno=0 func=pow op1=3fefffff.ffffffff op2=40120000.00000000 result=3fefffff.fffffffb.800 errno=0 func=pow op1=3fefffff.ffffffff op2=40180000.00000000 result=3fefffff.fffffffa errno=0 func=pow op1=3fefffff.ffffffff op2=407ff800.00000000 result=3fefffff.fffffe00.800 errno=0 func=pow op1=3fefffff.ffffffff op2=408ff800.00000000 result=3fefffff.fffffc01 errno=0 func=pow op1=3fefffff.ffffffff op2=4386128b.68cf9fbc result=3df1d45f.3e91e17c.d0c errno=0 func=pow op1=3fefffff.ffffffff op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fefffff.ffffffff op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fefffff.ffffffff op2=7fefffff.ffffffff 
result=00000000.00000000 errno=ERANGE status=u func=pow op1=3fefffff.ffffffff op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3fefffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=bbdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=bbe00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=bfe00000.00000000 result=3ff00000.00000000.400 errno=0 func=pow op1=3fefffff.ffffffff op2=bff00000.00000000 result=3ff00000.00000000.800 errno=0 func=pow op1=3fefffff.ffffffff op2=c0000000.00000000 result=3ff00000.00000001 errno=0 func=pow op1=3fefffff.ffffffff op2=c0080000.00000000 result=3ff00000.00000001.800 errno=0 func=pow op1=3fefffff.ffffffff op2=c0120000.00000000 result=3ff00000.00000002.400 errno=0 func=pow op1=3fefffff.ffffffff op2=c0180000.00000000 result=3ff00000.00000003 errno=0 func=pow op1=3fefffff.ffffffff op2=c07f3000.00000000 result=3ff00000.000000f9.800 errno=0 func=pow op1=3fefffff.ffffffff op2=c090ce00.00000000 result=3ff00000.00000219.c00 errno=0 func=pow op1=3fefffff.ffffffff op2=c386128b.68cf9fbc result=41ecb761.33b97fcc.60b errno=0 func=pow op1=3fefffff.ffffffff op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fefffff.ffffffff op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fefffff.ffffffff op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3fefffff.ffffffff op2=fff00000.00000000 
result=7ff00000.00000000 errno=0 func=pow op1=3fefffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3fefffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=3fe00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=3ff00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=40000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=40080000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=40120000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=40180000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=407ff800.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=408ff800.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=43dfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=43e00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=7fefffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=7ff00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3ff00000.00000000 op2=7ff80000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=80000000.00000000 
result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=bbdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=bbe00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=bfe00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=bff00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c0000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c0080000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c0120000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c0180000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c07f3000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c090ce00.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c3dfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=c3e00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=ffefffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=fff00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3ff00000.00000000 op2=fff80000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=1fffffff.ffffffff 
result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=3fe00000.00000000 result=3ff00000.00000000.800 errno=0 func=pow op1=3ff00000.00000001 op2=3ff00000.00000000 result=3ff00000.00000001 errno=0 func=pow op1=3ff00000.00000001 op2=40000000.00000000 result=3ff00000.00000002 errno=0 func=pow op1=3ff00000.00000001 op2=40080000.00000000 result=3ff00000.00000003 errno=0 func=pow op1=3ff00000.00000001 op2=40120000.00000000 result=3ff00000.00000004.800 errno=0 func=pow op1=3ff00000.00000001 op2=40180000.00000000 result=3ff00000.00000006 errno=0 func=pow op1=3ff00000.00000001 op2=407ff800.00000000 result=3ff00000.000001ff.800 errno=0 func=pow op1=3ff00000.00000001 op2=408ff800.00000000 result=3ff00000.000003ff errno=0 func=pow op1=3ff00000.00000001 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3ff00000.00000001 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3ff00000.00000001 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3ff00000.00000001 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3ff00000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=bbdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=bbe00000.00000000 result=3ff00000.00000000 errno=0 func=pow 
op1=3ff00000.00000001 op2=bfe00000.00000000 result=3fefffff.ffffffff errno=0 func=pow op1=3ff00000.00000001 op2=bff00000.00000000 result=3fefffff.fffffffe errno=0 func=pow op1=3ff00000.00000001 op2=c0000000.00000000 result=3fefffff.fffffffc errno=0 func=pow op1=3ff00000.00000001 op2=c0080000.00000000 result=3fefffff.fffffffa errno=0 func=pow op1=3ff00000.00000001 op2=c0120000.00000000 result=3fefffff.fffffff7 errno=0 func=pow op1=3ff00000.00000001 op2=c0180000.00000000 result=3fefffff.fffffff4 errno=0 func=pow op1=3ff00000.00000001 op2=c07f3000.00000000 result=3fefffff.fffffc1a errno=0 func=pow op1=3ff00000.00000001 op2=c090ce00.00000000 result=3fefffff.fffff799 errno=0 func=pow op1=3ff00000.00000001 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3ff00000.00000001 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=3ff00000.00000001 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3ff00000.00000001 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=3ff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3ff00000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=3fe00000.00000000 result=3ff019eb.020ee283.520 errno=0 func=pow op1=3ff03400.00000000 op2=3ff00000.00000000 result=3ff03400.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=40000000.00000000 
result=3ff068a9.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=40080000.00000000 result=3ff09dfd.25400000 errno=0 func=pow op1=3ff03400.00000000 op2=40120000.00000000 result=3ff0ef41.05a27f91.ece errno=0 func=pow op1=3ff03400.00000000 op2=40180000.00000000 result=3ff14212.5220325e.b90 errno=0 func=pow op1=3ff03400.00000000 op2=407ff800.00000000 result=4083d3b3.8a3213c3.297 errno=0 func=pow op1=3ff03400.00000000 op2=408ff800.00000000 result=411891bb.7f728082.d88 errno=0 func=pow op1=3ff03400.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3ff03400.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3ff03400.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=3ff03400.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3ff03400.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=bbdfffff.ffffffff result=3fefffff.ffffffff.fff errno=0 func=pow op1=3ff03400.00000000 op2=bbe00000.00000000 result=3fefffff.ffffffff.fff errno=0 func=pow op1=3ff03400.00000000 op2=bfe00000.00000000 result=3fefcc7d.6c7d2e30.865 errno=0 func=pow op1=3ff03400.00000000 op2=bff00000.00000000 result=3fef994d.c3455e8c.b6a errno=0 func=pow op1=3ff03400.00000000 op2=c0000000.00000000 result=3fef33e5.1aaea6ee.309 errno=0 func=pow op1=3ff03400.00000000 op2=c0080000.00000000 result=3feecfc1.e487ed2b.638 errno=0 func=pow op1=3ff03400.00000000 op2=c0120000.00000000 
result=3fee3be6.60bd4449.151 errno=0 func=pow op1=3ff03400.00000000 op2=c0180000.00000000 result=3fedaad0.65924e45.6c1 errno=0 func=pow op1=3ff03400.00000000 op2=c07f3000.00000000 result=3f5e3bf6.471a7841.69b errno=0 func=pow op1=3ff03400.00000000 op2=c090ce00.00000000 result=3eb57de0.09c1a44f.f1a errno=0 func=pow op1=3ff03400.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3ff03400.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=3ff03400.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=3ff03400.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=3ff03400.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=3ff03400.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=3fe00000.00000000 result=3ff6a09e.667f3bcc.908 errno=0 func=pow op1=40000000.00000000 op2=3ff00000.00000000 result=40000000.00000000 errno=0 func=pow op1=40000000.00000000 op2=40000000.00000000 result=40100000.00000000 errno=0 func=pow op1=40000000.00000000 op2=40080000.00000000 result=40200000.00000000 errno=0 func=pow op1=40000000.00000000 op2=40120000.00000000 result=4036a09e.667f3bcc.908 errno=0 func=pow op1=40000000.00000000 op2=40180000.00000000 result=40500000.00000000 errno=0 func=pow op1=40000000.00000000 op2=407ff800.00000000 result=5fe6a09e.667f3bcc.908 
errno=0 func=pow op1=40000000.00000000 op2=408ff800.00000000 result=7fe00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40000000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40000000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40000000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40000000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=bbdfffff.ffffffff result=3fefffff.ffffffff.fff errno=0 func=pow op1=40000000.00000000 op2=bbe00000.00000000 result=3fefffff.ffffffff.fff errno=0 func=pow op1=40000000.00000000 op2=bfe00000.00000000 result=3fe6a09e.667f3bcc.908 errno=0 func=pow op1=40000000.00000000 op2=bff00000.00000000 result=3fe00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=c0000000.00000000 result=3fd00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=c0080000.00000000 result=3fc00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=c0120000.00000000 result=3fa6a09e.667f3bcc.908 errno=0 func=pow op1=40000000.00000000 op2=c0180000.00000000 result=3f900000.00000000 errno=0 func=pow op1=40000000.00000000 op2=c07f3000.00000000 result=20c00000.00000000 errno=0 func=pow op1=40000000.00000000 op2=c08f3a00.00000000 result=017ae89f.995ad3ad.5e8 errno=0 func=pow op1=40000000.00000000 op2=c090ce00.00000000 result=00000000.00000000.5a8 errno=ERANGE status=u func=pow 
op1=40000000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40000000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40000000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40000000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=40000000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40000000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40080000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=3fe00000.00000000 result=3ffbb67a.e8584caa.73b errno=0 func=pow op1=40080000.00000000 op2=3ff00000.00000000 result=40080000.00000000 errno=0 func=pow op1=40080000.00000000 op2=40000000.00000000 result=40220000.00000000 errno=0 func=pow op1=40080000.00000000 op2=40080000.00000000 result=403b0000.00000000 errno=0 func=pow op1=40080000.00000000 op2=40120000.00000000 result=40618979.c707e083.dd3 errno=0 func=pow op1=40080000.00000000 op2=40180000.00000000 result=4086c800.00000000 errno=0 func=pow op1=40080000.00000000 op2=407ff800.00000000 result=729a2473.a65e6847.3ca errno=0 func=pow op1=40080000.00000000 op2=408ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40080000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40080000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow 
op1=40080000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40080000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40080000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40080000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40080000.00000000 op2=bbdfffff.ffffffff result=3fefffff.ffffffff.ffe errno=0 func=pow op1=40080000.00000000 op2=bbe00000.00000000 result=3fefffff.ffffffff.ffe errno=0 func=pow op1=40080000.00000000 op2=bfe00000.00000000 result=3fe279a7.4590331c.4d2 errno=0 func=pow op1=40080000.00000000 op2=bff00000.00000000 result=3fd55555.55555555.555 errno=0 func=pow op1=40080000.00000000 op2=c0000000.00000000 result=3fbc71c7.1c71c71c.71c errno=0 func=pow op1=40080000.00000000 op2=c0080000.00000000 result=3fa2f684.bda12f68.4bd errno=0 func=pow op1=40080000.00000000 op2=c0120000.00000000 result=3f7d3205.2b8e89a7.fb7 errno=0 func=pow op1=40080000.00000000 op2=c0180000.00000000 result=3f567980.e0bf08c7.765 errno=0 func=pow op1=40080000.00000000 op2=c07f3000.00000000 result=0e81314b.59b2f0d0.9a8 errno=0 func=pow op1=40080000.00000000 op2=c090ce00.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40080000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40080000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40080000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40080000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 
func=pow op1=40080000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40080000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40120000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=3fe00000.00000000 result=4000f876.ccdf6cd9.6c6 errno=0 func=pow op1=40120000.00000000 op2=3ff00000.00000000 result=40120000.00000000 errno=0 func=pow op1=40120000.00000000 op2=40000000.00000000 result=40344000.00000000 errno=0 func=pow op1=40120000.00000000 op2=40080000.00000000 result=4056c800.00000000 errno=0 func=pow op1=40120000.00000000 op2=40120000.00000000 result=408b2efd.cb8aa24b.053 errno=0 func=pow op1=40120000.00000000 op2=40180000.00000000 result=40c037e2.00000000 errno=0 func=pow op1=40120000.00000000 op2=407ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40120000.00000000 op2=408ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40120000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40120000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40120000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40120000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40120000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow 
op1=40120000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40120000.00000000 op2=bbdfffff.ffffffff result=3fefffff.ffffffff.ffe errno=0 func=pow op1=40120000.00000000 op2=bbe00000.00000000 result=3fefffff.ffffffff.ffe errno=0 func=pow op1=40120000.00000000 op2=bfe00000.00000000 result=3fde2b7d.ddfefa66.160 errno=0 func=pow op1=40120000.00000000 op2=bff00000.00000000 result=3fcc71c7.1c71c71c.71c errno=0 func=pow op1=40120000.00000000 op2=c0000000.00000000 result=3fa948b0.fcd6e9e0.652 errno=0 func=pow op1=40120000.00000000 op2=c0080000.00000000 result=3f867980.e0bf08c7.765 errno=0 func=pow op1=40120000.00000000 op2=c0120000.00000000 result=3f52d5bc.e225fd84.857 errno=0 func=pow op1=40120000.00000000 op2=c0180000.00000000 result=3f1f91bd.1b62b9ce.c8a errno=0 func=pow op1=40120000.00000000 op2=c07f3000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40120000.00000000 op2=c090ce00.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40120000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40120000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40120000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40120000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=40120000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40120000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40180000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=00000000.00000001 result=3ff00000.00000000 errno=0 
func=pow op1=40180000.00000000 op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=3bdfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=3be00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=3fe00000.00000000 result=4003988e.1409212e.7d0 errno=0 func=pow op1=40180000.00000000 op2=3ff00000.00000000 result=40180000.00000000 errno=0 func=pow op1=40180000.00000000 op2=40000000.00000000 result=40420000.00000000 errno=0 func=pow op1=40180000.00000000 op2=40080000.00000000 result=406b0000.00000000 errno=0 func=pow op1=40180000.00000000 op2=40120000.00000000 result=40a8cd13.d15b8dfe.d63 errno=0 func=pow op1=40180000.00000000 op2=40180000.00000000 result=40e6c800.00000000 errno=0 func=pow op1=40180000.00000000 op2=407ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40180000.00000000 op2=408ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40180000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40180000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40180000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=40180000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40180000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=40180000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=40180000.00000000 op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow 
op1=40180000.00000000 op2=bbdfffff.ffffffff result=3fefffff.ffffffff.ffe errno=0 func=pow op1=40180000.00000000 op2=bbe00000.00000000 result=3fefffff.ffffffff.ffe errno=0 func=pow op1=40180000.00000000 op2=bfe00000.00000000 result=3fda20bd.700c2c3d.fc0 errno=0 func=pow op1=40180000.00000000 op2=bff00000.00000000 result=3fc55555.55555555.555 errno=0 func=pow op1=40180000.00000000 op2=c0000000.00000000 result=3f9c71c7.1c71c71c.71c errno=0 func=pow op1=40180000.00000000 op2=c0080000.00000000 result=3f72f684.bda12f68.4bd errno=0 func=pow op1=40180000.00000000 op2=c0120000.00000000 result=3f34a4ee.2c48d3f1.c3f errno=0 func=pow op1=40180000.00000000 op2=c0180000.00000000 result=3ef67980.e0bf08c7.765 errno=0 func=pow op1=40180000.00000000 op2=c07f3000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40180000.00000000 op2=c090ce00.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40180000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40180000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=40180000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=40180000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=40180000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=40180000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=3bdfffff.ffffffff result=3ff00000.00000000.053 errno=0 func=pow op1=4f0fffff.ffffffff op2=3be00000.00000000 result=3ff00000.00000000.053 
errno=0 func=pow op1=4f0fffff.ffffffff op2=3fe00000.00000000 result=477fffff.ffffffff.800 errno=0 func=pow op1=4f0fffff.ffffffff op2=3ff00000.00000000 result=4f0fffff.ffffffff errno=0 func=pow op1=4f0fffff.ffffffff op2=40000000.00000000 result=5e2fffff.fffffffe errno=0 func=pow op1=4f0fffff.ffffffff op2=40080000.00000000 result=6d4fffff.fffffffd errno=0 func=pow op1=4f0fffff.ffffffff op2=40120000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=40180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=407ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=408ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=4f0fffff.ffffffff op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=4f0fffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=bbdfffff.ffffffff result=3fefffff.ffffffff.f58 errno=0 func=pow op1=4f0fffff.ffffffff op2=bbe00000.00000000 result=3fefffff.ffffffff.f58 errno=0 func=pow op1=4f0fffff.ffffffff op2=bfe00000.00000000 result=38600000.00000000.400 errno=0 func=pow op1=4f0fffff.ffffffff op2=bff00000.00000000 
result=30d00000.00000000.800 errno=0 func=pow op1=4f0fffff.ffffffff op2=c0000000.00000000 result=21b00000.00000001 errno=0 func=pow op1=4f0fffff.ffffffff op2=c0080000.00000000 result=12900000.00000001.800 errno=0 func=pow op1=4f0fffff.ffffffff op2=c0120000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=c0180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=c07f3000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=c090ce00.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=4f0fffff.ffffffff op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=4f0fffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=4f0fffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=00000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=00100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=1fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=3bdfffff.ffffffff result=3ff00000.00000000.162 errno=0 func=pow op1=7fefffff.ffffffff op2=3be00000.00000000 result=3ff00000.00000000.162 errno=0 func=pow op1=7fefffff.ffffffff op2=3fe00000.00000000 result=5fefffff.ffffffff.800 errno=0 func=pow op1=7fefffff.ffffffff op2=3ff00000.00000000 result=7fefffff.ffffffff errno=0 func=pow op1=7fefffff.ffffffff op2=40000000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow 
op1=7fefffff.ffffffff op2=40080000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=40120000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=40180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=407ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=408ff800.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=4386128b.68cf9fbc result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=7fefffff.ffffffff op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7fefffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=80000000.00000001 result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=80100000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=9fffffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=bbdfffff.ffffffff result=3fefffff.ffffffff.d3a errno=0 func=pow op1=7fefffff.ffffffff op2=bbe00000.00000000 result=3fefffff.ffffffff.d3a errno=0 func=pow op1=7fefffff.ffffffff op2=bfe00000.00000000 result=1ff00000.00000000.400 errno=0 func=pow op1=7fefffff.ffffffff op2=bff00000.00000000 result=00040000.00000000.200 errno=0 status=u func=pow op1=7fefffff.ffffffff op2=c0000000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow 
op1=7fefffff.ffffffff op2=c0080000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c0120000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c0180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c07f3000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c090ce00.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c386128b.68cf9fbc result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=7fefffff.ffffffff op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=7fefffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7fefffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=00000000.00000001 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=00100000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=1fffffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=3bdfffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=3be00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=3fe00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=3ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=40000000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=40080000.00000000 
result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=40120000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=40180000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=407ff800.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=408ff800.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=80000000.00000001 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=80100000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=9fffffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=bbdfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=bbe00000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=bfe00000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=bff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c0000000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c0080000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c0120000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c0180000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c07f3000.00000000 
result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c090ce00.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=7ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7ff00000.00000001 op2=00000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=00000000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=00100000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=3be00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=3fe00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=40000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=40080000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=40120000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=40180000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=407ff800.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=408ff800.00000000 result=7ff80000.00000000 errno=0 status=i func=pow 
op1=7ff00000.00000001 op2=43dfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=43e00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=7fefffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=80000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=80000000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=80100000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=bbe00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=bfe00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c0000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c0080000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c0120000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c0180000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c07f3000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c090ce00.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=c3dfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow 
op1=7ff00000.00000001 op2=c3e00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=ffefffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff00000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff80000.00000001 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=00000000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=00100000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=3be00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=3fe00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=40000000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=40080000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=40120000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=40180000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=407ff800.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=408ff800.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=43dfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=43e00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=7fefffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=7ff00000.00000000 
result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=80000000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=80100000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=bbe00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=bfe00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c0000000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c0080000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c0120000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c0180000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c07f3000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c090ce00.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c3dfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=c3e00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=ffefffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=7ff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=7ff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=80000000.00000000 op2=00000000.00000000 
result=3ff00000.00000000 errno=0 func=pow op1=80000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=00100000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=1fffffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=3bdfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=3be00000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=3fe00000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=3ff00000.00000000 result=80000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=40000000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=40080000.00000000 result=80000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=40120000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=40180000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=407ff800.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=408ff800.00000000 result=80000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=43e00000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=80000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=80000000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=80000000.00000000 op2=80000000.00000001 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=80100000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 
op2=9fffffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=bbdfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=bbe00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=bfe00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=bff00000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c0000000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c0080000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c0120000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c07f3000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c090ce00.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=z func=pow op1=80000000.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=80000000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=80000000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=80000000.00000001 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=80000000.00000001 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=1fffffff.ffffffff result=7ff80000.00000000 
errno=EDOM status=i func=pow op1=80000000.00000001 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=3ff00000.00000000 result=80000000.00000001 errno=0 status=u func=pow op1=80000000.00000001 op2=40000000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=40080000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=40180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=408ff800.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=80000000.00000001 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=80000000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=80000000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=80000000.00000001 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=80000000.00000001 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 
op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=bff00000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=c0000000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=c0080000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=c07f3000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80000000.00000001 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80000000.00000001 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=80000000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=80000000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=80100000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=80100000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM 
status=i func=pow op1=80100000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=3ff00000.00000000 result=80100000.00000000 errno=0 func=pow op1=80100000.00000000 op2=40000000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=40080000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=40180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=408ff800.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=80100000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=80100000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=80100000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=80100000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=80100000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=bbe00000.00000000 
result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=bff00000.00000000 result=ffd00000.00000000 errno=0 func=pow op1=80100000.00000000 op2=c0000000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=c0080000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=c07f3000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=80100000.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=80100000.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=80100000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=80100000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=9fffffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=9fffffff.ffffffff op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff 
op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=3ff00000.00000000 result=9fffffff.ffffffff errno=0 func=pow op1=9fffffff.ffffffff op2=40000000.00000000 result=000fffff.ffffffff errno=0 status=u func=pow op1=9fffffff.ffffffff op2=40080000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=9fffffff.ffffffff op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=40180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=9fffffff.ffffffff op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=408ff800.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=9fffffff.ffffffff op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=9fffffff.ffffffff op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=9fffffff.ffffffff op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=9fffffff.ffffffff op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=9fffffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=9fffffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=9fffffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=9fffffff.ffffffff op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow 
op1=9fffffff.ffffffff op2=bff00000.00000000 result=dfe00000.00000000.800 errno=0 func=pow op1=9fffffff.ffffffff op2=c0000000.00000000 result=7fd00000.00000001 errno=0 func=pow op1=9fffffff.ffffffff op2=c0080000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=9fffffff.ffffffff op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=c0180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=9fffffff.ffffffff op2=c07f3000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=9fffffff.ffffffff op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=9fffffff.ffffffff op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=9fffffff.ffffffff op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=9fffffff.ffffffff op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=9fffffff.ffffffff op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=9fffffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=9fffffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=3ff00000.00000000 result=bfe00000.00000000 errno=0 func=pow 
op1=bfe00000.00000000 op2=40000000.00000000 result=3fd00000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=40080000.00000000 result=bfc00000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=40180000.00000000 result=3f900000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=408ff800.00000000 result=80080000.00000000 errno=0 status=u func=pow op1=bfe00000.00000000 op2=4386128b.68cf9fbc result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfe00000.00000000 op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfe00000.00000000 op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfe00000.00000000 op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfe00000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bfe00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=bff00000.00000000 result=c0000000.00000000 errno=0 func=pow op1=bfe00000.00000000 
op2=c0000000.00000000 result=40100000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=c0080000.00000000 result=c0200000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=c0180000.00000000 result=40500000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=c07f3000.00000000 result=df200000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfe00000.00000000 op2=c386128b.68cf9fbc result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfe00000.00000000 op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfe00000.00000000 op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfe00000.00000000 op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfe00000.00000000 op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=bfe00000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bfe00000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bfefffff.ffffffe0 op2=4386128b.68cf9fbc result=003ff70a.f0af9c79.372 errno=0 func=pow op1=bfefffff.ffffffe0 op2=c386128b.68cf9fbc result=7fa0047b.c8f04d90.332 errno=0 func=pow op1=bfefffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bfefffff.ffffffff op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=3fe00000.00000000 
result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=3ff00000.00000000 result=bfefffff.ffffffff errno=0 func=pow op1=bfefffff.ffffffff op2=40000000.00000000 result=3fefffff.fffffffe errno=0 func=pow op1=bfefffff.ffffffff op2=40080000.00000000 result=bfefffff.fffffffd errno=0 func=pow op1=bfefffff.ffffffff op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=40180000.00000000 result=3fefffff.fffffffa errno=0 func=pow op1=bfefffff.ffffffff op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=408ff800.00000000 result=bfefffff.fffffc01 errno=0 func=pow op1=bfefffff.ffffffff op2=4386128b.68cf9fbc result=3df1d45f.3e91e17c.d0c errno=0 func=pow op1=bfefffff.ffffffff op2=43dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfefffff.ffffffff op2=43e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfefffff.ffffffff op2=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=bfefffff.ffffffff op2=7ff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=bfefffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bfefffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bfefffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bfefffff.ffffffff op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM 
status=i func=pow op1=bfefffff.ffffffff op2=bff00000.00000000 result=bff00000.00000000.800 errno=0 func=pow op1=bfefffff.ffffffff op2=c0000000.00000000 result=3ff00000.00000001 errno=0 func=pow op1=bfefffff.ffffffff op2=c0080000.00000000 result=bff00000.00000001.800 errno=0 func=pow op1=bfefffff.ffffffff op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=c0180000.00000000 result=3ff00000.00000003 errno=0 func=pow op1=bfefffff.ffffffff op2=c07f3000.00000000 result=bff00000.000000f9.800 errno=0 func=pow op1=bfefffff.ffffffff op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bfefffff.ffffffff op2=c386128b.68cf9fbc result=41ecb761.33b97fcc.60b errno=0 func=pow op1=bfefffff.ffffffff op2=c3dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfefffff.ffffffff op2=c3e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfefffff.ffffffff op2=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bfefffff.ffffffff op2=fff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=bfefffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bfefffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow 
op1=bff00000.00000000 op2=3ff00000.00000000 result=bff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=40000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=40080000.00000000 result=bff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=40180000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=408ff800.00000000 result=bff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=43dfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=43e00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=7fefffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=7ff00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=bff00000.00000000 result=bff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=c0000000.00000000 result=3ff00000.00000000 errno=0 func=pow 
op1=bff00000.00000000 op2=c0080000.00000000 result=bff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=c0180000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=c07f3000.00000000 result=bff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000000 op2=c3dfffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=c3e00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=ffefffff.ffffffff result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=fff00000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bff00000.00000001 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000001 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=3ff00000.00000000 result=bff00000.00000001 errno=0 func=pow op1=bff00000.00000001 op2=40000000.00000000 result=3ff00000.00000002 errno=0 func=pow op1=bff00000.00000001 op2=40080000.00000000 result=bff00000.00000003 errno=0 func=pow op1=bff00000.00000001 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i 
func=pow op1=bff00000.00000001 op2=40180000.00000000 result=3ff00000.00000006 errno=0 func=pow op1=bff00000.00000001 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=408ff800.00000000 result=bff00000.000003ff errno=0 func=pow op1=bff00000.00000001 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bff00000.00000001 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bff00000.00000001 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=bff00000.00000001 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=bff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bff00000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=bff00000.00000001 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=bff00000.00000001 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=bff00000.00000000 result=bfefffff.fffffffe errno=0 func=pow op1=bff00000.00000001 op2=c0000000.00000000 result=3fefffff.fffffffc errno=0 func=pow op1=bff00000.00000001 op2=c0080000.00000000 result=bfefffff.fffffffa errno=0 func=pow op1=bff00000.00000001 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=c0180000.00000000 result=3fefffff.fffffff4 errno=0 func=pow op1=bff00000.00000001 
op2=c07f3000.00000000 result=bfefffff.fffffc1a errno=0 func=pow op1=bff00000.00000001 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=bff00000.00000001 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=bff00000.00000001 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=bff00000.00000001 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=bff00000.00000001 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=bff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=bff00000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=3ff00000.00000000 result=c0000000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=40000000.00000000 result=40100000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=40080000.00000000 result=c0200000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=40180000.00000000 result=40500000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=408ff800.00000000 
result=ffe00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0000000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0000000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0000000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=bff00000.00000000 result=bfe00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=c0000000.00000000 result=3fd00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=c0080000.00000000 result=bfc00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=c0180000.00000000 result=3f900000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=c07f3000.00000000 result=a0c00000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0000000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u 
func=pow op1=c0000000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0000000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0000000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=c0000000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0000000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=3ff00000.00000000 result=c0080000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=40000000.00000000 result=40220000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=40080000.00000000 result=c03b0000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=40180000.00000000 result=4086c800.00000000 errno=0 func=pow op1=c0080000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=408ff800.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=c0080000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0080000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow 
op1=c0080000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0080000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0080000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=bff00000.00000000 result=bfd55555.55555555.555 errno=0 func=pow op1=c0080000.00000000 op2=c0000000.00000000 result=3fbc71c7.1c71c71c.71c errno=0 func=pow op1=c0080000.00000000 op2=c0080000.00000000 result=bfa2f684.bda12f68.4bd errno=0 func=pow op1=c0080000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=c0180000.00000000 result=3f567980.e0bf08c7.765 errno=0 func=pow op1=c0080000.00000000 op2=c07f3000.00000000 result=8e81314b.59b2f0d0.9a8 errno=0 func=pow op1=c0080000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0080000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0080000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0080000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow 
op1=c0080000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=c0080000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0080000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=3ff00000.00000000 result=c0120000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=40000000.00000000 result=40344000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=40080000.00000000 result=c056c800.00000000 errno=0 func=pow op1=c0120000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=40180000.00000000 result=40c037e2.00000000 errno=0 func=pow op1=c0120000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=408ff800.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=c0120000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0120000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0120000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0120000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=c0120000.00000000 
op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0120000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=bff00000.00000000 result=bfcc71c7.1c71c71c.71c errno=0 func=pow op1=c0120000.00000000 op2=c0000000.00000000 result=3fa948b0.fcd6e9e0.652 errno=0 func=pow op1=c0120000.00000000 op2=c0080000.00000000 result=bf867980.e0bf08c7.765 errno=0 func=pow op1=c0120000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=c0180000.00000000 result=3f1f91bd.1b62b9ce.c8a errno=0 func=pow op1=c0120000.00000000 op2=c07f3000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=c0120000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0120000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0120000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0120000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0120000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=c0120000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0120000.00000000 
op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=00000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=3ff00000.00000000 result=c0180000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=40000000.00000000 result=40420000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=40080000.00000000 result=c06b0000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=40180000.00000000 result=40e6c800.00000000 errno=0 func=pow op1=c0180000.00000000 op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=408ff800.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=c0180000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0180000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0180000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=c0180000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0180000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=80000000.00000000 
result=3ff00000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=bff00000.00000000 result=bfc55555.55555555.555 errno=0 func=pow op1=c0180000.00000000 op2=c0000000.00000000 result=3f9c71c7.1c71c71c.71c errno=0 func=pow op1=c0180000.00000000 op2=c0080000.00000000 result=bf72f684.bda12f68.4bd errno=0 func=pow op1=c0180000.00000000 op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=c0180000.00000000 result=3ef67980.e0bf08c7.765 errno=0 func=pow op1=c0180000.00000000 op2=c07f3000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=c0180000.00000000 op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=c0180000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0180000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0180000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=c0180000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=c0180000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=c0180000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=ffefffff.ffffffff op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=ffefffff.ffffffff op2=00000000.00000001 
result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=00100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=1fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=3be00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=3fe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=3ff00000.00000000 result=ffefffff.ffffffff errno=0 func=pow op1=ffefffff.ffffffff op2=40000000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=40080000.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=40120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=40180000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=407ff800.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=408ff800.00000000 result=fff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=43dfffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=43e00000.00000000 result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=o func=pow op1=ffefffff.ffffffff op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=ffefffff.ffffffff op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=ffefffff.ffffffff op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=ffefffff.ffffffff op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=ffefffff.ffffffff op2=80000000.00000001 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff 
op2=80100000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=9fffffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=bbe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=bfe00000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=bff00000.00000000 result=80040000.00000000.200 errno=0 status=u func=pow op1=ffefffff.ffffffff op2=c0000000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=c0080000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=c0120000.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=c0180000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=c07f3000.00000000 result=80000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=c090ce00.00000000 result=7ff80000.00000000 errno=EDOM status=i func=pow op1=ffefffff.ffffffff op2=c3dfffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=c3e00000.00000000 result=00000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=u func=pow op1=ffefffff.ffffffff op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=ffefffff.ffffffff op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=ffefffff.ffffffff op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=00000000.00000001 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=00100000.00000000 result=7ff00000.00000000 errno=0 func=pow 
op1=fff00000.00000000 op2=1fffffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=3bdfffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=3be00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=3fe00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=3ff00000.00000000 result=fff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=40000000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=40080000.00000000 result=fff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=40120000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=40180000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=407ff800.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=408ff800.00000000 result=fff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=43dfffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=43e00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=7fefffff.ffffffff result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=80000000.00000001 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=80100000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=9fffffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=bbdfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=bbe00000.00000000 result=00000000.00000000 errno=0 func=pow 
op1=fff00000.00000000 op2=bfe00000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=bff00000.00000000 result=80000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c0000000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c0080000.00000000 result=80000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c0120000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c0180000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c07f3000.00000000 result=80000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c090ce00.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c3dfffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=c3e00000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=ffefffff.ffffffff result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=fff00000.00000000 result=00000000.00000000 errno=0 func=pow op1=fff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=fff00000.00000001 op2=00000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=00000000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=00100000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=3be00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=3fe00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 
op2=40000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=40080000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=40120000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=40180000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=407ff800.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=408ff800.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=43dfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=43e00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=7fefffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=80000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=80000000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=80100000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=bbe00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=bfe00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c0000000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 
op2=c0080000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c0120000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c0180000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c07f3000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c090ce00.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c3dfffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=c3e00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=ffefffff.ffffffff result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff00000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff80000.00000001 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=00000000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=00100000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=1fffffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=3bdfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=3be00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=3fe00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=40000000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=40080000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=40120000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 
op2=40180000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=407ff800.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=408ff800.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=43dfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=43e00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=7fefffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=80000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=80000000.00000001 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=80100000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=9fffffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=bbdfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=bbe00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=bfe00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c0000000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c0080000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c0120000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c0180000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c07f3000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c090ce00.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 
op2=c3dfffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=c3e00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=ffefffff.ffffffff result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000000 errno=0 func=pow op1=fff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000000 errno=0 status=i func=pow op1=fff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst index 3fa8b110f8bc..46d522400871 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst @@ -1,246 +1,246 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=7fc00001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ffc00001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=7f800000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=40800000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=40400000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=3f000000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=00000000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=80000000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=bf000000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=c0400000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=c0800000 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800000 result=7fc00001 errno=0 
status=i func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=7fc00001 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=ffc00001 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=7f800000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=40800000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=40400000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=3f000000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=00000000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=80000000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=bf000000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=c0400000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=c0800000 result=7fc00001 errno=0 status=i func=powf op1=ff800001 op2=ff800000 result=7fc00001 errno=0 status=i func=powf op1=7fc00001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7fc00001 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=7fc00001 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=7f800000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=40800000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=40400000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=3f000000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=00000000 result=3f800000 errno=0 func=powf op1=7fc00001 op2=80000000 result=3f800000 errno=0 func=powf op1=7fc00001 op2=bf000000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=c0400000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=c0800000 result=7fc00001 errno=0 func=powf op1=7fc00001 op2=ff800000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=ffc00001 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=ffc00001 
op2=7fc00001 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=7f800000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=40800000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=40400000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=3f000000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=00000000 result=3f800000 errno=0 func=powf op1=ffc00001 op2=80000000 result=3f800000 errno=0 func=powf op1=ffc00001 op2=bf000000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=c0400000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=c0800000 result=7fc00001 errno=0 func=powf op1=ffc00001 op2=ff800000 result=7fc00001 errno=0 func=powf op1=7f800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=7f800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=7f800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=7f800000 op2=7f800000 result=7f800000 errno=0 func=powf op1=7f800000 op2=40800000 result=7f800000 errno=0 func=powf op1=7f800000 op2=40400000 result=7f800000 errno=0 func=powf op1=7f800000 op2=3f000000 result=7f800000 errno=0 func=powf op1=7f800000 op2=00000001 result=7f800000 errno=0 func=powf op1=7f800000 op2=00000000 result=3f800000 errno=0 func=powf op1=7f800000 op2=80000000 result=3f800000 errno=0 func=powf op1=7f800000 op2=bf000000 result=00000000 errno=0 func=powf op1=7f800000 op2=c0400000 result=00000000 errno=0 func=powf op1=7f800000 op2=c0800000 result=00000000 errno=0 func=powf op1=7f800000 op2=ff800000 result=00000000 errno=0 func=powf op1=40800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=40800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=40800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=40800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=40800000 op2=7f800000 result=7f800000 errno=0 func=powf op1=40800000 op2=40800000 result=43800000 errno=0 
func=powf op1=40800000 op2=40400000 result=42800000 errno=0 func=powf op1=40800000 op2=3f000000 result=40000000 errno=0 func=powf op1=40800000 op2=00000000 result=3f800000 errno=0 func=powf op1=40800000 op2=80000000 result=3f800000 errno=0 func=powf op1=40800000 op2=bf000000 result=3f000000 errno=0 func=powf op1=40800000 op2=c0400000 result=3c800000 errno=0 func=powf op1=40800000 op2=c0800000 result=3b800000 errno=0 func=powf op1=40800000 op2=ff800000 result=00000000 errno=0 func=powf op1=3f800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=3f800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=3f800000 op2=7fc00001 result=3f800000 errno=0 func=powf op1=3f800000 op2=ffc00001 result=3f800000 errno=0 func=powf op1=3f800000 op2=7f800000 result=3f800000 errno=0 func=powf op1=3f800000 op2=40800000 result=3f800000 errno=0 func=powf op1=3f800000 op2=40400000 result=3f800000 errno=0 func=powf op1=3f800000 op2=3f000000 result=3f800000 errno=0 func=powf op1=3f800000 op2=00000000 result=3f800000 errno=0 func=powf op1=3f800000 op2=80000000 result=3f800000 errno=0 func=powf op1=3f800000 op2=bf000000 result=3f800000 errno=0 func=powf op1=3f800000 op2=c0400000 result=3f800000 errno=0 func=powf op1=3f800000 op2=c0800000 result=3f800000 errno=0 func=powf op1=3f800000 op2=ff800000 result=3f800000 errno=0 func=powf op1=3e800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=3e800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=3e800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=3e800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=3e800000 op2=7f800000 result=00000000 errno=0 func=powf op1=3e800000 op2=40800000 result=3b800000 errno=0 func=powf op1=3e800000 op2=40400000 result=3c800000 errno=0 func=powf op1=3e800000 op2=3f000000 result=3f000000 errno=0 func=powf op1=3e800000 op2=00000000 result=3f800000 errno=0 func=powf op1=3e800000 op2=80000000 result=3f800000 errno=0 func=powf op1=3e800000 op2=bf000000 
result=40000000 errno=0 func=powf op1=3e800000 op2=c0400000 result=42800000 errno=0 func=powf op1=3e800000 op2=c0800000 result=43800000 errno=0 func=powf op1=3e800000 op2=ff800000 result=7f800000 errno=0 func=powf op1=00000001 op2=bf800000 result=7f800000 errno=ERANGE status=ox func=powf op1=00000000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=00000000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=00000000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=00000000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=00000000 op2=7f800000 result=00000000 errno=0 func=powf op1=00000000 op2=40800000 result=00000000 errno=0 func=powf op1=00000000 op2=40400000 result=00000000 errno=0 func=powf op1=00000000 op2=3f000000 result=00000000 errno=0 func=powf op1=00000000 op2=00000000 result=3f800000 errno=0 func=powf op1=00000000 op2=80000000 result=3f800000 errno=0 func=powf op1=00000000 op2=bf000000 result=7f800000 errno=ERANGE status=z func=powf op1=00000000 op2=c0400000 result=7f800000 errno=ERANGE status=z func=powf op1=00000000 op2=c0800000 result=7f800000 errno=ERANGE status=z func=powf op1=00000000 op2=ff800000 result=7f800000 errno=ERANGE func=powf op1=80000000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=80000000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=80000000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=80000000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=80000000 op2=7f800000 result=00000000 errno=0 func=powf op1=80000000 op2=40800000 result=00000000 errno=0 func=powf op1=80000000 op2=40400000 result=80000000 errno=0 func=powf op1=80000000 op2=3f000000 result=00000000 errno=0 func=powf op1=80000000 op2=00000000 result=3f800000 errno=0 func=powf op1=80000000 op2=80000000 result=3f800000 errno=0 func=powf op1=80000000 op2=bf000000 result=7f800000 errno=ERANGE status=z func=powf op1=80000000 op2=c0400000 result=ff800000 errno=ERANGE status=z func=powf op1=80000000 op2=c0800000 
result=7f800000 errno=ERANGE status=z func=powf op1=80000000 op2=ff800000 result=7f800000 errno=ERANGE func=powf op1=be800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=be800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=be800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=be800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=be800000 op2=7f800000 result=00000000 errno=0 func=powf op1=be800000 op2=40800000 result=3b800000 errno=0 func=powf op1=be800000 op2=40400000 result=bc800000 errno=0 func=powf op1=be800000 op2=3f000000 result=7fc00001 errno=EDOM status=i func=powf op1=be800000 op2=00000000 result=3f800000 errno=0 func=powf op1=be800000 op2=80000000 result=3f800000 errno=0 func=powf op1=be800000 op2=bf000000 result=7fc00001 errno=EDOM status=i func=powf op1=be800000 op2=c0400000 result=c2800000 errno=0 func=powf op1=be800000 op2=c0800000 result=43800000 errno=0 func=powf op1=be800000 op2=ff800000 result=7f800000 errno=0 func=powf op1=bf800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=bf800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=bf800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=bf800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=bf800000 op2=7f800000 result=3f800000 errno=0 func=powf op1=bf800000 op2=40800000 result=3f800000 errno=0 func=powf op1=bf800000 op2=40400000 result=bf800000 errno=0 func=powf op1=bf800000 op2=3f000000 result=7fc00001 errno=EDOM status=i func=powf op1=bf800000 op2=00000000 result=3f800000 errno=0 func=powf op1=bf800000 op2=80000000 result=3f800000 errno=0 func=powf op1=bf800000 op2=bf000000 result=7fc00001 errno=EDOM status=i func=powf op1=bf800000 op2=c0400000 result=bf800000 errno=0 func=powf op1=bf800000 op2=c0800000 result=3f800000 errno=0 func=powf op1=bf800000 op2=ff800000 result=3f800000 errno=0 func=powf op1=c0800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=c0800000 op2=ff800001 result=7fc00001 errno=0 
status=i func=powf op1=c0800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=c0800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=c0800000 op2=7f800000 result=7f800000 errno=0 func=powf op1=c0800000 op2=40800000 result=43800000 errno=0 func=powf op1=c0800000 op2=40400000 result=c2800000 errno=0 func=powf op1=c0800000 op2=3f000000 result=7fc00001 errno=EDOM status=i func=powf op1=c0800000 op2=00000000 result=3f800000 errno=0 func=powf op1=c0800000 op2=80000000 result=3f800000 errno=0 func=powf op1=c0800000 op2=bf000000 result=7fc00001 errno=EDOM status=i func=powf op1=c0800000 op2=c0400000 result=bc800000 errno=0 func=powf op1=c0800000 op2=c0800000 result=3b800000 errno=0 func=powf op1=c0800000 op2=ff800000 result=00000000 errno=0 func=powf op1=ff800000 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=ff800000 op2=ff800001 result=7fc00001 errno=0 status=i func=powf op1=ff800000 op2=7fc00001 result=7fc00001 errno=0 func=powf op1=ff800000 op2=ffc00001 result=7fc00001 errno=0 func=powf op1=ff800000 op2=7f800000 result=7f800000 errno=0 func=powf op1=ff800000 op2=40800000 result=7f800000 errno=0 func=powf op1=ff800000 op2=40400000 result=ff800000 errno=0 func=powf op1=ff800000 op2=3f000000 result=7f800000 errno=0 func=powf op1=ff800000 op2=00000000 result=3f800000 errno=0 func=powf op1=ff800000 op2=80000000 result=3f800000 errno=0 func=powf op1=ff800000 op2=bf000000 result=00000000 errno=0 func=powf op1=ff800000 op2=c0400000 result=80000000 errno=0 func=powf op1=ff800000 op2=c0800000 result=00000000 errno=0 func=powf op1=ff800000 op2=ff800000 result=00000000 errno=0 func=powf op1=36c27f9d op2=4109fa51 result=00000000 errno=ERANGE status=ux func=powf op1=351738cd op2=c0c55691 result=7f800000 errno=ERANGE status=ox func=powf op1=42836035 op2=41a99f40 result=7f800000 errno=ERANGE status=ox func=powf op1=32bd53f3 op2=40bcba58 result=00000000 errno=ERANGE status=ux func=powf op1=32dc5bff op2=40be62ea result=00000000 errno=ERANGE status=ux func=powf 
op1=3a8a3f66 op2=4172bd43 result=00000000 errno=ERANGE status=ux func=powf op1=28f0e770 op2=c035b4ca result=7f800000 errno=ERANGE status=ox func=powf op1=40886699 op2=c28f703a result=00000000 errno=ERANGE status=ux func=powf op1=414bd593 op2=c22370cf result=00000000 errno=ERANGE status=ux func=powf op1=3a2f1163 op2=c1422d45 result=7f800000 errno=ERANGE status=ox func=powf op1=434f5cf3 op2=41851272 result=7f800000 errno=ERANGE status=ox func=powf op1=2e0e27a4 op2=c06b13f5 result=7f800000 errno=ERANGE status=ox func=powf op1=39aef7a6 op2=414fd60a result=00000000 errno=ERANGE status=ux func=powf op1=21c80729 op2=c00a04ab result=7f800000 errno=ERANGE status=ox func=powf op1=42455a4b op2=c1d55905 result=00000000 errno=ERANGE status=ux func=powf op1=2d173e0b op2=c05ee797 result=7f800000 errno=ERANGE status=ox func=powf op1=452edf9a op2=4132dd7f result=7f800000 errno=ERANGE status=ox func=powf op1=406bf67b op2=c29f5f12 result=00000000 errno=ERANGE status=ux func=powf op1=2d82a6fc op2=4085779e result=00000000 errno=ERANGE status=ux func=powf op1=4551f827 op2=41304516 result=7f800000 errno=ERANGE status=ox func=powf op1=3a917c51 op2=41726c0a result=00000001.37f errno=0 status=ux ; iso c allows both errno=ERANGE and errno=0 ;func=powf op1=3b19bbaa op2=4188e6fb result=00000000.b5f errno=0 status=ux ;func=powf op1=4088bd18 op2=c28ef056 result=00000000.986 errno=0 status=ux func=powf op1=3f7ffd76 op2=4a09221e result=00aa9d24.3ad error=0 func=powf op1=007fffff op2=bf000001 result=5f00002c.2b2 error=0 func=powf op1=000007ff op2=bf000001 result=62000830.96f error=0 func=powf op1=007fffff op2=80800001 result=3f800000.000 error=0 func=powf op1=00000000 op2=800007ff result=7f800000 errno=ERANGE status=z func=powf op1=00000000 op2=000007ff result=00000000 error=0 func=powf op1=bf800000 op2=ff7fffff result=3f800000 error=0 func=powf op1=2e4e4f30 op2=406b0dc2 result=007e9c59.eb4 errno=0 status=u ; SDCOMP-25549: ensure the biggest overflow case possible is not ; mishandled. 
Also check the analogous underflow, and also ensure that ; our massive-overflow checks do not affect numbers _just within_ the ; range. func=powf op1=7f7fffff op2=7f7fffff result=7f800000 error=overflow func=powf op1=7f7fffff op2=ff7fffff result=00000000 error=underflow func=powf op1=54cb3000 op2=403fffff result=7f7fffb2.a95 error=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst index 4b33d2291c66..cddb346558ea 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst @@ -1,51 +1,51 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 func=sincosf_sinf op1=ffc00001 result=7fc00001 errno=0 func=sincosf_sinf op1=7f800001 result=7fc00001 errno=0 status=i func=sincosf_sinf op1=ff800001 result=7fc00001 errno=0 status=i func=sincosf_sinf op1=7f800000 result=7fc00001 errno=EDOM status=i func=sincosf_sinf op1=ff800000 result=7fc00001 errno=EDOM status=i func=sincosf_sinf op1=00000000 result=00000000 errno=0 func=sincosf_sinf op1=80000000 result=80000000 errno=0 func=sincosf_sinf op1=c70d39a1 result=be37fad5.7ed errno=0 func=sincosf_sinf op1=46427f1b result=3f352d80.f9b error=0 func=sincosf_sinf op1=4647e568 result=3f352da9.7be error=0 func=sincosf_sinf op1=46428bac result=bf352dea.924 error=0 func=sincosf_sinf op1=4647f1f9 result=bf352e13.146 error=0 func=sincosf_sinf op1=4647fe8a result=3f352e7c.ac9 error=0 func=sincosf_sinf op1=45d8d7f1 result=3f35097b.cb0 error=0 func=sincosf_sinf op1=45d371a4 result=bf350990.102 error=0 func=sincosf_sinf op1=45ce0b57 result=3f3509a4.554 error=0 func=sincosf_sinf op1=45d35882 result=3f3509f9.bdb error=0 func=sincosf_sinf op1=45cdf235 
result=bf350a0e.02c error=0 func=sincosf_cosf op1=7fc00001 result=7fc00001 errno=0 func=sincosf_cosf op1=ffc00001 result=7fc00001 errno=0 func=sincosf_cosf op1=7f800001 result=7fc00001 errno=0 status=i func=sincosf_cosf op1=ff800001 result=7fc00001 errno=0 status=i func=sincosf_cosf op1=7f800000 result=7fc00001 errno=EDOM status=i func=sincosf_cosf op1=ff800000 result=7fc00001 errno=EDOM status=i func=sincosf_cosf op1=00000000 result=3f800000 errno=0 func=sincosf_cosf op1=80000000 result=3f800000 errno=0 func=sincosf_cosf op1=46427f1b result=3f34dc5c.565 error=0 func=sincosf_cosf op1=4647e568 result=3f34dc33.c1f error=0 func=sincosf_cosf op1=46428bac result=bf34dbf2.8e3 error=0 func=sincosf_cosf op1=4647f1f9 result=bf34dbc9.f9b error=0 func=sincosf_cosf op1=4647fe8a result=3f34db60.313 error=0 func=sincosf_cosf op1=45d8d7f1 result=bf35006a.7fd error=0 func=sincosf_cosf op1=45d371a4 result=3f350056.39b error=0 func=sincosf_cosf op1=45ce0b57 result=bf350041.f38 error=0 func=sincosf_cosf op1=45d35882 result=bf34ffec.868 error=0 func=sincosf_cosf op1=45cdf235 result=3f34ffd8.404 error=0 ; no underflow func=sincosf_sinf op1=17800000 result=17800000.000 func=sincosf_cosf op1=17800000 result=3f800000.000 ; underflow func=sincosf_sinf op1=00400000 result=00400000.000 status=ux func=sincosf_cosf op1=00400000 result=3f800000.000 status=ux diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst index ded80b1598c6..041b13d5d6cb 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst @@ -1,28 +1,28 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinf op1=7fc00001 result=7fc00001 errno=0 func=sinf op1=ffc00001 result=7fc00001 errno=0 func=sinf op1=7f800001 result=7fc00001 errno=0 status=i func=sinf op1=ff800001 result=7fc00001 errno=0 status=i func=sinf op1=7f800000 result=7fc00001 errno=EDOM status=i func=sinf op1=ff800000 result=7fc00001 errno=EDOM status=i func=sinf op1=00000000 result=00000000 errno=0 func=sinf op1=80000000 result=80000000 errno=0 ; Directed test for a failure I found while developing mathbench func=sinf op1=c70d39a1 result=be37fad5.7ed errno=0 ; SDCOMP-26094: check sinf in the cases for which the range reducer ; returns values furthest beyond its nominal upper bound of pi/4. func=sinf op1=46427f1b result=3f352d80.f9b error=0 func=sinf op1=4647e568 result=3f352da9.7be error=0 func=sinf op1=46428bac result=bf352dea.924 error=0 func=sinf op1=4647f1f9 result=bf352e13.146 error=0 func=sinf op1=4647fe8a result=3f352e7c.ac9 error=0 func=sinf op1=45d8d7f1 result=3f35097b.cb0 error=0 func=sinf op1=45d371a4 result=bf350990.102 error=0 func=sinf op1=45ce0b57 result=3f3509a4.554 error=0 func=sinf op1=45d35882 result=3f3509f9.bdb error=0 func=sinf op1=45cdf235 result=bf350a0e.02c error=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/random/double.tst b/contrib/arm-optimized-routines/math/test/testcases/random/double.tst index c24ff80d5d95..8e885d61722a 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/random/double.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/random/double.tst @@ -1,10 +1,10 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test exp 10000 test exp2 10000 test log 10000 test log2 10000 test pow 40000 diff --git a/contrib/arm-optimized-routines/math/test/testcases/random/float.tst b/contrib/arm-optimized-routines/math/test/testcases/random/float.tst index d02a22750abe..ea4a5a015214 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/random/float.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/random/float.tst @@ -1,15 +1,15 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test sinf 10000 test cosf 10000 test sincosf_sinf 5000 test sincosf_cosf 5000 test tanf 10000 test expf 10000 test exp2f 10000 test logf 10000 test log2f 10000 test powf 10000 diff --git a/contrib/arm-optimized-routines/math/test/ulp.c b/contrib/arm-optimized-routines/math/test/ulp.c index 51479b87a0fd..bb8c3ad69900 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.c +++ b/contrib/arm-optimized-routines/math/test/ulp.c @@ -1,853 +1,855 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include #include #include #include "mathlib.h" /* Don't depend on mpfr by default. */ #ifndef USE_MPFR # define USE_MPFR 0 #endif #if USE_MPFR # include #endif #ifndef WANT_VMATH /* Enable the build of vector math code. 
*/ # define WANT_VMATH 1 #endif static inline uint64_t asuint64 (double f) { union { double f; uint64_t i; } u = {f}; return u.i; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } static inline uint32_t asuint (float f) { union { float f; uint32_t i; } u = {f}; return u.i; } static inline float asfloat (uint32_t i) { union { uint32_t i; float f; } u = {i}; return u.f; } static uint64_t seed = 0x0123456789abcdef; static uint64_t rand64 (void) { seed = 6364136223846793005ull * seed + 1; return seed ^ (seed >> 32); } /* Uniform random in [0,n]. */ static uint64_t randn (uint64_t n) { uint64_t r, m; if (n == 0) return 0; n++; if (n == 0) return rand64 (); for (;;) { r = rand64 (); m = r % n; if (r - m <= -n) return m; } } struct gen { uint64_t start; uint64_t len; uint64_t start2; uint64_t len2; uint64_t off; uint64_t step; uint64_t cnt; }; struct args_f1 { float x; }; struct args_f2 { float x; float x2; }; struct args_d1 { double x; }; struct args_d2 { double x; double x2; }; /* result = y + tail*2^ulpexp. */ struct ret_f { float y; double tail; int ulpexp; int ex; int ex_may; }; struct ret_d { double y; double tail; int ulpexp; int ex; int ex_may; }; static inline uint64_t next1 (struct gen *g) { /* For single argument use randomized incremental steps, that produce dense sampling without collisions and allow testing all inputs in a range. */ uint64_t r = g->start + g->off; g->off += g->step + randn (g->step / 2); if (g->off > g->len) g->off -= g->len; /* hack. */ return r; } static inline uint64_t next2 (uint64_t *x2, struct gen *g) { /* For two arguments use uniform random sampling. 
*/ uint64_t r = g->start + randn (g->len); *x2 = g->start2 + randn (g->len2); return r; } static struct args_f1 next_f1 (void *g) { return (struct args_f1){asfloat (next1 (g))}; } static struct args_f2 next_f2 (void *g) { uint64_t x2; uint64_t x = next2 (&x2, g); return (struct args_f2){asfloat (x), asfloat (x2)}; } static struct args_d1 next_d1 (void *g) { return (struct args_d1){asdouble (next1 (g))}; } static struct args_d2 next_d2 (void *g) { uint64_t x2; uint64_t x = next2 (&x2, g); return (struct args_d2){asdouble (x), asdouble (x2)}; } struct conf { int r; int rc; int quiet; int mpfr; int fenv; unsigned long long n; double softlim; double errlim; }; -/* Wrappers for sincos. */ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ #if __aarch64__ && WANT_VMATH typedef __f32x4_t v_float; typedef __f64x2_t v_double; -static const float fv[2] = {1.0f, -INFINITY}; -static const double dv[2] = {1.0, -INFINITY}; +/* First element of fv and dv may be changed by -c argument. 
*/ +static float fv[2] = {1.0f, -INFINITY}; +static double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } +#if WANT_SVE_MATH +#include +typedef __SVFloat32_t sv_float; +typedef __SVFloat64_t sv_double; + +static inline sv_float svargf(float x) { + int n = svcntw(); + float base[n]; + for (int i=0; i> 23 & 0xff; if (!e) e++; return e - 0x7f - 23; } static inline int ulpscale_d (double x) { int e = asuint64 (x) >> 52 & 0x7ff; if (!e) e++; return e - 0x3ff - 52; } static inline float call_f1 (const struct fun *f, struct args_f1 a) { return f->fun.f1 (a.x); } static inline float call_f2 (const struct fun *f, struct args_f2 a) { return f->fun.f2 (a.x, a.x2); } static inline double call_d1 (const struct fun *f, struct args_d1 a) { return f->fun.d1 (a.x); } static inline double call_d2 (const struct fun *f, struct args_d2 a) { return f->fun.d2 (a.x, a.x2); } static inline double call_long_f1 (const struct fun *f, struct args_f1 a) { return f->fun_long.f1 (a.x); } static inline double call_long_f2 (const struct fun *f, struct args_f2 a) { return f->fun_long.f2 (a.x, a.x2); } static inline long double call_long_d1 (const struct fun *f, struct args_d1 a) { return f->fun_long.d1 (a.x); } static inline long double call_long_d2 (const struct fun *f, struct args_d2 a) { return f->fun_long.d2 (a.x, a.x2); } static inline void printcall_f1 (const struct fun *f, struct args_f1 a) { printf ("%s(%a)", f->name, a.x); } static inline void printcall_f2 (const struct fun *f, struct args_f2 a) { printf ("%s(%a, %a)", f->name, a.x, a.x2); } static inline void printcall_d1 (const struct fun *f, struct args_d1 a) { printf ("%s(%a)", f->name, a.x); } static inline void printcall_d2 (const struct fun *f, struct args_d2 a) { printf ("%s(%a, %a)", f->name, a.x, a.x2); } static inline void printgen_f1 (const struct fun *f, struct gen *gen) { printf ("%s in 
[%a;%a]", f->name, asfloat (gen->start), asfloat (gen->start + gen->len)); } static inline void printgen_f2 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a] x [%a;%a]", f->name, asfloat (gen->start), asfloat (gen->start + gen->len), asfloat (gen->start2), asfloat (gen->start2 + gen->len2)); } static inline void printgen_d1 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a]", f->name, asdouble (gen->start), asdouble (gen->start + gen->len)); } static inline void printgen_d2 (const struct fun *f, struct gen *gen) { printf ("%s in [%a;%a] x [%a;%a]", f->name, asdouble (gen->start), asdouble (gen->start + gen->len), asdouble (gen->start2), asdouble (gen->start2 + gen->len2)); } #define reduce_f1(a, f, op) (f (a.x)) #define reduce_f2(a, f, op) (f (a.x) op f (a.x2)) #define reduce_d1(a, f, op) (f (a.x)) #define reduce_d2(a, f, op) (f (a.x) op f (a.x2)) #ifndef IEEE_754_2008_SNAN # define IEEE_754_2008_SNAN 1 #endif static inline int issignaling_f (float x) { uint32_t ix = asuint (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7fc00000) == 0x7fc00000; return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; } static inline int issignaling_d (double x) { uint64_t ix = asuint64 (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } #if USE_MPFR static mpfr_rnd_t rmap (int r) { switch (r) { case FE_TONEAREST: return MPFR_RNDN; case FE_TOWARDZERO: return MPFR_RNDZ; case FE_UPWARD: return MPFR_RNDU; case FE_DOWNWARD: return MPFR_RNDD; } return -1; } #define prec_mpfr_f 50 #define prec_mpfr_d 80 #define prec_f 24 #define prec_d 53 #define emin_f -148 #define emin_d -1073 #define emax_f 128 #define emax_d 1024 static inline int call_mpfr_f1 (mpfr_t y, const struct fun *f, struct args_f1 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_f); mpfr_set_flt (x, a.x, MPFR_RNDN); return f->fun_mpfr.f1 (y, x, r); } static inline int call_mpfr_f2 (mpfr_t y, const struct fun *f, 
struct args_f2 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_f); MPFR_DECL_INIT (x2, prec_f); mpfr_set_flt (x, a.x, MPFR_RNDN); mpfr_set_flt (x2, a.x2, MPFR_RNDN); return f->fun_mpfr.f2 (y, x, x2, r); } static inline int call_mpfr_d1 (mpfr_t y, const struct fun *f, struct args_d1 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_d); mpfr_set_d (x, a.x, MPFR_RNDN); return f->fun_mpfr.d1 (y, x, r); } static inline int call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r) { MPFR_DECL_INIT (x, prec_d); MPFR_DECL_INIT (x2, prec_d); mpfr_set_d (x, a.x, MPFR_RNDN); mpfr_set_d (x2, a.x2, MPFR_RNDN); return f->fun_mpfr.d2 (y, x, x2, r); } #endif #define float_f float #define double_f double #define copysign_f copysignf #define nextafter_f nextafterf #define fabs_f fabsf #define asuint_f asuint #define asfloat_f asfloat #define scalbn_f scalbnf #define lscalbn_f scalbn #define halfinf_f 0x1p127f #define min_normal_f 0x1p-126f #define float_d double #define double_d long double #define copysign_d copysign #define nextafter_d nextafter #define fabs_d fabs #define asuint_d asuint64 #define asfloat_d asdouble #define scalbn_d scalbn #define lscalbn_d scalbnl #define halfinf_d 0x1p1023 #define min_normal_d 0x1p-1022 #define NEW_RT #define RT(x) x##_f #define T(x) x##_f1 #include "ulp.h" #undef T #define T(x) x##_f2 #include "ulp.h" #undef T #undef RT #define NEW_RT #define RT(x) x##_d #define T(x) x##_d1 #include "ulp.h" #undef T #define T(x) x##_d2 #include "ulp.h" #undef T #undef RT static void usage (void) { puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func " "lo [hi [x lo2 hi2] [count]]"); puts ("Compares func against a higher precision implementation in [lo; hi]."); puts ("-q: quiet."); puts ("-m: use mpfr even if faster method is available."); puts ("-f: disable fenv testing (rounding modes and exceptions)."); +#if __aarch64__ && WANT_VMATH + puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. 
\n" + " This should be different from tested input in other lanes, and non-special \n" + " (i.e. should not trigger fenv exceptions). Default is 1."); +#endif puts ("Supported func:"); for (const struct fun *f = fun; f->name; f++) printf ("\t%s\n", f->name); exit (1); } static int cmp (const struct fun *f, struct gen *gen, const struct conf *conf) { int r = 1; if (f->arity == 1 && f->singleprec) r = cmp_f1 (f, gen, conf); else if (f->arity == 2 && f->singleprec) r = cmp_f2 (f, gen, conf); else if (f->arity == 1 && !f->singleprec) r = cmp_d1 (f, gen, conf); else if (f->arity == 2 && !f->singleprec) r = cmp_d2 (f, gen, conf); else usage (); return r; } static uint64_t getnum (const char *s, int singleprec) { // int i; uint64_t sign = 0; // char buf[12]; if (s[0] == '+') s++; else if (s[0] == '-') { sign = singleprec ? 1ULL << 31 : 1ULL << 63; s++; } /* 0xXXXX is treated as bit representation, '-' flips the sign bit. */ if (s[0] == '0' && tolower (s[1]) == 'x' && strchr (s, 'p') == 0) return sign ^ strtoull (s, 0, 0); // /* SNaN, QNaN, NaN, Inf. */ // for (i=0; s[i] && i < sizeof buf; i++) // buf[i] = tolower(s[i]); // buf[i] = 0; // if (strcmp(buf, "snan") == 0) // return sign | (singleprec ? 0x7fa00000 : 0x7ff4000000000000); // if (strcmp(buf, "qnan") == 0 || strcmp(buf, "nan") == 0) // return sign | (singleprec ? 0x7fc00000 : 0x7ff8000000000000); // if (strcmp(buf, "inf") == 0 || strcmp(buf, "infinity") == 0) // return sign | (singleprec ? 0x7f800000 : 0x7ff0000000000000); /* Otherwise assume it's a floating-point literal. */ return sign | (singleprec ? 
asuint (strtof (s, 0)) : asuint64 (strtod (s, 0))); } static void parsegen (struct gen *g, int argc, char *argv[], const struct fun *f) { int singleprec = f->singleprec; int arity = f->arity; uint64_t a, b, a2, b2, n; if (argc < 1) usage (); b = a = getnum (argv[0], singleprec); n = 0; if (argc > 1 && strcmp (argv[1], "x") == 0) { argc -= 2; argv += 2; } else if (argc > 1) { b = getnum (argv[1], singleprec); if (argc > 2 && strcmp (argv[2], "x") == 0) { argc -= 3; argv += 3; } } b2 = a2 = getnum (argv[0], singleprec); if (argc > 1) b2 = getnum (argv[1], singleprec); if (argc > 2) n = strtoull (argv[2], 0, 0); if (argc > 3) usage (); //printf("ab %lx %lx ab2 %lx %lx n %lu\n", a, b, a2, b2, n); if (arity == 1) { g->start = a; g->len = b - a; if (n - 1 > b - a) n = b - a + 1; g->off = 0; g->step = n ? (g->len + 1) / n : 1; g->start2 = g->len2 = 0; g->cnt = n; } else if (arity == 2) { g->start = a; g->len = b - a; g->off = g->step = 0; g->start2 = a2; g->len2 = b2 - a2; g->cnt = n; } else usage (); } int main (int argc, char *argv[]) { const struct fun *f; struct gen gen; struct conf conf; conf.rc = 'n'; conf.quiet = 0; conf.mpfr = 0; conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; for (;;) { argc--; argv++; if (argc < 1) usage (); if (argv[0][0] != '-') break; switch (argv[0][1]) { case 'e': argc--; argv++; if (argc < 1) usage (); conf.errlim = strtod (argv[0], 0); break; case 'f': conf.fenv = 0; break; case 'l': argc--; argv++; if (argc < 1) usage (); conf.softlim = strtod (argv[0], 0); break; case 'm': conf.mpfr = 1; break; case 'q': conf.quiet = 1; break; case 'r': conf.rc = argv[0][2]; if (!conf.rc) { argc--; argv++; if (argc < 1) usage (); conf.rc = argv[0][0]; } break; +#if __aarch64__ && WANT_VMATH + case 'c': + argc--; + argv++; + fv[0] = strtof(argv[0], 0); + dv[0] = strtod(argv[0], 0); + break; +#endif default: usage (); } } switch (conf.rc) { case 'n': conf.r = FE_TONEAREST; break; case 'u': conf.r = FE_UPWARD; break; case 'd': conf.r = 
FE_DOWNWARD; break; case 'z': conf.r = FE_TOWARDZERO; break; default: usage (); } for (f = fun; f->name; f++) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) usage (); if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) { puts ("mpfr is not available."); return 0; } argc--; argv++; parsegen (&gen, argc, argv, f); conf.n = gen.cnt; return cmp (f, &gen, &conf); } diff --git a/contrib/arm-optimized-routines/math/test/ulp.h b/contrib/arm-optimized-routines/math/test/ulp.h index a0c301664321..327b4bd0fd06 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.h +++ b/contrib/arm-optimized-routines/math/test/ulp.h @@ -1,362 +1,362 @@ /* * Generic functions for ULP error estimation. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* For each different math function type, T(x) should add a different suffix to x. RT(x) should add a return type specific suffix to x. */ #ifdef NEW_RT #undef NEW_RT # if USE_MPFR static int RT(ulpscale_mpfr) (mpfr_t x, int t) { /* TODO: pow of 2 cases. */ if (mpfr_regular_p (x)) { mpfr_exp_t e = mpfr_get_exp (x) - RT(prec); if (e < RT(emin)) e = RT(emin) - 1; if (e > RT(emax) - RT(prec)) e = RT(emax) - RT(prec); return e; } if (mpfr_zero_p (x)) return RT(emin) - 1; if (mpfr_inf_p (x)) return RT(emax) - RT(prec); /* NaN. */ return 0; } # endif /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. */ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) { RT(float) want = p->y; RT(float) d; double e; if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; if (signbit (got) != signbit (want)) /* May have false positives with NaN. */ //return isnan(got) && isnan(want) ? 
0 : INFINITY; return INFINITY; if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) return INFINITY; if (isnan (want)) return 0; if (isinf (got)) { got = RT(copysign) (RT(halfinf), got); want *= 0.5f; } if (isinf (want)) { want = RT(copysign) (RT(halfinf), want); got *= 0.5f; } } if (r == FE_TONEAREST) { // TODO: incorrect when got vs want cross a powof2 boundary /* error = got > want ? got - want - tail ulp - 0.5 ulp : got - want - tail ulp + 0.5 ulp; */ d = got - want; e = d > 0 ? -p->tail - 0.5 : -p->tail + 0.5; } else { if ((r == FE_DOWNWARD && got < want) || (r == FE_UPWARD && got > want) || (r == FE_TOWARDZERO && fabs (got) < fabs (want))) got = RT(nextafter) (got, want); d = got - want; e = -p->tail; } return RT(scalbn) (d, -p->ulpexp) + e; } static int RT(isok) (RT(float) ygot, int exgot, RT(float) ywant, int exwant, int exmay) { return RT(asuint) (ygot) == RT(asuint) (ywant) && ((exgot ^ exwant) & ~exmay) == 0; } static int RT(isok_nofenv) (RT(float) ygot, RT(float) ywant) { return RT(asuint) (ygot) == RT(asuint) (ywant); } #endif static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { if (r != FE_TONEAREST) fesetround (r); feclearexcept (FE_ALL_EXCEPT); *y = T(call) (f, a); *ex = fetestexcept (FE_ALL_EXCEPT); if (r != FE_TONEAREST) fesetround (FE_TONEAREST); } static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { *y = T(call) (f, a); *ex = 0; } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { if (r != FE_TONEAREST) fesetround (r); feclearexcept (FE_ALL_EXCEPT); volatile struct T(args) va = a; // TODO: barrier a = va; RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; volatile RT(float) vy = p->y; // TODO: barrier (void) vy; p->ex = fetestexcept (FE_ALL_EXCEPT); if (r != FE_TONEAREST) fesetround (FE_TONEAREST); p->ex_may = FE_INEXACT; if 
(RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may)) return 1; p->ulpexp = RT(ulpscale) (p->y); if (isinf (p->y)) p->tail = RT(lscalbn) (yl - (RT(double)) 2 * RT(halfinf), -p->ulpexp); else p->tail = RT(lscalbn) (yl - p->y, -p->ulpexp); if (RT(fabs) (p->y) < RT(min_normal)) { /* TODO: subnormal result is treated as undeflow even if it's exact since call_long may not raise inexact correctly. */ if (p->y != 0 || (p->ex & FE_INEXACT)) p->ex |= FE_UNDERFLOW | FE_INEXACT; } return 0; } static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); if (isinf (p->y)) p->tail = RT(lscalbn) (yl - (RT(double)) 2 * RT(halfinf), -p->ulpexp); else p->tail = RT(lscalbn) (yl - p->y, -p->ulpexp); return 0; } /* There are nan input args and all quiet. */ static inline int T(qnanpropagation) (struct T(args) a) { return T(reduce) (a, isnan, ||) && !T(reduce) (a, RT(issignaling), ||); } static inline RT(float) T(sum) (struct T(args) a) { return T(reduce) (a, , +); } /* returns 1 if the got result is ok. */ static inline int T(call_mpfr_fix) (const struct fun *f, struct T(args) a, int r_fenv, struct RT(ret) * p, RT(float) ygot, int exgot) { #if USE_MPFR int t, t2; mpfr_rnd_t r = rmap (r_fenv); MPFR_DECL_INIT(my, RT(prec_mpfr)); MPFR_DECL_INIT(mr, RT(prec)); MPFR_DECL_INIT(me, RT(prec_mpfr)); mpfr_clear_flags (); t = T(call_mpfr) (my, f, a, r); /* Double rounding. */ t2 = mpfr_set (mr, my, r); if (t2) t = t2; mpfr_set_emin (RT(emin)); mpfr_set_emax (RT(emax)); t = mpfr_check_range (mr, t, r); t = mpfr_subnormalize (mr, t, r); mpfr_set_emax (MPFR_EMAX_DEFAULT); mpfr_set_emin (MPFR_EMIN_DEFAULT); p->y = mpfr_get_d (mr, r); p->ex = t ? FE_INEXACT : 0; p->ex_may = FE_INEXACT; if (mpfr_underflow_p () && (p->ex & FE_INEXACT)) /* TODO: handle before and after rounding uflow cases. 
*/ p->ex |= FE_UNDERFLOW; if (mpfr_overflow_p ()) p->ex |= FE_OVERFLOW | FE_INEXACT; if (mpfr_divby0_p ()) p->ex |= FE_DIVBYZERO; //if (mpfr_erangeflag_p ()) // p->ex |= FE_INVALID; if (!mpfr_nanflag_p () && RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may)) return 1; if (mpfr_nanflag_p () && !T(qnanpropagation) (a)) p->ex |= FE_INVALID; p->ulpexp = RT(ulpscale_mpfr) (my, t); if (!isfinite (p->y)) { p->tail = 0; if (isnan (p->y)) { /* If an input was nan keep its sign. */ p->y = T(sum) (a); if (!isnan (p->y)) p->y = (p->y - p->y) / (p->y - p->y); return RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may); } mpfr_set_si_2exp (mr, signbit (p->y) ? -1 : 1, 1024, MPFR_RNDN); if (mpfr_cmpabs (my, mr) >= 0) return RT(isok) (ygot, exgot, p->y, p->ex, p->ex_may); } mpfr_sub (me, my, mr, MPFR_RNDN); mpfr_mul_2si (me, me, -p->ulpexp, MPFR_RNDN); p->tail = mpfr_get_d (me, MPFR_RNDN); return 0; #else abort (); #endif } static int T(cmp) (const struct fun *f, struct gen *gen, const struct conf *conf) { double maxerr = 0; uint64_t cnt = 0; uint64_t cnt1 = 0; uint64_t cnt2 = 0; uint64_t cntfail = 0; int r = conf->r; int use_mpfr = conf->mpfr; int fenv = conf->fenv; for (;;) { struct RT(ret) want; struct T(args) a = T(next) (gen); int exgot; int exgot2; RT(float) ygot; RT(float) ygot2; int fail = 0; if (fenv) T(call_fenv) (f, a, r, &ygot, &exgot); else T(call_nofenv) (f, a, r, &ygot, &exgot); if (f->twice) { secondcall = 1; if (fenv) T(call_fenv) (f, a, r, &ygot2, &exgot2); else T(call_nofenv) (f, a, r, &ygot2, &exgot2); secondcall = 0; if (RT(asuint) (ygot) != RT(asuint) (ygot2)) { fail = 1; cntfail++; T(printcall) (f, a); printf (" got %a then %a for same input\n", ygot, ygot2); } } cnt++; int ok = use_mpfr ? T(call_mpfr_fix) (f, a, r, &want, ygot, exgot) : (fenv ? 
T(call_long_fenv) (f, a, r, &want, ygot, exgot) : T(call_long_nofenv) (f, a, r, &want, ygot, exgot)); if (!ok) { int print = 0; double err = RT(ulperr) (ygot, &want, r); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) cnt1++; if (abserr > 1) cnt2++; if (abserr > conf->errlim) { print = 1; if (!fail) { fail = 1; cntfail++; } } if (abserr > maxerr) { maxerr = abserr; if (!conf->quiet && abserr > conf->softlim) print = 1; } if (print) { T(printcall) (f, a); // TODO: inf ulp handling printf (" got %a want %a %+g ulp err %g\n", ygot, want.y, want.tail, err); } int diff = fenv ? exgot ^ want.ex : 0; if (fenv && (diff & ~want.ex_may)) { if (!fail) { fail = 1; cntfail++; } T(printcall) (f, a); printf (" is %a %+g ulp, got except 0x%0x", want.y, want.tail, exgot); if (diff & exgot) printf (" wrongly set: 0x%x", diff & exgot); if (diff & ~exgot) printf (" wrongly clear: 0x%x", diff & ~exgot); putchar ('\n'); } } if (cnt >= conf->n) break; if (!conf->quiet && cnt % 0x100000 == 0) printf ("progress: %6.3f%% cnt %llu cnt1 %llu cnt2 %llu cntfail %llu " "maxerr %g\n", 100.0 * cnt / conf->n, (unsigned long long) cnt, (unsigned long long) cnt1, (unsigned long long) cnt2, (unsigned long long) cntfail, maxerr); } double cc = cnt; if (cntfail) printf ("FAIL "); else printf ("PASS "); T(printgen) (f, gen); printf (" round %c errlim %g maxerr %g %s cnt %llu cnt1 %llu %g%% cnt2 %llu " "%g%% cntfail %llu %g%%\n", conf->rc, conf->errlim, maxerr, conf->r == FE_TONEAREST ? 
"+0.5" : "+1.0", (unsigned long long) cnt, (unsigned long long) cnt1, 100.0 * cnt1 / cc, (unsigned long long) cnt2, 100.0 * cnt2 / cc, (unsigned long long) cntfail, 100.0 * cntfail / cc); return !!cntfail; } diff --git a/contrib/arm-optimized-routines/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/math/test/ulp_funcs.h new file mode 100644 index 000000000000..f5cea4d6d14c --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/ulp_funcs.h @@ -0,0 +1,78 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + F1 (sin) + F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) + F1 (exp) + F1 (exp2) + F1 (log) + F1 (log2) + F2 (pow) + F1 (erf) + D1 (exp) + D1 (exp2) + D1 (log) + D1 (log2) + D2 (pow) + D1 (erf) +#if WANT_VMATH + F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) + F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) + F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) + F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) + F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0) + F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0) + F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0) + F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) + F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0) + F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) + F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) + F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) + F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) +#if __aarch64__ + F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F 
(__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) + F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) + F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) +#ifdef __vpcs + F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) + F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) + F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) + F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) + F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) +#endif +#endif +#endif diff --git a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h new file mode 100644 index 000000000000..fd9e00c0310f --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h @@ -0,0 +1,59 @@ +/* + * Function 
wrappers for ulp. + * + * Copyright (c) 2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Wrappers for sincos. */ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + +/* Wrappers for vector functions. */ +#if __aarch64__ && WANT_VMATH +static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } +static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } +static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } +static float v_expf(float x) { return __v_expf(argf(x))[0]; } +static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } +static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } +static float v_logf(float x) { return __v_logf(argf(x))[0]; } +static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } +static double v_sin(double x) { return __v_sin(argd(x))[0]; } +static double v_cos(double x) { return __v_cos(argd(x))[0]; } +static double v_exp(double x) { return __v_exp(argd(x))[0]; } +static double v_log(double x) { return __v_log(argd(x))[0]; } +static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } +#ifdef __vpcs +static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } +static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } +static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } +static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } +static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } +static float 
vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } +static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } +static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } +static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } +static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } +static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } +static double vn_log(double x) { return __vn_log(argd(x))[0]; } +static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } +static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } +static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } +static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } +static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } +static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } +static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } +static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } +static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#endif +#endif diff --git a/contrib/arm-optimized-routines/math/tools/cos.sollya b/contrib/arm-optimized-routines/math/tools/cos.sollya index bd72d6b74820..6690adfcbb9b 100644 --- a/contrib/arm-optimized-routines/math/tools/cos.sollya +++ b/contrib/arm-optimized-routines/math/tools/cos.sollya @@ -1,31 +1,31 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 8; // polynomial degree a = -pi/4; // interval b = pi/4; // find even polynomial with minimal abs error compared to cos(x) f = cos(x); // return p that minimizes |f(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg/2 do { p = roundcoefficients(approx(poly,2*i), [|D ...|]); poly = poly + x^(2*i)*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/exp.sollya b/contrib/arm-optimized-routines/math/tools/exp.sollya index b7a462cda5a4..0668bdb5b3d3 100644 --- a/contrib/arm-optimized-routines/math/tools/exp.sollya +++ b/contrib/arm-optimized-routines/math/tools/exp.sollya @@ -1,35 +1,35 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; // poly degree N = 128; // table entries b = log(2)/(2*N); // interval b = b + b*0x1p-16; // increase interval for non-nearest rounding (TOINT_NARROW) a = -b; // find polynomial with minimal abs error // return p that minimizes |exp(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(exp(x)-poly(x), deg-d, [a;b], x^d, 1e-10); }; // first 2 coeffs are fixed, iteratively find optimal double prec coeffs poly = 1 + x; for i from 2 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/exp(x), [a;b], 30)); print("abs error:", accurateinfnorm(exp(x)-poly(x), [a;b], 30)); print("in [",a,b,"]"); // double interval error for non-nearest rounding print("rel2 error:", accurateinfnorm(1-poly(x)/exp(x), [2*a;2*b], 30)); print("abs2 error:", accurateinfnorm(exp(x)-poly(x), [2*a;2*b], 30)); print("in [",2*a,2*b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/exp2.sollya b/contrib/arm-optimized-routines/math/tools/exp2.sollya index e760769601d4..bd0a42d6bbcb 100644 --- a/contrib/arm-optimized-routines/math/tools/exp2.sollya +++ b/contrib/arm-optimized-routines/math/tools/exp2.sollya @@ -1,48 +1,48 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // exp2f parameters deg = 3; // poly degree N = 32; // table entries b = 1/(2*N); // interval a = -b; //// exp2 parameters //deg = 5; // poly degree //N = 128; // table entries //b = 1/(2*N); // interval //a = -b; // find polynomial with minimal relative error f = 2^x; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)| approx_abs = proc(poly,d) { return remez(f(x) - poly(x), deg-d, [a;b], x^d, 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); // p = roundcoefficients(approx_abs(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/2^x, [a;b], 30)); print("abs error:", accurateinfnorm(2^x-poly(x), [a;b], 30)); print("in [",a,b,"]"); // double interval error for non-nearest rounding: print("rel2 error:", accurateinfnorm(1-poly(x)/2^x, [2*a;2*b], 30)); print("abs2 error:", accurateinfnorm(2^x-poly(x), [2*a;2*b], 30)); print("in [",2*a,2*b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/log.sollya b/contrib/arm-optimized-routines/math/tools/log.sollya index 6df4db44b6f3..5288f5572925 100644 --- a/contrib/arm-optimized-routines/math/tools/log.sollya +++ b/contrib/arm-optimized-routines/math/tools/log.sollya @@ -1,35 +1,35 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval a = -0x1p-4; b = 0x1.09p-4; // find log(1+x)/x polynomial with minimal relative error // (minimal relative error polynomial for log(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/log2.sollya b/contrib/arm-optimized-routines/math/tools/log2.sollya index 4a364c0f111f..85811be5d90c 100644 --- a/contrib/arm-optimized-routines/math/tools/log2.sollya +++ b/contrib/arm-optimized-routines/math/tools/log2.sollya @@ -1,42 +1,42 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval a = -0x1.5b51p-5; b = 0x1.6ab2p-5; ln2 = evaluate(log(2),0); invln2hi = double(1/ln2 + 0x1p21) - 0x1p21; // round away last 21 bits invln2lo = double(1/ln2 - invln2hi); // find log2(1+x)/x polynomial with minimal relative error // (minimal relative error polynomial for log2(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; f = f/ln2; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = invln2hi + invln2lo; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("invln2hi:", invln2hi); print("invln2lo:", invln2lo); print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/log2_abs.sollya b/contrib/arm-optimized-routines/math/tools/log2_abs.sollya index 82c4dac26fa1..d018ba0145d2 100644 --- a/contrib/arm-optimized-routines/math/tools/log2_abs.sollya +++ b/contrib/arm-optimized-routines/math/tools/log2_abs.sollya @@ -1,41 +1,41 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries a= -0x1.f45p-8; b= 0x1.f45p-8; ln2 = evaluate(log(2),0); invln2hi = double(1/ln2 + 0x1p21) - 0x1p21; // round away last 21 bits invln2lo = double(1/ln2 - invln2hi); // find log2(1+x) polynomial with minimal absolute error f = log(1+x)/ln2; // return p that minimizes |f(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(f(x) - poly(x), deg-d, [a;b], x^d, 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = x*(invln2lo + invln2hi); for i from 2 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("invln2hi:", invln2hi); print("invln2lo:", invln2lo); print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); //// relative error computation fails if f(0)==0 //// g = f(x)/x = log2(1+x)/x; using taylor series //g = 0; //for i from 0 to 60 do { g = g + (-x)^i/(i+1)/ln2; }; //print("rel error:", accurateinfnorm(1-(poly(x)/x)/g(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/log_abs.sollya b/contrib/arm-optimized-routines/math/tools/log_abs.sollya index a2ac190fc497..5f9bfe41a683 100644 --- a/contrib/arm-optimized-routines/math/tools/log_abs.sollya +++ b/contrib/arm-optimized-routines/math/tools/log_abs.sollya @@ -1,35 +1,35 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries a = -0x1.fp-9; b = 0x1.fp-9; // find log(1+x) polynomial with minimal absolute error f = log(1+x); // return p that minimizes |f(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(f(x) - poly(x), deg-d, [a;b], x^d, 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = x; for i from 2 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30)); // relative error computation fails if f(0)==0 // g = f(x)/x = log(1+x)/x; using taylor series g = 0; for i from 0 to 60 do { g = g + (-x)^i/(i+1); }; print("rel error:", accurateinfnorm(1-poly(x)/x/g(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/plot.py b/contrib/arm-optimized-routines/math/tools/plot.py index 6c8b89ff284b..a0fa02322560 100755 --- a/contrib/arm-optimized-routines/math/tools/plot.py +++ b/contrib/arm-optimized-routines/math/tools/plot.py @@ -1,61 +1,61 @@ #!/usr/bin/python # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. 
-# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception import numpy as np import matplotlib.pyplot as plt import sys import re # example usage: # build/bin/ulp -e .0001 log 0.5 2.0 2345678 | math/tools/plot.py def fhex(s): return float.fromhex(s) def parse(f): xs = [] gs = [] ys = [] es = [] # Has to match the format used in ulp.c r = re.compile(r'[^ (]+\(([^ )]*)\) got ([^ ]+) want ([^ ]+) [^ ]+ ulp err ([^ ]+)') for line in f: m = r.match(line) if m: x = fhex(m.group(1)) g = fhex(m.group(2)) y = fhex(m.group(3)) e = float(m.group(4)) xs.append(x) gs.append(g) ys.append(y) es.append(e) elif line.startswith('PASS') or line.startswith('FAIL'): # Print the summary line print(line) return xs, gs, ys, es def plot(xs, gs, ys, es): if len(xs) < 2: print('not enough samples') return a = min(xs) b = max(xs) fig, (ax0,ax1) = plt.subplots(nrows=2) es = np.abs(es) # ignore the sign emax = max(es) ax0.text(a+(b-a)*0.7, emax*0.8, '%s\n%g'%(emax.hex(),emax)) ax0.plot(xs,es,'r.') ax0.grid() ax1.plot(xs,ys,'r.',label='want') ax1.plot(xs,gs,'b.',label='got') ax1.grid() ax1.legend() plt.show() xs, gs, ys, es = parse(sys.stdin) plot(xs, gs, ys, es) diff --git a/contrib/arm-optimized-routines/math/tools/remez.jl b/contrib/arm-optimized-routines/math/tools/remez.jl index 2ff436f5287f..1deab67d0660 100755 --- a/contrib/arm-optimized-routines/math/tools/remez.jl +++ b/contrib/arm-optimized-routines/math/tools/remez.jl @@ -1,1334 +1,1334 @@ #!/usr/bin/env julia # -*- julia -*- # remez.jl - implementation of the Remez algorithm for polynomial approximation # # Copyright (c) 2015-2019, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception import Base.\ # ---------------------------------------------------------------------- # Helper functions to cope with different Julia versions. 
if VERSION >= v"0.7.0" array1d(T, d) = Array{T, 1}(undef, d) array2d(T, d1, d2) = Array{T, 2}(undef, d1, d2) else array1d(T, d) = Array(T, d) array2d(T, d1, d2) = Array(T, d1, d2) end if VERSION < v"0.5.0" String = ASCIIString end if VERSION >= v"0.6.0" # Use Base.invokelatest to run functions made using eval(), to # avoid "world age" error run(f, x...) = Base.invokelatest(f, x...) else # Prior to 0.6.0, invokelatest doesn't exist (but fortunately the # world age problem also doesn't seem to exist) run(f, x...) = f(x...) end # ---------------------------------------------------------------------- # Global variables configured by command-line options. floatsuffix = "" # adjusted by --floatsuffix xvarname = "x" # adjusted by --variable epsbits = 256 # adjusted by --bits debug_facilities = Set() # adjusted by --debug full_output = false # adjusted by --full array_format = false # adjusted by --array preliminary_commands = array1d(String, 0) # adjusted by --pre # ---------------------------------------------------------------------- # Diagnostic and utility functions. # Enable debugging printouts from a particular subpart of this # program. # # Arguments: # facility Name of the facility to debug. For a list of facility names, # look through the code for calls to debug(). # # Return value is a BigFloat. function enable_debug(facility) push!(debug_facilities, facility) end # Print a diagnostic. # # Arguments: # facility Name of the facility for which this is a debug message. # printargs Arguments to println() if debugging of that facility is # enabled. macro debug(facility, printargs...) printit = quote print("[", $facility, "] ") end for arg in printargs printit = quote $printit print($(esc(arg))) end end return quote if $facility in debug_facilities $printit println() end end end # Evaluate a polynomial. # Arguments: # coeffs Array of BigFloats giving the coefficients of the polynomial. # Starts with the constant term, i.e. 
coeffs[i] is the # coefficient of x^(i-1) (because Julia arrays are 1-based). # x Point at which to evaluate the polynomial. # # Return value is a BigFloat. function poly_eval(coeffs::Array{BigFloat}, x::BigFloat) n = length(coeffs) if n == 0 return BigFloat(0) elseif n == 1 return coeffs[1] else return coeffs[1] + x * poly_eval(coeffs[2:n], x) end end # Evaluate a rational function. # Arguments: # ncoeffs Array of BigFloats giving the coefficients of the numerator. # Starts with the constant term, and 1-based, as above. # dcoeffs Array of BigFloats giving the coefficients of the denominator. # Starts with the constant term, and 1-based, as above. # x Point at which to evaluate the function. # # Return value is a BigFloat. function ratfn_eval(ncoeffs::Array{BigFloat}, dcoeffs::Array{BigFloat}, x::BigFloat) return poly_eval(ncoeffs, x) / poly_eval(dcoeffs, x) end # Format a BigFloat into an appropriate output format. # Arguments: # x BigFloat to format. # # Return value is a string. function float_to_str(x) return string(x) * floatsuffix end # Format a polynomial into an arithmetic expression, for pasting into # other tools such as gnuplot. # Arguments: # coeffs Array of BigFloats giving the coefficients of the polynomial. # Starts with the constant term, and 1-based, as above. # # Return value is a string. function poly_to_string(coeffs::Array{BigFloat}) n = length(coeffs) if n == 0 return "0" elseif n == 1 return float_to_str(coeffs[1]) else return string(float_to_str(coeffs[1]), "+", xvarname, "*(", poly_to_string(coeffs[2:n]), ")") end end # Format a rational function into a string. # Arguments: # ncoeffs Array of BigFloats giving the coefficients of the numerator. # Starts with the constant term, and 1-based, as above. # dcoeffs Array of BigFloats giving the coefficients of the denominator. # Starts with the constant term, and 1-based, as above. # # Return value is a string. 
function ratfn_to_string(ncoeffs::Array{BigFloat}, dcoeffs::Array{BigFloat}) if length(dcoeffs) == 1 && dcoeffs[1] == 1 # Special case: if the denominator is just 1, leave it out. return poly_to_string(ncoeffs) else return string("(", poly_to_string(ncoeffs), ")/(", poly_to_string(dcoeffs), ")") end end # Format a list of x,y pairs into a string. # Arguments: # xys Array of (x,y) pairs of BigFloats. # # Return value is a string. function format_xylist(xys::Array{Tuple{BigFloat,BigFloat}}) return ("[\n" * join([" "*string(x)*" -> "*string(y) for (x,y) in xys], "\n") * "\n]") end # ---------------------------------------------------------------------- # Matrix-equation solver for matrices of BigFloat. # # I had hoped that Julia's type-genericity would allow me to solve the # matrix equation Mx=V by just writing 'M \ V'. Unfortunately, that # works by translating the inputs into double precision and handing # off to an optimised library, which misses the point when I have a # matrix and vector of BigFloat and want my result in _better_ than # double precision. So I have to implement my own specialisation of # the \ operator for that case. # # Fortunately, the point of using BigFloats is that we have precision # to burn, so I can do completely naïve Gaussian elimination without # worrying about instability. # Arguments: # matrix_in 2-dimensional array of BigFloats, representing a matrix M # in row-first order, i.e. matrix_in[r,c] represents the # entry in row r col c. # vector_in 1-dimensional array of BigFloats, representing a vector V. # # Return value: a 1-dimensional array X of BigFloats, satisfying M X = V. # # Expects the input to be an invertible square matrix and a vector of # the corresponding size, on pain of failing an assertion. function \(matrix_in :: Array{BigFloat,2}, vector_in :: Array{BigFloat,1}) # Copy the inputs, because we'll be mutating them as we go. 
M = copy(matrix_in) V = copy(vector_in) # Input consistency criteria: matrix is square, and vector has # length to match. n = length(V) @assert(n > 0) @assert(size(M) == (n,n)) @debug("gausselim", "starting, n=", n) for i = 1:1:n # Straightforward Gaussian elimination: find the largest # non-zero entry in column i (and in a row we haven't sorted # out already), swap it into row i, scale that row to # normalise it to 1, then zero out the rest of the column by # subtracting a multiple of that row from each other row. @debug("gausselim", "matrix=", repr(M)) @debug("gausselim", "vector=", repr(V)) # Find the best pivot. bestrow = 0 bestval = 0 for j = i:1:n if abs(M[j,i]) > bestval bestrow = j bestval = M[j,i] end end @assert(bestrow > 0) # make sure we did actually find one @debug("gausselim", "bestrow=", bestrow) # Swap it into row i. if bestrow != i for k = 1:1:n M[bestrow,k],M[i,k] = M[i,k],M[bestrow,k] end V[bestrow],V[i] = V[i],V[bestrow] end # Scale that row so that M[i,i] becomes 1. divisor = M[i,i] for k = 1:1:n M[i,k] = M[i,k] / divisor end V[i] = V[i] / divisor @assert(M[i,i] == 1) # Zero out all other entries in column i, by subtracting # multiples of this row. for j = 1:1:n if j != i factor = M[j,i] for k = 1:1:n M[j,k] = M[j,k] - M[i,k] * factor end V[j] = V[j] - V[i] * factor @assert(M[j,i] == 0) end end end @debug("gausselim", "matrix=", repr(M)) @debug("gausselim", "vector=", repr(V)) @debug("gausselim", "done!") # Now we're done: M is the identity matrix, so the equation Mx=V # becomes just x=V, i.e. V is already exactly the vector we want # to return. return V end # ---------------------------------------------------------------------- # Least-squares fitting of a rational function to a set of (x,y) # points. # # We use this to get an initial starting point for the Remez # iteration. 
Therefore, it doesn't really need to be particularly # accurate; it only needs to be good enough to wiggle back and forth # across the target function the right number of times (so as to give # enough error extrema to start optimising from) and not have any # poles in the target interval. # # Least-squares fitting of a _polynomial_ is actually a sensible thing # to do, and minimises the rms error. Doing the following trick with a # rational function P/Q is less sensible, because it cannot be made to # minimise the error function (P/Q-f)^2 that you actually wanted; # instead it minimises (P-fQ)^2. But that should be good enough to # have the properties described above. # # Some theory: suppose you're trying to choose a set of parameters a_i # so as to minimise the sum of squares of some error function E_i. # Basic calculus says, if you do this in one variable, just # differentiate and solve for zero. In this case, that works fine even # with multiple variables, because you _partially_ differentiate with # respect to each a_i, giving a system of equations, and that system # turns out to be linear so we just solve it as a matrix. # # In this case, our parameters are the coefficients of P and Q; to # avoid underdetermining the system we'll fix Q's constant term at 1, # so that our error function (as described above) is # # E = \sum (p_0 + p_1 x + ... + p_n x^n - y - y q_1 x - ... - y q_d x^d)^2 # # where the sum is over all (x,y) coordinate pairs. Setting dE/dp_j=0 # (for each j) gives an equation of the form # # 0 = \sum 2(p_0 + p_1 x + ... + p_n x^n - y - y q_1 x - ... - y q_d x^d) x^j # # and setting dE/dq_j=0 gives one of the form # # 0 = \sum 2(p_0 + p_1 x + ... + p_n x^n - y - y q_1 x - ... - y q_d x^d) y x^j # # And both of those row types, treated as multivariate linear # equations in the p,q values, have each coefficient being a value of # the form \sum x^i, \sum y x^i or \sum y^2 x^i, for various i. (Times # a factor of 2, but we can throw that away.) 
So we can go through the # list of input coordinates summing all of those things, and then we # have enough information to construct our matrix and solve it # straight off for the rational function coefficients. # Arguments: # f The function to be approximated. Maps BigFloat -> BigFloat. # xvals Array of BigFloats, giving the list of x-coordinates at which # to evaluate f. # n Degree of the numerator polynomial of the desired rational # function. # d Degree of the denominator polynomial of the desired rational # function. # w Error-weighting function. Takes two BigFloat arguments x,y # and returns a scaling factor for the error at that location. # A larger value indicates that the error should be given # greater weight in the square sum we try to minimise. # If unspecified, defaults to giving everything the same weight. # # Return values: a pair of arrays of BigFloats (N,D) giving the # coefficients of the returned rational function. N has size n+1; D # has size d+1. Both start with the constant term, i.e. N[i] is the # coefficient of x^(i-1) (because Julia arrays are 1-based). D[1] will # be 1. function ratfn_leastsquares(f::Function, xvals::Array{BigFloat}, n, d, w = (x,y)->BigFloat(1)) # Accumulate sums of x^i y^j, for j={0,1,2} and a range of x. # Again because Julia arrays are 1-based, we'll have sums[i,j] # being the sum of x^(i-1) y^(j-1). maxpow = max(n,d) * 2 + 1 sums = zeros(BigFloat, maxpow, 3) for x = xvals y = f(x) weight = w(x,y) for i = 1:1:maxpow for j = 1:1:3 sums[i,j] += x^(i-1) * y^(j-1) * weight end end end @debug("leastsquares", "sums=", repr(sums)) # Build the matrix. We're solving n+d+1 equations in n+d+1 # unknowns. (We actually have to return n+d+2 coefficients, but # one of them is hardwired to 1.) matrix = array2d(BigFloat, n+d+1, n+d+1) vector = array1d(BigFloat, n+d+1) for i = 0:1:n # Equation obtained by differentiating with respect to p_i, # i.e. the numerator coefficient of x^i. 
row = 1+i for j = 0:1:n matrix[row, 1+j] = sums[1+i+j, 1] end for j = 1:1:d matrix[row, 1+n+j] = -sums[1+i+j, 2] end vector[row] = sums[1+i, 2] end for i = 1:1:d # Equation obtained by differentiating with respect to q_i, # i.e. the denominator coefficient of x^i. row = 1+n+i for j = 0:1:n matrix[row, 1+j] = sums[1+i+j, 2] end for j = 1:1:d matrix[row, 1+n+j] = -sums[1+i+j, 3] end vector[row] = sums[1+i, 3] end @debug("leastsquares", "matrix=", repr(matrix)) @debug("leastsquares", "vector=", repr(vector)) # Solve the matrix equation. all_coeffs = matrix \ vector @debug("leastsquares", "all_coeffs=", repr(all_coeffs)) # And marshal the results into two separate polynomial vectors to # return. ncoeffs = all_coeffs[1:n+1] dcoeffs = vcat([1], all_coeffs[n+2:n+d+1]) return (ncoeffs, dcoeffs) end # ---------------------------------------------------------------------- # Golden-section search to find a maximum of a function. # Arguments: # f Function to be maximised/minimised. Maps BigFloat -> BigFloat. # a,b,c BigFloats bracketing a maximum of the function. # # Expects: # a,b,c are in order (either a<=b<=c or c<=b<=a) # a != c (but b can equal one or the other if it wants to) # f(a) <= f(b) >= f(c) # # Return value is an (x,y) pair of BigFloats giving the extremal input # and output. (That is, y=f(x).) function goldensection(f::Function, a::BigFloat, b::BigFloat, c::BigFloat) # Decide on a 'good enough' threshold. threshold = abs(c-a) * 2^(-epsbits/2) # We'll need the golden ratio phi, of course. Or rather, in this # case, we need 1/phi = 0.618... one_over_phi = 2 / (1 + sqrt(BigFloat(5))) # Flip round the interval endpoints so that the interval [a,b] is # at least as large as [b,c]. (Then we can always pick our new # point in [a,b] without having to handle lots of special cases.) if abs(b-a) < abs(c-a) a, c = c, a end # Evaluate the function at the initial points. 
fa = f(a) fb = f(b) fc = f(c) @debug("goldensection", "starting") while abs(c-a) > threshold @debug("goldensection", "a: ", a, " -> ", fa) @debug("goldensection", "b: ", b, " -> ", fb) @debug("goldensection", "c: ", c, " -> ", fc) # Check invariants. @assert(a <= b <= c || c <= b <= a) @assert(fa <= fb >= fc) # Subdivide the larger of the intervals [a,b] and [b,c]. We've # arranged that this is always [a,b], for simplicity. d = a + (b-a) * one_over_phi # Now we have an interval looking like this (possibly # reversed): # # a d b c # # and we know f(b) is bigger than either f(a) or f(c). We have # two cases: either f(d) > f(b), or vice versa. In either # case, we can narrow to an interval of 1/phi the size, and # still satisfy all our invariants (three ordered points, # [a,b] at least the width of [b,c], f(a)<=f(b)>=f(c)). fd = f(d) @debug("goldensection", "d: ", d, " -> ", fd) if fd > fb a, b, c = a, d, b fa, fb, fc = fa, fd, fb @debug("goldensection", "adb case") else a, b, c = c, b, d fa, fb, fc = fc, fb, fd @debug("goldensection", "cbd case") end end @debug("goldensection", "done: ", b, " -> ", fb) return (b, fb) end # ---------------------------------------------------------------------- # Find the extrema of a function within a given interval. # Arguments: # f The function to be approximated. Maps BigFloat -> BigFloat. # grid A set of points at which to evaluate f. Must be high enough # resolution to make extrema obvious. # # Returns an array of (x,y) pairs of BigFloats, with each x,y giving # the extremum location and its value (i.e. y=f(x)). function find_extrema(f::Function, grid::Array{BigFloat}) len = length(grid) extrema = array1d(Tuple{BigFloat, BigFloat}, 0) for i = 1:1:len # We have to provide goldensection() with three points # bracketing the extremum. If the extremum is at one end of # the interval, then the only way we can do that is to set two # of the points equal (which goldensection() will cope with). 
prev = max(1, i-1) next = min(i+1, len) # Find our three pairs of (x,y) coordinates. xp, xi, xn = grid[prev], grid[i], grid[next] yp, yi, yn = f(xp), f(xi), f(xn) # See if they look like an extremum, and if so, ask # goldensection() to give a more exact location for it. if yp <= yi >= yn push!(extrema, goldensection(f, xp, xi, xn)) elseif yp >= yi <= yn x, y = goldensection(x->-f(x), xp, xi, xn) push!(extrema, (x, -y)) end end return extrema end # ---------------------------------------------------------------------- # Winnow a list of a function's extrema to give a subsequence of a # specified length, with the extrema in the subsequence alternating # signs, and with the smallest absolute value of an extremum in the # subsequence as large as possible. # # We do this using a dynamic-programming approach. We work along the # provided array of extrema, and at all times, we track the best set # of extrema we have so far seen for each possible (length, sign of # last extremum) pair. Each new extremum is evaluated to see whether # it can be added to any previously seen best subsequence to make a # new subsequence that beats the previous record holder in its slot. # Arguments: # extrema An array of (x,y) pairs of BigFloats giving the input extrema. # n Number of extrema required as output. # # Returns a new array of (x,y) pairs which is a subsequence of the # original sequence. (So, in particular, if the input was sorted by x # then so will the output be.) function winnow_extrema(extrema::Array{Tuple{BigFloat,BigFloat}}, n) # best[i,j] gives the best sequence so far of length i and with # sign j (where signs are coded as 1=positive, 2=negative), in the # form of a tuple (cost, actual array of x,y pairs). best = fill((BigFloat(0), array1d(Tuple{BigFloat,BigFloat}, 0)), n, 2) for (x,y) = extrema if y > 0 sign = 1 elseif y < 0 sign = 2 else # A zero-valued extremum cannot possibly contribute to any # optimal sequence, so we simply ignore it! 
continue end for i = 1:1:n # See if we can create a new entry for best[i,sign] by # appending our current (x,y) to some previous thing. if i == 1 # Special case: we don't store a best zero-length # sequence :-) candidate = (abs(y), [(x,y)]) else othersign = 3-sign # map 1->2 and 2->1 oldscore, oldlist = best[i-1, othersign] newscore = min(abs(y), oldscore) newlist = vcat(oldlist, [(x,y)]) candidate = (newscore, newlist) end # If our new candidate improves on the previous value of # best[i,sign], then replace it. if candidate[1] > best[i,sign][1] best[i,sign] = candidate end end end # Our ultimate return value has to be either best[n,1] or # best[n,2], but it could be either. See which one has the higher # score. if best[n,1][1] > best[n,2][1] ret = best[n,1][2] else ret = best[n,2][2] end # Make sure we did actually _find_ a good answer. @assert(length(ret) == n) return ret end # ---------------------------------------------------------------------- # Construct a rational-function approximation with equal and # alternating weighted deviation at a specific set of x-coordinates. # Arguments: # f The function to be approximated. Maps BigFloat -> BigFloat. # coords An array of BigFloats giving the x-coordinates. There should # be n+d+2 of them. # n, d The degrees of the numerator and denominator of the desired # approximation. # prev_err A plausible value for the alternating weighted deviation. # (Required to kickstart a binary search in the nonlinear case; # see comments below.) # w Error-weighting function. Takes two BigFloat arguments x,y # and returns a scaling factor for the error at that location. # The returned approximation R should have the minimum possible # maximum value of abs((f(x)-R(x)) * w(x,f(x))). Optional # parameter, defaulting to the always-return-1 function. # # Return values: a pair of arrays of BigFloats (N,D) giving the # coefficients of the returned rational function. N has size n+1; D # has size d+1. Both start with the constant term, i.e. 
N[i] is the # coefficient of x^(i-1) (because Julia arrays are 1-based). D[1] will # be 1. function ratfn_equal_deviation(f::Function, coords::Array{BigFloat}, n, d, prev_err::BigFloat, w = (x,y)->BigFloat(1)) @debug("equaldev", "n=", n, " d=", d, " coords=", repr(coords)) @assert(length(coords) == n+d+2) if d == 0 # Special case: we're after a polynomial. In this case, we # have the particularly easy job of just constructing and # solving a system of n+2 linear equations, to find the n+1 # coefficients of the polynomial and also the amount of # deviation at the specified coordinates. Each equation is of # the form # # p_0 x^0 + p_1 x^1 + ... + p_n x^n ± e/w(x) = f(x) # # in which the p_i and e are the variables, and the powers of # x and calls to w and f are the coefficients. matrix = array2d(BigFloat, n+2, n+2) vector = array1d(BigFloat, n+2) currsign = +1 for i = 1:1:n+2 x = coords[i] for j = 0:1:n matrix[i,1+j] = x^j end y = f(x) vector[i] = y matrix[i, n+2] = currsign / w(x,y) currsign = -currsign end @debug("equaldev", "matrix=", repr(matrix)) @debug("equaldev", "vector=", repr(vector)) outvector = matrix \ vector @debug("equaldev", "outvector=", repr(outvector)) ncoeffs = outvector[1:n+1] dcoeffs = [BigFloat(1)] return ncoeffs, dcoeffs else # For a nontrivial rational function, the system of equations # we need to solve becomes nonlinear, because each equation # now takes the form # # p_0 x^0 + p_1 x^1 + ... + p_n x^n # --------------------------------- ± e/w(x) = f(x) # x^0 + q_1 x^1 + ... + q_d x^d # # and multiplying up by the denominator gives you a lot of # terms containing e × q_i. So we can't do this the really # easy way using a matrix equation as above. # # Fortunately, this is a fairly easy kind of nonlinear system. 
# The equations all become linear if you switch to treating e # as a constant, so a reasonably sensible approach is to pick # a candidate value of e, solve all but one of the equations # for the remaining unknowns, and then see what the error # turns out to be in the final equation. The Chebyshev # alternation theorem guarantees that that error in the last # equation will be anti-monotonic in the input e, so we can # just binary-search until we get the two as close to equal as # we need them. function try_e(e) # Try a given value of e, derive the coefficients of the # resulting rational function by setting up equations # based on the first n+d+1 of the n+d+2 coordinates, and # see what the error turns out to be at the final # coordinate. matrix = array2d(BigFloat, n+d+1, n+d+1) vector = array1d(BigFloat, n+d+1) currsign = +1 for i = 1:1:n+d+1 x = coords[i] y = f(x) y_adj = y - currsign * e / w(x,y) for j = 0:1:n matrix[i,1+j] = x^j end for j = 1:1:d matrix[i,1+n+j] = -x^j * y_adj end vector[i] = y_adj currsign = -currsign end @debug("equaldev", "trying e=", e) @debug("equaldev", "matrix=", repr(matrix)) @debug("equaldev", "vector=", repr(vector)) outvector = matrix \ vector @debug("equaldev", "outvector=", repr(outvector)) ncoeffs = outvector[1:n+1] dcoeffs = vcat([BigFloat(1)], outvector[n+2:n+d+1]) x = coords[n+d+2] y = f(x) last_e = (ratfn_eval(ncoeffs, dcoeffs, x) - y) * w(x,y) * -currsign @debug("equaldev", "last e=", last_e) return ncoeffs, dcoeffs, last_e end threshold = 2^(-epsbits/2) # convergence threshold # Start by trying our previous iteration's error value. This # value (e0) will be one end of our binary-search interval, # and whatever it caused the last point's error to be, that # (e1) will be the other end. e0 = prev_err @debug("equaldev", "e0 = ", e0) nc, dc, e1 = try_e(e0) @debug("equaldev", "e1 = ", e1) if abs(e1-e0) <= threshold # If we're _really_ lucky, we hit the error right on the # nose just by doing that! 
return nc, dc end s = sign(e1-e0) @debug("equaldev", "s = ", s) # Verify by assertion that trying our other interval endpoint # e1 gives a value that's wrong in the other direction. # (Otherwise our binary search won't get a sensible answer at # all.) nc, dc, e2 = try_e(e1) @debug("equaldev", "e2 = ", e2) @assert(sign(e2-e1) == -s) # Now binary-search until our two endpoints narrow enough. local emid while abs(e1-e0) > threshold emid = (e1+e0)/2 nc, dc, enew = try_e(emid) if sign(enew-emid) == s e0 = emid else e1 = emid end end @debug("equaldev", "final e=", emid) return nc, dc end end # ---------------------------------------------------------------------- # Top-level function to find a minimax rational-function approximation. # Arguments: # f The function to be approximated. Maps BigFloat -> BigFloat. # interval A pair of BigFloats giving the endpoints of the interval # (in either order) on which to approximate f. # n, d The degrees of the numerator and denominator of the desired # approximation. # w Error-weighting function. Takes two BigFloat arguments x,y # and returns a scaling factor for the error at that location. # The returned approximation R should have the minimum possible # maximum value of abs((f(x)-R(x)) * w(x,f(x))). Optional # parameter, defaulting to the always-return-1 function. # # Return values: a tuple (N,D,E,X), where # N,D A pair of arrays of BigFloats giving the coefficients # of the returned rational function. N has size n+1; D # has size d+1. Both start with the constant term, i.e. # N[i] is the coefficient of x^(i-1) (because Julia # arrays are 1-based). D[1] will be 1. # E The maximum weighted error (BigFloat). # X An array of pairs of BigFloats giving the locations of n+2 # points and the weighted error at each of those points. 
The # weighted error values will have alternating signs, which # means that the Chebyshev alternation theorem guarantees # that any other function of the same degree must exceed # the error of this one at at least one of those points. function ratfn_minimax(f::Function, interval::Tuple{BigFloat,BigFloat}, n, d, w = (x,y)->BigFloat(1)) # We start off by finding a least-squares approximation. This # doesn't need to be perfect, but if we can get it reasonably good # then it'll save iterations in the refining stage. # # Least-squares approximations tend to look nicer in a minimax # sense if you evaluate the function at a big pile of Chebyshev # nodes rather than uniformly spaced points. These values will # also make a good grid to use for the initial search for error # extrema, so we'll keep them around for that reason too. # Construct the grid. lo, hi = minimum(interval), maximum(interval) local grid let mid = (hi+lo)/2 halfwid = (hi-lo)/2 nnodes = 16 * (n+d+1) pi = 2*asin(BigFloat(1)) grid = [ mid - halfwid * cos(pi*i/nnodes) for i=0:1:nnodes ] end # Find the initial least-squares approximation. (nc, dc) = ratfn_leastsquares(f, grid, n, d, w) @debug("minimax", "initial leastsquares approx = ", ratfn_to_string(nc, dc)) # Threshold of convergence. We stop when the relative difference # between the min and max (winnowed) error extrema is less than # this. # # This is set to the cube root of machine epsilon on a more or # less empirical basis, because the rational-function case will # not converge reliably if you set it to only the square root. # (Repeatable by using the --test mode.) On the assumption that # input and output error in each iteration can be expected to be # related by a simple power law (because it'll just be down to how # many leading terms of a Taylor series are zero), the cube root # was the next thing to try. threshold = 2^(-epsbits/3) # Main loop. while true # Find all the error extrema we can. 
function compute_error(x) real_y = f(x) approx_y = ratfn_eval(nc, dc, x) return (approx_y - real_y) * w(x, real_y) end extrema = find_extrema(compute_error, grid) @debug("minimax", "all extrema = ", format_xylist(extrema)) # Winnow the extrema down to the right number, and ensure they # have alternating sign. extrema = winnow_extrema(extrema, n+d+2) @debug("minimax", "winnowed extrema = ", format_xylist(extrema)) # See if we've finished. min_err = minimum([abs(y) for (x,y) = extrema]) max_err = maximum([abs(y) for (x,y) = extrema]) variation = (max_err - min_err) / max_err @debug("minimax", "extremum variation = ", variation) if variation < threshold @debug("minimax", "done!") return nc, dc, max_err, extrema end # If not, refine our function by equalising the error at the # extrema points, and go round again. (nc, dc) = ratfn_equal_deviation(f, map(x->x[1], extrema), n, d, max_err, w) @debug("minimax", "refined approx = ", ratfn_to_string(nc, dc)) end end # ---------------------------------------------------------------------- # Check if a polynomial is well-conditioned for accurate evaluation in # a given interval by Horner's rule. # # This is true if at every step where Horner's rule computes # (coefficient + x*value_so_far), the constant coefficient you're # adding on is of larger magnitude than the x*value_so_far operand. # And this has to be true for every x in the interval. # # Arguments: # coeffs The coefficients of the polynomial under test. Starts with # the constant term, i.e. coeffs[i] is the coefficient of # x^(i-1) (because Julia arrays are 1-based). # lo, hi The bounds of the interval. # # Return value: the largest ratio (x*value_so_far / coefficient), at # any step of evaluation, for any x in the interval. 
If this is less # than 1, the polynomial is at least somewhat well-conditioned; # ideally you want it to be more like 1/8 or 1/16 or so, so that the # relative rounding error accumulated at each step are reduced by # several factors of 2 when the next coefficient is added on. function wellcond(coeffs, lo, hi) x = max(abs(lo), abs(hi)) worst = 0 so_far = 0 for i = length(coeffs):-1:1 coeff = abs(coeffs[i]) so_far *= x if coeff != 0 thisval = so_far / coeff worst = max(worst, thisval) so_far += coeff end end return worst end # ---------------------------------------------------------------------- # Small set of unit tests. function test() passes = 0 fails = 0 function approx_eq(x, y, limit=1e-6) return abs(x - y) < limit end function test(condition) if condition passes += 1 else println("fail") fails += 1 end end # Test Gaussian elimination. println("Gaussian test 1:") m = BigFloat[1 1 2; 3 5 8; 13 34 21] v = BigFloat[1, -1, 2] ret = m \ v println(" ",repr(ret)) test(approx_eq(ret[1], 109/26)) test(approx_eq(ret[2], -105/130)) test(approx_eq(ret[3], -31/26)) # Test leastsquares rational functions. println("Leastsquares test 1:") n = 10000 a = array1d(BigFloat, n+1) for i = 0:1:n a[1+i] = i/BigFloat(n) end (nc, dc) = ratfn_leastsquares(x->exp(x), a, 2, 2) println(" ",ratfn_to_string(nc, dc)) for x = a test(approx_eq(exp(x), ratfn_eval(nc, dc, x), 1e-4)) end # Test golden section search. println("Golden section test 1:") x, y = goldensection(x->sin(x), BigFloat(0), BigFloat(1)/10, BigFloat(4)) println(" ", x, " -> ", y) test(approx_eq(x, asin(BigFloat(1)))) test(approx_eq(y, 1)) # Test extrema-winnowing algorithm. 
println("Winnow test 1:") extrema = [(x, sin(20*x)*sin(197*x)) for x in BigFloat(0):BigFloat(1)/1000:BigFloat(1)] winnowed = winnow_extrema(extrema, 12) println(" ret = ", format_xylist(winnowed)) prevx, prevy = -1, 0 for (x,y) = winnowed test(x > prevx) test(y != 0) test(prevy * y <= 0) # tolerates initial prevx having no sign test(abs(y) > 0.9) prevx, prevy = x, y end # Test actual minimax approximation. println("Minimax test 1 (polynomial):") (nc, dc, e, x) = ratfn_minimax(x->exp(x), (BigFloat(0), BigFloat(1)), 4, 0) println(" ",e) println(" ",ratfn_to_string(nc, dc)) test(0 < e < 1e-3) for x = 0:BigFloat(1)/1000:1 test(abs(ratfn_eval(nc, dc, x) - exp(x)) <= e * 1.0000001) end println("Minimax test 2 (rational):") (nc, dc, e, x) = ratfn_minimax(x->exp(x), (BigFloat(0), BigFloat(1)), 2, 2) println(" ",e) println(" ",ratfn_to_string(nc, dc)) test(0 < e < 1e-3) for x = 0:BigFloat(1)/1000:1 test(abs(ratfn_eval(nc, dc, x) - exp(x)) <= e * 1.0000001) end println("Minimax test 3 (polynomial, weighted):") (nc, dc, e, x) = ratfn_minimax(x->exp(x), (BigFloat(0), BigFloat(1)), 4, 0, (x,y)->1/y) println(" ",e) println(" ",ratfn_to_string(nc, dc)) test(0 < e < 1e-3) for x = 0:BigFloat(1)/1000:1 test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001) end println("Minimax test 4 (rational, weighted):") (nc, dc, e, x) = ratfn_minimax(x->exp(x), (BigFloat(0), BigFloat(1)), 2, 2, (x,y)->1/y) println(" ",e) println(" ",ratfn_to_string(nc, dc)) test(0 < e < 1e-3) for x = 0:BigFloat(1)/1000:1 test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001) end println("Minimax test 5 (rational, weighted, odd degree):") (nc, dc, e, x) = ratfn_minimax(x->exp(x), (BigFloat(0), BigFloat(1)), 2, 1, (x,y)->1/y) println(" ",e) println(" ",ratfn_to_string(nc, dc)) test(0 < e < 1e-3) for x = 0:BigFloat(1)/1000:1 test(abs(ratfn_eval(nc, dc, x) - exp(x))/exp(x) <= e * 1.0000001) end total = passes + fails println(passes, " passes ", fails, " fails ", total, " total") end # 
---------------------------------------------------------------------- # Online help. function help() print(""" Usage: remez.jl [options] [] Arguments: , Bounds of the interval on which to approximate the target function. These are parsed and evaluated as Julia expressions, so you can write things like '1/BigFloat(6)' to get an accurate representation of 1/6, or '4*atan(BigFloat(1))' to get pi. (Unfortunately, the obvious 'BigFloat(pi)' doesn't work in Julia.) , The desired degree of polynomial(s) you want for your approximation. These should be non-negative integers. If you want a rational function as output, set to the degree of the numerator, and the denominator. If you just want an ordinary polynomial, set to 0, and to the degree of the polynomial you want. A Julia expression giving the function to be approximated on the interval. The input value is predefined as 'x' when this expression is evaluated, so you should write something along the lines of 'sin(x)' or 'sqrt(1+tan(x)^2)' etc. If provided, a Julia expression giving the weighting factor for the approximation error. The output polynomial will minimise the largest absolute value of (P-f) * w at any point in the interval, where P is the value of the polynomial, f is the value of the target function given by , and w is the weight given by this function. When this expression is evaluated, the input value to P and f is predefined as 'x', and also the true output value f(x) is predefined as 'y'. So you can minimise the relative error by simply writing '1/y'. If the argument is not provided, the default weighting function always returns 1, so that the polynomial will minimise the maximum absolute error |P-f|. Computation options: --pre= Evaluate the Julia expression before starting the computation. This permits you to pre-define variables or functions which the Julia expressions in your main arguments can refer to. All of , , and can make use of things defined by . 
One internal remez.jl function that you might sometimes find useful in this expression is 'goldensection', which finds the location and value of a maximum of a function. For example, one implementation strategy for the gamma function involves translating it to put its unique local minimum at the origin, in which case you can write something like this --pre='(m,my) = goldensection(x -> -gamma(x), BigFloat(1), BigFloat(1.5), BigFloat(2))' to predefine 'm' as the location of gamma's minimum, and 'my' as the (negated) value that gamma actually takes at that point, i.e. -gamma(m). (Since 'goldensection' always finds a maximum, we had to negate gamma in the input function to make it find a minimum instead. Consult the comments in the source for more details on the use of this function.) If you use this option more than once, all the expressions you provide will be run in sequence. --bits= Specify the accuracy to which you want the output polynomial, in bits. Default 256, which should be more than enough. --bigfloatbits= Turn up the precision used by Julia for its BigFloat evaluation. Default is Julia's default (also 256). You might want to try setting this higher than the --bits value if the algorithm is failing to converge for some reason. Output options: --full Instead of just printing the approximation function itself, also print auxiliary information: - the locations of the error extrema, and the actual (weighted) error at each of those locations - the overall maximum error of the function - a 'well-conditioning quotient', giving the worst-case ratio between any polynomial coefficient and the largest possible value of the higher-order terms it will be added to. The well-conditioning quotient should be less than 1, ideally by several factors of two, for accurate evaluation in the target precision. If you request a rational function, a separate well-conditioning quotient will be printed for the numerator and denominator. 
Use this option when deciding how wide an interval to approximate your function on, and what degree of polynomial you need. --variable= When writing the output polynomial or rational function in its usual form as an arithmetic expression, use as the name of the input variable. Default is 'x'. --suffix= When writing the output polynomial or rational function in its usual form as an arithmetic expression, write after every floating-point literal. For example, '--suffix=F' will generate a C expression in which the coefficients are literals of type 'float' rather than 'double'. --array Instead of writing the output polynomial as an arithmetic expression in Horner's rule form, write out just its coefficients, one per line, each with a trailing comma. Suitable for pasting into a C array declaration. This option is not currently supported if the output is a rational function, because you'd need two separate arrays for the numerator and denominator coefficients and there's no obviously right way to provide both of those together. Debug and test options: --debug= Enable debugging output from various parts of the Remez calculation. should be the name of one of the classes of diagnostic output implemented in the program. Useful values include 'gausselim', 'leastsquares', 'goldensection', 'equaldev', 'minimax'. This is probably mostly useful to people debugging problems with the script, so consult the source code for more information about what the diagnostic output for each of those facilities will be. If you want diagnostics from more than one facility, specify this option multiple times with different arguments. --test Run remez.jl's internal test suite. No arguments needed. Miscellaneous options: --help Display this text and exit. No arguments needed. """) end # ---------------------------------------------------------------------- # Main program. 
function main() nargs = length(argwords) if nargs != 5 && nargs != 6 error("usage: remez.jl []\n" * " run 'remez.jl --help' for more help") end for preliminary_command in preliminary_commands eval(Meta.parse(preliminary_command)) end lo = BigFloat(eval(Meta.parse(argwords[1]))) hi = BigFloat(eval(Meta.parse(argwords[2]))) n = parse(Int,argwords[3]) d = parse(Int,argwords[4]) f = eval(Meta.parse("x -> " * argwords[5])) # Wrap the user-provided function with a function of our own. This # arranges to detect silly FP values (inf,nan) early and diagnose # them sensibly, and also lets us log all evaluations of the # function in case you suspect it's doing the wrong thing at some # special-case point. function func(x) y = run(f,x) @debug("f", x, " -> ", y) if !isfinite(y) error("f(" * string(x) * ") returned non-finite value " * string(y)) end return y end if nargs == 6 # Wrap the user-provided weight function similarly. w = eval(Meta.parse("(x,y) -> " * argwords[6])) function wrapped_weight(x,y) ww = run(w,x,y) if !isfinite(ww) error("w(" * string(x) * "," * string(y) * ") returned non-finite value " * string(ww)) end return ww end weight = wrapped_weight else weight = (x,y)->BigFloat(1) end (nc, dc, e, extrema) = ratfn_minimax(func, (lo, hi), n, d, weight) if array_format if d == 0 functext = join([string(x)*",\n" for x=nc],"") else # It's unclear how you should best format an array of # coefficients for a rational function, so I'll leave # implementing this option until I have a use case. 
error("--array unsupported for rational functions") end else functext = ratfn_to_string(nc, dc) * "\n" end if full_output # Print everything you might want to know about the function println("extrema = ", format_xylist(extrema)) println("maxerror = ", string(e)) if length(dc) > 1 println("wellconditioning_numerator = ", string(wellcond(nc, lo, hi))) println("wellconditioning_denominator = ", string(wellcond(dc, lo, hi))) else println("wellconditioning = ", string(wellcond(nc, lo, hi))) end print("function = ", functext) else # Just print the text people will want to paste into their code print(functext) end end # ---------------------------------------------------------------------- # Top-level code: parse the argument list and decide what to do. what_to_do = main doing_opts = true argwords = array1d(String, 0) for arg = ARGS global doing_opts, what_to_do, argwords global full_output, array_format, xvarname, floatsuffix, epsbits if doing_opts && startswith(arg, "-") if arg == "--" doing_opts = false elseif arg == "--help" what_to_do = help elseif arg == "--test" what_to_do = test elseif arg == "--full" full_output = true elseif arg == "--array" array_format = true elseif startswith(arg, "--debug=") enable_debug(arg[length("--debug=")+1:end]) elseif startswith(arg, "--variable=") xvarname = arg[length("--variable=")+1:end] elseif startswith(arg, "--suffix=") floatsuffix = arg[length("--suffix=")+1:end] elseif startswith(arg, "--bits=") epsbits = parse(Int,arg[length("--bits=")+1:end]) elseif startswith(arg, "--bigfloatbits=") set_bigfloat_precision( parse(Int,arg[length("--bigfloatbits=")+1:end])) elseif startswith(arg, "--pre=") push!(preliminary_commands, arg[length("--pre=")+1:end]) else error("unrecognised option: ", arg) end else push!(argwords, arg) end end what_to_do() diff --git a/contrib/arm-optimized-routines/math/tools/sin.sollya b/contrib/arm-optimized-routines/math/tools/sin.sollya index a6e851145c11..a19300019867 100644 --- 
a/contrib/arm-optimized-routines/math/tools/sin.sollya +++ b/contrib/arm-optimized-routines/math/tools/sin.sollya @@ -1,37 +1,37 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // polynomial degree a = -pi/4; // interval b = pi/4; // find even polynomial with minimal abs error compared to sin(x)/x // account for /x deg = deg-1; // f = sin(x)/x; f = 1; c = 1; for i from 1 to 60 do { c = 2*i*(2*i + 1)*c; f = f + (-1)^i*x^(2*i)/c; }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg/2 do { p = roundcoefficients(approx(poly,2*i), [|D ...|]); poly = poly + x^(2*i)*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("abs error:", accurateinfnorm(sin(x)-x*poly(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/v_exp.sollya b/contrib/arm-optimized-routines/math/tools/v_exp.sollya index c0abb63fb642..5fa7de7435a9 100644 --- a/contrib/arm-optimized-routines/math/tools/v_exp.sollya +++ b/contrib/arm-optimized-routines/math/tools/v_exp.sollya @@ -1,30 +1,30 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 4; // poly degree N = 128; // table entries b = log(2)/(2*N); // interval a = -b; // find polynomial with minimal abs error // return p that minimizes |exp(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(exp(x)-poly(x), deg-d, [a;b], x^d, 1e-10); }; // first 2 coeffs are fixed, iteratively find optimal double prec coeffs poly = 1 + x; for i from 2 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/exp(x), [a;b], 30)); print("abs error:", accurateinfnorm(exp(x)-poly(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/v_log.sollya b/contrib/arm-optimized-routines/math/tools/v_log.sollya index cc3d2c4ae72a..d982524eb920 100644 --- a/contrib/arm-optimized-routines/math/tools/v_log.sollya +++ b/contrib/arm-optimized-routines/math/tools/v_log.sollya @@ -1,34 +1,34 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree a = -0x1.fc1p-9; b = 0x1.009p-8; // find log(1+x)/x polynomial with minimal relative error // (minimal relative error polynomial for log(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/tools/v_sin.sollya b/contrib/arm-optimized-routines/math/tools/v_sin.sollya index 65cc9957c624..63b9d65a1ac3 100644 --- a/contrib/arm-optimized-routines/math/tools/v_sin.sollya +++ b/contrib/arm-optimized-routines/math/tools/v_sin.sollya @@ -1,36 +1,36 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 15; // polynomial degree a = -pi/2; // interval b = pi/2; // find even polynomial with minimal abs error compared to sin(x)/x // account for /x deg = deg-1; // f = sin(x)/x; f = 1; c = 1; for i from 1 to 60 do { c = 2*i*(2*i + 1)*c; f = f + (-1)^i*x^(2*i)/c; }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)| approx = proc(poly,d) { return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg/2 do { p = roundcoefficients(approx(poly,2*i), [|D ...|]); poly = poly + x^(2*i)*coeff(p,0); }; display = hexadecimal; print("abs error:", accurateinfnorm(sin(x)-x*poly(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/math/v_cos.c b/contrib/arm-optimized-routines/math/v_cos.c index 20ba6bd0d0d9..4c8787e66c41 100644 --- a/contrib/arm-optimized-routines/math/v_cos.c +++ b/contrib/arm-optimized-routines/math/v_cos.c @@ -1,87 +1,95 @@ /* * Double-precision vector cos function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const double Poly[] = { /* worst-case error is 3.5 ulp. abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. 
*/ -0x1.9f4a9c8b21dc9p-41, 0x1.60e88a10163f2p-33, -0x1.ae6361b7254e7p-26, 0x1.71de382e8d62bp-19, -0x1.a01a019aeb4ffp-13, 0x1.111111110b25ep-7, -0x1.55555555554c3p-3, }; #define C7 v_f64 (Poly[0]) #define C6 v_f64 (Poly[1]) #define C5 v_f64 (Poly[2]) #define C4 v_f64 (Poly[3]) #define C3 v_f64 (Poly[4]) #define C2 v_f64 (Poly[5]) #define C1 v_f64 (Poly[6]) #define InvPi v_f64 (0x1.45f306dc9c883p-2) #define HalfPi v_f64 (0x1.921fb54442d18p+0) #define Pi1 v_f64 (0x1.921fb54442d18p+1) #define Pi2 v_f64 (0x1.1a62633145c06p-53) #define Pi3 v_f64 (0x1.c1cd129024e09p-106) #define Shift v_f64 (0x1.8p52) #define RangeVal v_f64 (0x1p23) #define AbsMask v_u64 (0x7fffffffffffffff) VPCS_ATTR __attribute__ ((noinline)) static v_f64_t specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) { return v_call_f64 (cos, x, y, cmp); } VPCS_ATTR v_f64_t V_NAME(cos) (v_f64_t x) { v_f64_t n, r, r2, y; v_u64_t odd, cmp; r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + specialcase later. */ + r = v_sel_f64 (cmp, v_f64 (1.0), r); +#endif + /* n = rint((|x|+pi/2)/pi) - 0.5. */ n = v_fma_f64 (InvPi, r + HalfPi, Shift); odd = v_as_u64_f64 (n) << 63; n -= Shift; n -= v_f64 (0.5); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = v_fma_f64 (-Pi1, n, r); r = v_fma_f64 (-Pi2, n, r); r = v_fma_f64 (-Pi3, n, r); /* sin(r) poly approx. */ r2 = r * r; y = v_fma_f64 (C7, r2, C6); y = v_fma_f64 (y, r2, C5); y = v_fma_f64 (y, r2, C4); y = v_fma_f64 (y, r2, C3); y = v_fma_f64 (y, r2, C2); y = v_fma_f64 (y, r2, C1); y = v_fma_f64 (y * r2, r, r); /* sign. 
*/ y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); if (unlikely (v_any_u64 (cmp))) return specialcase (x, y, cmp); return y; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_cosf.c b/contrib/arm-optimized-routines/math/v_cosf.c index 150294b8845e..bd677c3ae173 100644 --- a/contrib/arm-optimized-routines/math/v_cosf.c +++ b/contrib/arm-optimized-routines/math/v_cosf.c @@ -1,76 +1,84 @@ /* * Single-precision vector cos function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* 1.886 ulp error */ 0x1.5b2e76p-19f, -0x1.9f42eap-13f, 0x1.110df4p-7f, -0x1.555548p-3f, }; #define Pi1 v_f32 (0x1.921fb6p+1f) #define Pi2 v_f32 (-0x1.777a5cp-24f) #define Pi3 v_f32 (-0x1.ee59dap-49f) #define A3 v_f32 (Poly[3]) #define A5 v_f32 (Poly[2]) #define A7 v_f32 (Poly[1]) #define A9 v_f32 (Poly[0]) #define RangeVal v_f32 (0x1p20f) #define InvPi v_f32 (0x1.45f306p-2f) #define Shift v_f32 (0x1.8p+23f) #define AbsMask v_u32 (0x7fffffff) #define HalfPi v_f32 (0x1.921fb6p0f) VPCS_ATTR static v_f32_t specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) { /* Fall back to scalar code. */ return v_call_f32 (cosf, x, y, cmp); } VPCS_ATTR v_f32_t V_NAME(cosf) (v_f32_t x) { v_f32_t n, r, r2, y; v_u32_t odd, cmp; r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + specialcase later. 
*/ + r = v_sel_f32 (cmp, v_f32 (1.0f), r); +#endif + /* n = rint((|x|+pi/2)/pi) - 0.5 */ n = v_fma_f32 (InvPi, r + HalfPi, Shift); odd = v_as_u32_f32 (n) << 31; n -= Shift; n -= v_f32 (0.5f); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ r = v_fma_f32 (-Pi1, n, r); r = v_fma_f32 (-Pi2, n, r); r = v_fma_f32 (-Pi3, n, r); /* y = sin(r) */ r2 = r * r; y = v_fma_f32 (A9, r2, A7); y = v_fma_f32 (y, r2, A5); y = v_fma_f32 (y, r2, A3); y = v_fma_f32 (y * r2, r, r); /* sign fix */ y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); if (unlikely (v_any_u32 (cmp))) return specialcase (x, y, cmp); return y; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_exp.c b/contrib/arm-optimized-routines/math/v_exp.c index e459d53fddd2..da23fd1c5f46 100644 --- a/contrib/arm-optimized-routines/math/v_exp.c +++ b/contrib/arm-optimized-routines/math/v_exp.c @@ -1,94 +1,128 @@ /* * Double-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED #include "v_exp.h" #if V_EXP_TABLE_BITS == 7 /* maxerr: 1.88 +0.5 ulp rel error: 1.4337*2^-53 abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ #define C1 v_f64 (0x1.ffffffffffd43p-2) #define C2 v_f64 (0x1.55555c75adbb2p-3) #define C3 v_f64 (0x1.55555da646206p-5) #define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ #define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ #define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) #elif V_EXP_TABLE_BITS == 8 /* maxerr: 0.54 +0.5 ulp rel error: 1.4318*2^-58 abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. 
*/ #define C1 v_f64 (0x1.fffffffffffd4p-2) #define C2 v_f64 (0x1.5555571d6b68cp-3) #define C3 v_f64 (0x1.5555576a59599p-5) #define InvLn2 v_f64 (0x1.71547652b82fep8) #define Ln2hi v_f64 (0x1.62e42fefa39efp-9) #define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) #endif #define N (1 << V_EXP_TABLE_BITS) #define Tab __v_exp_data #define IndexMask v_u64 (N - 1) #define Shift v_f64 (0x1.8p+52) + +#if WANT_SIMD_EXCEPT + +#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */ +#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */ + +VPCS_ATTR static NOINLINE v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f64 (exp, x, y, cmp); +} + +#else + #define Thres v_f64 (704.0) VPCS_ATTR static v_f64_t specialcase (v_f64_t s, v_f64_t y, v_f64_t n) { v_f64_t absn = v_abs_f64 (n); /* 2^(n/N) may overflow, break it up into s1*s2. */ v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); v_f64_t r1 = s1 * s1; v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); } +#endif + VPCS_ATTR v_f64_t V_NAME(exp) (v_f64_t x) { v_f64_t n, r, r2, s, y, z; v_u64_t cmp, u, e, i; +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + specialcase to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + v_f64_t xm = x; + cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound + >= BigBound - TinyBound); + if (unlikely (v_any_u64 (cmp))) + x = v_sel_f64 (cmp, v_f64 (1), x); +#else cmp = v_cond_u64 (v_abs_f64 (x) > Thres); +#endif /* n = round(x/(ln2/N)). 
*/ z = v_fma_f64 (x, InvLn2, Shift); u = v_as_u64_f64 (z); n = z - Shift; /* r = x - n*ln2/N. */ r = x; r = v_fma_f64 (-Ln2hi, n, r); r = v_fma_f64 (-Ln2lo, n, r); e = u << (52 - V_EXP_TABLE_BITS); i = u & IndexMask; /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ r2 = r * r; y = v_fma_f64 (C2, r, C1); y = v_fma_f64 (C3, r2, y); y = v_fma_f64 (y, r2, r); /* s = 2^(n/N). */ u = v_lookup_u64 (Tab, i); s = v_as_f64_u64 (u + e); if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return specialcase (xm, v_fma_f64 (y, s, s), cmp); +#else return specialcase (s, y, n); +#endif + return v_fma_f64 (y, s, s); } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_exp.h b/contrib/arm-optimized-routines/math/v_exp.h index 305da19c0a53..1e7f7f3b927d 100644 --- a/contrib/arm-optimized-routines/math/v_exp.h +++ b/contrib/arm-optimized-routines/math/v_exp.h @@ -1,14 +1,14 @@ /* * Declarations for double-precision e^x vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #if WANT_VMATH #define V_EXP_TABLE_BITS 7 extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; #endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f.c b/contrib/arm-optimized-routines/math/v_exp2f.c index e3ea5af3414d..7f40dbaa6679 100644 --- a/contrib/arm-optimized-routines/math/v_exp2f.c +++ b/contrib/arm-optimized-routines/math/v_exp2f.c @@ -1,78 +1,117 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* maxerr: 1.962 ulp. 
*/ 0x1.59977ap-10f, 0x1.3ce9e4p-7f, 0x1.c6bd32p-5f, 0x1.ebf9bcp-3f, 0x1.62e422p-1f, }; #define C0 v_f32 (Poly[0]) #define C1 v_f32 (Poly[1]) #define C2 v_f32 (Poly[2]) #define C3 v_f32 (Poly[3]) #define C4 v_f32 (Poly[4]) #define Shift v_f32 (0x1.8p23f) +#if WANT_SIMD_EXCEPT + +#define TinyBound 0x20000000 /* asuint (0x1p-63). */ +#define BigBound 0x42800000 /* asuint (0x1p6). */ + +VPCS_ATTR +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) { /* 2^n may overflow, break it up into s1*s2. */ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); v_f32_t s2 = v_as_f32_u32 (e - b); v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); v_u32_t r2 = v_as_u32_f32 (s1 * s1); v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); /* Similar to r1 but avoids double rounding in the subnormal range. */ v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); } +#endif + VPCS_ATTR v_f32_t V_NAME(exp2f) (v_f32_t x) { - v_f32_t n, r, r2, scale, p, q, poly, absn; + v_f32_t n, r, r2, scale, p, q, poly; v_u32_t cmp, e; - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ +#if WANT_SIMD_EXCEPT + cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound + >= BigBound - TinyBound); + v_f32_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + specialcase to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. 
*/ + if (unlikely (v_any_u32 (cmp))) + x = v_sel_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ #if 0 v_f32_t z; z = x + Shift; n = z - Shift; r = x - n; e = v_as_u32_f32 (z) << 23; #else n = v_round_f32 (x); r = x - n; e = v_as_u32_s32 (v_round_s32 (x)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); + +#if !WANT_SIMD_EXCEPT + v_f32_t absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); +#endif + r2 = r * r; p = v_fma_f32 (C0, r, C1); q = v_fma_f32 (C2, r, C3); q = v_fma_f32 (p, r2, q); p = C4 * r; poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); +#else return specialcase (poly, n, e, absn, cmp, scale); +#endif + return v_fma_f32 (poly, scale, scale); } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/v_exp2f_1u.c index 1caa14d9bfff..de1a32d54139 100644 --- a/contrib/arm-optimized-routines/math/v_exp2f_1u.c +++ b/contrib/arm-optimized-routines/math/v_exp2f_1u.c @@ -1,75 +1,75 @@ /* * Single-precision vector 2^x function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.878 ulp. 
*/ 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f }; #define C0 v_f32 (Poly[0]) #define C1 v_f32 (Poly[1]) #define C2 v_f32 (Poly[2]) #define C3 v_f32 (Poly[3]) #define C4 v_f32 (Poly[4]) #define C5 v_f32 (Poly[5]) #define Shift v_f32 (0x1.8p23f) #define InvLn2 v_f32 (0x1.715476p+0f) #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) { /* 2^n may overflow, break it up into s1*s2. */ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); v_f32_t s2 = v_as_f32_u32 (e - b); v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); v_f32_t r1 = s1 * s1; v_f32_t r0 = poly * s1 * s2; return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); } VPCS_ATTR v_f32_t V_NAME(exp2f_1u) (v_f32_t x) { v_f32_t n, r, scale, poly, absn; v_u32_t cmp, e; /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. 
*/ #if 0 v_f32_t z; z = x + Shift; n = z - Shift; r = x - n; e = v_as_u32_f32 (z) << 23; #else n = v_round_f32 (x); r = x - n; e = v_as_u32_s32 (v_round_s32 (x)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); poly = v_fma_f32 (C0, r, C1); poly = v_fma_f32 (poly, r, C2); poly = v_fma_f32 (poly, r, C3); poly = v_fma_f32 (poly, r, C4); poly = v_fma_f32 (poly, r, C5); poly = v_fma_f32 (poly, r, v_f32 (1.0f)); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } #endif diff --git a/contrib/arm-optimized-routines/math/v_exp_data.c b/contrib/arm-optimized-routines/math/v_exp_data.c index 365355497e95..30421da81429 100644 --- a/contrib/arm-optimized-routines/math/v_exp_data.c +++ b/contrib/arm-optimized-routines/math/v_exp_data.c @@ -1,403 +1,403 @@ /* * Lookup table for double-precision e^x vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_exp.h" #if WANT_VMATH #define N (1 << V_EXP_TABLE_BITS) /* 2^(j/N), j=0..N. 
*/ const u64_t __v_exp_data[] = { #if N == 128 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, 0x3feee89f995ad3ad, 
0x3feeee07298db666, 0x3feef3a2b84f15fb, 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, #elif N == 256 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 
0x3feefc08b26416ff, 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, 
0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, 0x3feff9d96b2a23d9, #endif }; #endif diff --git a/contrib/arm-optimized-routines/math/v_expf.c b/contrib/arm-optimized-routines/math/v_expf.c index d403e00534f0..ade23b2416aa 100644 --- a/contrib/arm-optimized-routines/math/v_expf.c +++ 
b/contrib/arm-optimized-routines/math/v_expf.c @@ -1,83 +1,122 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* maxerr: 1.45358 +0.5 ulp. */ 0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, 0x1.fffdb6p-2f, 0x1.ffffecp-1f, }; #define C0 v_f32 (Poly[0]) #define C1 v_f32 (Poly[1]) #define C2 v_f32 (Poly[2]) #define C3 v_f32 (Poly[3]) #define C4 v_f32 (Poly[4]) #define Shift v_f32 (0x1.8p23f) #define InvLn2 v_f32 (0x1.715476p+0f) #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) +#if WANT_SIMD_EXCEPT + +#define TinyBound 0x20000000 /* asuint (0x1p-63). */ +#define BigBound 0x42800000 /* asuint (0x1p6). */ + +VPCS_ATTR +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) { /* 2^n may overflow, break it up into s1*s2. */ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); v_f32_t s2 = v_as_f32_u32 (e - b); v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); v_u32_t r2 = v_as_u32_f32 (s1 * s1); v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); } +#endif + VPCS_ATTR v_f32_t V_NAME(expf) (v_f32_t x) { - v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_f32_t n, r, r2, scale, p, q, poly, z; v_u32_t cmp, e; - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +#if WANT_SIMD_EXCEPT + cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound + >= BigBound - TinyBound); + v_f32_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + specialcase to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_sel_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ #if 1 z = v_fma_f32 (x, InvLn2, Shift); n = z - Shift; r = v_fma_f32 (n, -Ln2hi, x); r = v_fma_f32 (n, -Ln2lo, r); e = v_as_u32_f32 (z) << 23; #else z = x * InvLn2; n = v_round_f32 (z); r = v_fma_f32 (n, -Ln2hi, x); r = v_fma_f32 (n, -Ln2lo, r); e = v_as_u32_s32 (v_round_s32 (z)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); + +#if !WANT_SIMD_EXCEPT + v_f32_t absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); +#endif + r2 = r * r; p = v_fma_f32 (C0, r, C1); q = v_fma_f32 (C2, r, C3); q = v_fma_f32 (p, r2, q); p = C4 * r; poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp); +#else return specialcase (poly, n, e, absn, cmp, scale); +#endif + return v_fma_f32 (poly, scale, scale); } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_expf_1u.c b/contrib/arm-optimized-routines/math/v_expf_1u.c index 023bd248c9ac..8f0ae91c582a 100644 --- 
a/contrib/arm-optimized-routines/math/v_expf_1u.c +++ b/contrib/arm-optimized-routines/math/v_expf_1u.c @@ -1,80 +1,80 @@ /* * Single-precision vector e^x function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.36565 +0.5 ulp. */ 0x1.6a6000p-10f, 0x1.12718ep-7f, 0x1.555af0p-5f, 0x1.555430p-3f, 0x1.fffff4p-2f, }; #define C0 v_f32 (Poly[0]) #define C1 v_f32 (Poly[1]) #define C2 v_f32 (Poly[2]) #define C3 v_f32 (Poly[3]) #define C4 v_f32 (Poly[4]) #define Shift v_f32 (0x1.8p23f) #define InvLn2 v_f32 (0x1.715476p+0f) #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) { /* 2^n may overflow, break it up into s1*s2. */ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); v_f32_t s2 = v_as_f32_u32 (e - b); v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); v_f32_t r1 = s1 * s1; v_f32_t r0 = poly * s1 * s2; return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); } VPCS_ATTR v_f32_t V_NAME(expf_1u) (v_f32_t x) { v_f32_t n, r, scale, poly, absn, z; v_u32_t cmp, e; /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ #if 1 z = v_fma_f32 (x, InvLn2, Shift); n = z - Shift; r = v_fma_f32 (n, -Ln2hi, x); r = v_fma_f32 (n, -Ln2lo, r); e = v_as_u32_f32 (z) << 23; #else z = x * InvLn2; n = v_round_f32 (z); r = v_fma_f32 (n, -Ln2hi, x); r = v_fma_f32 (n, -Ln2lo, r); e = v_as_u32_s32 (v_round_s32 (z)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); poly = v_fma_f32 (C0, r, C1); poly = v_fma_f32 (poly, r, C2); poly = v_fma_f32 (poly, r, C3); poly = v_fma_f32 (poly, r, C4); poly = v_fma_f32 (poly, r, v_f32 (1.0f)); poly = v_fma_f32 (poly, r, v_f32 (1.0f)); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } #endif diff --git a/contrib/arm-optimized-routines/math/v_log.c b/contrib/arm-optimized-routines/math/v_log.c index d84c740d2b6b..47a829119b3c 100644 --- a/contrib/arm-optimized-routines/math/v_log.c +++ b/contrib/arm-optimized-routines/math/v_log.c @@ -1,104 +1,104 @@ /* * Double-precision vector log(x) function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #include "v_log.h" #if V_SUPPORTED /* Worst-case error: 1.17 + 0.5 ulp. */ static const f64_t Poly[] = { /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. 
*/ -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3, }; #define A0 v_f64 (Poly[0]) #define A1 v_f64 (Poly[1]) #define A2 v_f64 (Poly[2]) #define A3 v_f64 (Poly[3]) #define A4 v_f64 (Poly[4]) #define Ln2 v_f64 (0x1.62e42fefa39efp-1) #define N (1 << V_LOG_TABLE_BITS) #define OFF v_u64 (0x3fe6900900000000) struct entry { v_f64_t invc; v_f64_t logc; }; static inline struct entry lookup (v_u64_t i) { struct entry e; #ifdef SCALAR e.invc = __v_log_data[i].invc; e.logc = __v_log_data[i].logc; #else e.invc[0] = __v_log_data[i[0]].invc; e.logc[0] = __v_log_data[i[0]].logc; e.invc[1] = __v_log_data[i[1]].invc; e.logc[1] = __v_log_data[i[1]].logc; #endif return e; } VPCS_ATTR __attribute__ ((noinline)) static v_f64_t specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) { return v_call_f64 (log, x, y, cmp); } VPCS_ATTR v_f64_t V_NAME(log) (v_f64_t x) { v_f64_t z, r, r2, p, y, kd, hi; v_u64_t ix, iz, tmp, top, i, cmp; v_s64_t k; struct entry e; ix = v_as_u64_f64 (x); top = ix >> 48; cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ iz = ix - (tmp & v_u64 (0xfffULL << 52)); z = v_as_f64_u64 (iz); e = lookup (i); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); kd = v_to_f64_s64 (k); /* hi = r + log(c) + k*Ln2. */ hi = v_fma_f64 (kd, Ln2, e.logc + r); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ r2 = r * r; y = v_fma_f64 (A3, r, A2); p = v_fma_f64 (A1, r, A0); y = v_fma_f64 (A4, r2, y); y = v_fma_f64 (y, r2, p); y = v_fma_f64 (y, r2, hi); if (unlikely (v_any_u64 (cmp))) return specialcase (x, y, cmp); return y; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_log.h b/contrib/arm-optimized-routines/math/v_log.h index bcc2fa6fa930..a37bbc2bd6b6 100644 --- a/contrib/arm-optimized-routines/math/v_log.h +++ b/contrib/arm-optimized-routines/math/v_log.h @@ -1,18 +1,18 @@ /* * Declarations for double-precision log(x) vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" #if WANT_VMATH #define V_LOG_TABLE_BITS 7 extern const struct v_log_data { f64_t invc; f64_t logc; } __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; #endif diff --git a/contrib/arm-optimized-routines/math/v_log_data.c b/contrib/arm-optimized-routines/math/v_log_data.c index 97ee5b09c6a9..ec1c8e5e16b2 100644 --- a/contrib/arm-optimized-routines/math/v_log_data.c +++ b/contrib/arm-optimized-routines/math/v_log_data.c @@ -1,158 +1,158 @@ /* * Lookup table for double-precision log(x) vector function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_log.h" #if WANT_VMATH #define N (1 << V_LOG_TABLE_BITS) /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + poly(z/c - 1) where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) and log(c) and 1/c for the ith subinterval comes from a lookup table: tab[i].invc = 1/c tab[i].logc = (double)log(c) where c is near the center of the subinterval and is chosen by trying several floating point invc candidates around 1/center and selecting one for which the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ const struct v_log_data __v_log_data[N] = { {0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, {0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, {0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, {0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, {0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, {0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, {0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, {0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, {0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, {0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, {0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, {0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, {0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, {0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, {0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, {0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, {0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, {0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, {0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, {0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, {0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, {0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, {0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, {0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, {0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, {0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, {0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, {0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, {0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, {0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, {0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, {0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, {0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, {0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, {0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, {0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, {0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, {0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, {0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, {0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, {0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, {0x1.2734abcaa8467p+0, 
-0x1.23d47a721fd47p-3}, {0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, {0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, {0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, {0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, {0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, {0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, {0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, {0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, {0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, {0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, {0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, {0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, {0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, {0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, {0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, {0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, {0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, {0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, {0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, {0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, {0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, {0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, {0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, {0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, {0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, {0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, {0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, {0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, {0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, {0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, {0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, {0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, {0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, {1.0, 0.0}, {0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, {0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, {0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, {0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, {0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, {0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, {0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, {0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, {0x1.de5c72d8a8be3p-1, 
0x1.165bd78d4878ep-4}, {0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, {0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, {0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, {0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, {0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, {0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, {0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, {0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, {0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, {0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, {0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, {0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, {0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, {0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, {0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, {0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, {0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, {0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, {0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, {0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, {0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, {0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, {0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, {0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, {0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, {0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, {0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, {0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, {0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, {0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, {0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, {0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, {0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, {0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, {0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, {0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, {0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, {0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, {0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, {0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, {0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, {0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, {0x1.6c162fc9cbe02p-1, 
0x1.5d1d758e45217p-2}, }; #endif diff --git a/contrib/arm-optimized-routines/math/v_logf.c b/contrib/arm-optimized-routines/math/v_logf.c index 7373192f03fa..93a53758bff7 100644 --- a/contrib/arm-optimized-routines/math/v_logf.c +++ b/contrib/arm-optimized-routines/math/v_logf.c @@ -1,73 +1,73 @@ /* * Single-precision vector log function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* 3.34 ulp error */ -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, }; #define P7 v_f32 (Poly[0]) #define P6 v_f32 (Poly[1]) #define P5 v_f32 (Poly[2]) #define P4 v_f32 (Poly[3]) #define P3 v_f32 (Poly[4]) #define P2 v_f32 (Poly[5]) #define P1 v_f32 (Poly[6]) #define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ #define Min v_u32 (0x00800000) #define Max v_u32 (0x7f800000) #define Mask v_u32 (0x007fffff) #define Off v_u32 (0x3f2aaaab) /* 0.666667 */ VPCS_ATTR __attribute__ ((noinline)) static v_f32_t specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) { /* Fall back to scalar code. */ return v_call_f32 (logf, x, y, cmp); } VPCS_ATTR v_f32_t V_NAME(logf) (v_f32_t x) { v_f32_t n, p, q, r, r2, y; v_u32_t u, cmp; u = v_as_u32_f32 (x); cmp = v_cond_u32 (u - Min >= Max - Min); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ u -= Off; n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ u &= Mask; u += Off; r = v_as_f32_u32 (u) - v_f32 (1.0f); /* y = log(1+r) + n*ln2. */ r2 = r * r; /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
*/ p = v_fma_f32 (P6, r, P5); q = v_fma_f32 (P4, r, P3); y = v_fma_f32 (P2, r, P1); p = v_fma_f32 (P7, r2, p); q = v_fma_f32 (p, r2, q); y = v_fma_f32 (q, r2, y); p = v_fma_f32 (Ln2, n, r); y = v_fma_f32 (y, r2, p); if (unlikely (v_any_u32 (cmp))) return specialcase (x, y, cmp); return y; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_math.h b/contrib/arm-optimized-routines/math/v_math.h index f2cc4670bb9b..3289916187d2 100644 --- a/contrib/arm-optimized-routines/math/v_math.h +++ b/contrib/arm-optimized-routines/math/v_math.h @@ -1,641 +1,661 @@ /* * Vector math abstractions. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _V_MATH_H #define _V_MATH_H #ifndef WANT_VMATH /* Enable the build of vector math code. */ # define WANT_VMATH 1 #endif #if WANT_VMATH /* The goal of this header is to allow vector and scalar build of the same algorithm, the provided intrinsic wrappers are also vector length agnostic so they can be implemented for SVE too (or other simd architectures) and then the code should work on those targets too. */ #if SCALAR #define V_NAME(x) __s_##x #elif VPCS && __aarch64__ #define V_NAME(x) __vn_##x #define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) #else #define V_NAME(x) __v_##x #endif #ifndef VPCS_ATTR #define VPCS_ATTR #endif #ifndef VPCS_ALIAS #define VPCS_ALIAS #endif #include #include "math_config.h" typedef float f32_t; typedef uint32_t u32_t; typedef int32_t s32_t; typedef double f64_t; typedef uint64_t u64_t; typedef int64_t s64_t; /* reinterpret as type1 from type2. 
*/ static inline u32_t as_u32_f32 (f32_t x) { union { f32_t f; u32_t u; } r = {x}; return r.u; } static inline f32_t as_f32_u32 (u32_t x) { union { u32_t u; f32_t f; } r = {x}; return r.f; } static inline s32_t as_s32_u32 (u32_t x) { union { u32_t u; s32_t i; } r = {x}; return r.i; } static inline u32_t as_u32_s32 (s32_t x) { union { s32_t i; u32_t u; } r = {x}; return r.u; } static inline u64_t as_u64_f64 (f64_t x) { union { f64_t f; u64_t u; } r = {x}; return r.u; } static inline f64_t as_f64_u64 (u64_t x) { union { u64_t u; f64_t f; } r = {x}; return r.f; } static inline s64_t as_s64_u64 (u64_t x) { union { u64_t u; s64_t i; } r = {x}; return r.i; } static inline u64_t as_u64_s64 (s64_t x) { union { s64_t i; u64_t u; } r = {x}; return r.u; } #if SCALAR #define V_SUPPORTED 1 typedef f32_t v_f32_t; typedef u32_t v_u32_t; typedef s32_t v_s32_t; typedef f64_t v_f64_t; typedef u64_t v_u64_t; typedef s64_t v_s64_t; static inline int v_lanes32 (void) { return 1; } static inline v_f32_t v_f32 (f32_t x) { return x; } static inline v_u32_t v_u32 (u32_t x) { return x; } static inline v_s32_t v_s32 (s32_t x) { return x; } static inline f32_t v_get_f32 (v_f32_t x, int i) { return x; } static inline u32_t v_get_u32 (v_u32_t x, int i) { return x; } static inline s32_t v_get_s32 (v_s32_t x, int i) { return x; } static inline void v_set_f32 (v_f32_t *x, int i, f32_t v) { *x = v; } static inline void v_set_u32 (v_u32_t *x, int i, u32_t v) { *x = v; } static inline void v_set_s32 (v_s32_t *x, int i, s32_t v) { *x = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u32 (v_u32_t x) { return x != 0; } /* to wrap the result of relational operators. */ static inline v_u32_t v_cond_u32 (v_u32_t x) { return x ? 
-1 : 0; } static inline v_f32_t v_abs_f32 (v_f32_t x) { return __builtin_fabsf (x); } static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return __builtin_fmaf (x, y, z); } static inline v_f32_t v_round_f32 (v_f32_t x) { return __builtin_roundf (x); } static inline v_s32_t v_round_s32 (v_f32_t x) { return __builtin_lroundf (x); /* relies on -fno-math-errno. */ } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return p ? x : y; +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) { return x; } static inline v_f32_t v_to_f32_u32 (v_u32_t x) { return x; } /* reinterpret as type1 from type2. */ static inline v_u32_t v_as_u32_f32 (v_f32_t x) { union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; } static inline v_f32_t v_as_f32_u32 (v_u32_t x) { union { v_u32_t u; v_f32_t f; } r = {x}; return r.f; } static inline v_s32_t v_as_s32_u32 (v_u32_t x) { union { v_u32_t u; v_s32_t i; } r = {x}; return r.i; } static inline v_u32_t v_as_u32_s32 (v_s32_t x) { union { v_s32_t i; v_u32_t u; } r = {x}; return r.u; } static inline v_f32_t v_lookup_f32 (const f32_t *tab, v_u32_t idx) { return tab[idx]; } static inline v_u32_t v_lookup_u32 (const u32_t *tab, v_u32_t idx) { return tab[idx]; } static inline v_f32_t v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) { return f (x); } static inline v_f32_t v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, v_u32_t p) { return f (x1, x2); } static inline int v_lanes64 (void) { return 1; } static inline v_f64_t v_f64 (f64_t x) { return x; } static inline v_u64_t v_u64 (u64_t x) { return x; } static inline v_s64_t v_s64 (s64_t x) { return x; } static inline f64_t v_get_f64 (v_f64_t x, int i) { return x; } static inline void v_set_f64 (v_f64_t *x, int i, f64_t v) { *x = v; } /* true if any elements of a v_cond result is non-zero. 
*/ static inline int v_any_u64 (v_u64_t x) { return x != 0; } /* to wrap the result of relational operators. */ static inline v_u64_t v_cond_u64 (v_u64_t x) { return x ? -1 : 0; } static inline v_f64_t v_abs_f64 (v_f64_t x) { return __builtin_fabs (x); } static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) { return __builtin_fma (x, y, z); } static inline v_f64_t v_round_f64 (v_f64_t x) { return __builtin_round (x); } static inline v_s64_t v_round_s64 (v_f64_t x) { return __builtin_lround (x); /* relies on -fno-math-errno. */ } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return p ? x : y; +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) { return x; } static inline v_f64_t v_to_f64_u64 (v_u64_t x) { return x; } /* reinterpret as type1 from type2. */ static inline v_u64_t v_as_u64_f64 (v_f64_t x) { union { v_f64_t f; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_as_f64_u64 (v_u64_t x) { union { v_u64_t u; v_f64_t f; } r = {x}; return r.f; } static inline v_s64_t v_as_s64_u64 (v_u64_t x) { union { v_u64_t u; v_s64_t i; } r = {x}; return r.i; } static inline v_u64_t v_as_u64_s64 (v_s64_t x) { union { v_s64_t i; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_lookup_f64 (const f64_t *tab, v_u64_t idx) { return tab[idx]; } static inline v_u64_t v_lookup_u64 (const u64_t *tab, v_u64_t idx) { return tab[idx]; } static inline v_f64_t v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) { return f (x); } #elif __aarch64__ #define V_SUPPORTED 1 #include typedef float32x4_t v_f32_t; typedef uint32x4_t v_u32_t; typedef int32x4_t v_s32_t; typedef float64x2_t v_f64_t; typedef uint64x2_t v_u64_t; typedef int64x2_t v_s64_t; static inline int v_lanes32 (void) { return 4; } static inline v_f32_t v_f32 (f32_t x) { return (v_f32_t){x, x, x, x}; } static inline v_u32_t v_u32 (u32_t x) { return (v_u32_t){x, x, x, x}; } static inline v_s32_t v_s32 (s32_t x) { return 
(v_s32_t){x, x, x, x}; } static inline f32_t v_get_f32 (v_f32_t x, int i) { return x[i]; } static inline u32_t v_get_u32 (v_u32_t x, int i) { return x[i]; } static inline s32_t v_get_s32 (v_s32_t x, int i) { return x[i]; } static inline void v_set_f32 (v_f32_t *x, int i, f32_t v) { (*x)[i] = v; } static inline void v_set_u32 (v_u32_t *x, int i, u32_t v) { (*x)[i] = v; } static inline void v_set_s32 (v_s32_t *x, int i, s32_t v) { (*x)[i] = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u32 (v_u32_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; } /* to wrap the result of relational operators. */ static inline v_u32_t v_cond_u32 (v_u32_t x) { return x; } static inline v_f32_t v_abs_f32 (v_f32_t x) { return vabsq_f32 (x); } static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return vfmaq_f32 (z, x, y); } static inline v_f32_t v_round_f32 (v_f32_t x) { return vrndaq_f32 (x); } static inline v_s32_t v_round_s32 (v_f32_t x) { return vcvtaq_s32_f32 (x); } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return vbslq_f32 (p, x, y); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) { return (v_f32_t){x[0], x[1], x[2], x[3]}; } static inline v_f32_t v_to_f32_u32 (v_u32_t x) { return (v_f32_t){x[0], x[1], x[2], x[3]}; } /* reinterpret as type1 from type2. 
*/ static inline v_u32_t v_as_u32_f32 (v_f32_t x) { union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; } static inline v_f32_t v_as_f32_u32 (v_u32_t x) { union { v_u32_t u; v_f32_t f; } r = {x}; return r.f; } static inline v_s32_t v_as_s32_u32 (v_u32_t x) { union { v_u32_t u; v_s32_t i; } r = {x}; return r.i; } static inline v_u32_t v_as_u32_s32 (v_s32_t x) { union { v_s32_t i; v_u32_t u; } r = {x}; return r.u; } static inline v_f32_t v_lookup_f32 (const f32_t *tab, v_u32_t idx) { return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; } static inline v_u32_t v_lookup_u32 (const u32_t *tab, v_u32_t idx) { return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; } static inline v_f32_t v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) { return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; } static inline v_f32_t v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, v_u32_t p) { return ( v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; } static inline int v_lanes64 (void) { return 2; } static inline v_f64_t v_f64 (f64_t x) { return (v_f64_t){x, x}; } static inline v_u64_t v_u64 (u64_t x) { return (v_u64_t){x, x}; } static inline v_s64_t v_s64 (s64_t x) { return (v_s64_t){x, x}; } static inline f64_t v_get_f64 (v_f64_t x, int i) { return x[i]; } static inline void v_set_f64 (v_f64_t *x, int i, f64_t v) { (*x)[i] = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (x) != 0; } /* to wrap the result of relational operators. 
*/ static inline v_u64_t v_cond_u64 (v_u64_t x) { return x; } static inline v_f64_t v_abs_f64 (v_f64_t x) { return vabsq_f64 (x); } static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) { return vfmaq_f64 (z, x, y); } static inline v_f64_t v_round_f64 (v_f64_t x) { return vrndaq_f64 (x); } static inline v_s64_t v_round_s64 (v_f64_t x) { return vcvtaq_s64_f64 (x); } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return vbslq_f64 (p, x, y); +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) { return (v_f64_t){x[0], x[1]}; } static inline v_f64_t v_to_f64_u64 (v_u64_t x) { return (v_f64_t){x[0], x[1]}; } /* reinterpret as type1 from type2. */ static inline v_u64_t v_as_u64_f64 (v_f64_t x) { union { v_f64_t f; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_as_f64_u64 (v_u64_t x) { union { v_u64_t u; v_f64_t f; } r = {x}; return r.f; } static inline v_s64_t v_as_s64_u64 (v_u64_t x) { union { v_u64_t u; v_s64_t i; } r = {x}; return r.i; } static inline v_u64_t v_as_u64_s64 (v_s64_t x) { union { v_s64_t i; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_lookup_f64 (const f64_t *tab, v_u64_t idx) { return (v_f64_t){tab[idx[0]], tab[idx[1]]}; } static inline v_u64_t v_lookup_u64 (const u64_t *tab, v_u64_t idx) { return (v_u64_t){tab[idx[0]], tab[idx[1]]}; } static inline v_f64_t v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) { return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; } #endif #endif #endif diff --git a/contrib/arm-optimized-routines/math/v_pow.c b/contrib/arm-optimized-routines/math/v_pow.c index a209d57f41ce..05a83aaa8c0a 100644 --- a/contrib/arm-optimized-routines/math/v_pow.c +++ b/contrib/arm-optimized-routines/math/v_pow.c @@ -1,27 +1,27 @@ /* * Double-precision vector pow function. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED VPCS_ATTR v_f64_t V_NAME(pow) (v_f64_t x, v_f64_t y) { v_f64_t z; for (int lane = 0; lane < v_lanes64 (); lane++) { f64_t sx = v_get_f64 (x, lane); f64_t sy = v_get_f64 (y, lane); f64_t sz = pow (sx, sy); v_set_f64 (&z, lane, sz); } return z; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_powf.c b/contrib/arm-optimized-routines/math/v_powf.c index fb80fa6f1846..ad8ab8d4f00d 100644 --- a/contrib/arm-optimized-routines/math/v_powf.c +++ b/contrib/arm-optimized-routines/math/v_powf.c @@ -1,235 +1,235 @@ /* * Single-precision vector powf function. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED #define Min v_u32 (0x00800000) #define Max v_u32 (0x7f800000) #define SBITS 5 #define Tlog v__powf_log2_data.tab #define Texp v__exp2f_data.tab #define A v__powf_log2_data.poly #define C v__exp2f_data.poly #define LOGDEG 4 #if LOGDEG == 5 /* 1.01 ulp */ #define OFF v_u32 (0x3f330000) #define TBITS 4 #elif LOGDEG == 4 /* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ #define OFF v_u32 (0x3f35d000) #define TBITS 5 #endif #define V_EXP2F_TABLE_BITS SBITS #define V_EXP2F_POLY_ORDER 3 struct v_exp2f_data { uint64_t tab[1 << V_EXP2F_TABLE_BITS]; double poly[V_EXP2F_POLY_ORDER]; }; #define V_POWF_LOG2_TABLE_BITS TBITS #define V_POWF_LOG2_POLY_ORDER LOGDEG #define SCALE ((double) (1 << SBITS)) struct v_powf_log2_data { struct { double invc, logc; } tab[1 << V_POWF_LOG2_TABLE_BITS]; double poly[V_POWF_LOG2_POLY_ORDER]; }; static const struct v_powf_log2_data v__powf_log2_data = { #if LOGDEG == 5 .tab = { { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, { 0x1.49539f0f010bp+0, 
-0x1.7418b0a1fb77bp-2 * SCALE }, { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, { 0x1p+0, 0x0p+0 * SCALE }, { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, }, /* rel err: 1.46 * 2^-32 */ .poly = { 0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, 0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, 0x1.71547652ab82bp0 * SCALE, } #elif LOGDEG == 4 .tab = { {0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, {0x1p+0, 0x0p+0 * SCALE}, {0x1.f1d006c855e86p-1, 
0x1.4c1cc07312997p-5 * SCALE}, {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE}, {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, }, /* rel err: 1.5 * 2^-30 */ .poly = { -0x1.6ff5daa3b3d7cp-2 * SCALE, 0x1.ec81d03c01aebp-2 * SCALE, -0x1.71547bb43f101p-1 * SCALE, 0x1.7154764a815cbp0 * SCALE, } #endif }; static const struct v_exp2f_data v__exp2f_data = { .tab = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, }, /* rel err: 1.69 * 2^-34 */ .poly = { 0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE }, }; VPCS_ATTR __attribute__ ((noinline)) static v_f32_t specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) { return v_call2_f32 (powf, x, y, ret, cmp); } VPCS_ATTR v_f32_t V_NAME(powf) (v_f32_t x, v_f32_t y) { v_u32_t u, tmp, cmp, i, top, iz; v_s32_t k; v_f32_t ret; u = v_as_u32_f32 (x); cmp = 
v_cond_u32 (u - Min >= Max - Min); tmp = u - OFF; i = (tmp >> (23 - TBITS)) % (1 << TBITS); top = tmp & 0xff800000; iz = u - top; k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ for (int lane = 0; lane < v_lanes32 (); lane++) { uint32_t si, siz; int32_t sk; float sy; /* Use double precision for each lane. */ double invc, logc, z, r, p, y0, logx, ylogx, kd, s; uint64_t ki, t; si = v_get_u32 (i, lane); siz = v_get_u32 (iz, lane); sk = v_get_s32 (k, lane); sy = v_get_f32 (y, lane); invc = Tlog[si].invc; logc = Tlog[si].logc; z = (double) as_f32_u32 (siz); /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ r = __builtin_fma (z, invc, -1.0); y0 = logc + (double) sk; /* Polynomial to approximate log1p(r)/ln2. */ #if LOGDEG == 5 logx = A[0]; logx = r * logx + A[1]; logx = r * logx + A[2]; logx = r * logx + A[3]; logx = r * logx + A[4]; logx = r * logx + y0; #elif LOGDEG == 4 logx = A[0]; logx = r * logx + A[1]; logx = r * logx + A[2]; logx = r * logx + A[3]; logx = r * logx + y0; #endif ylogx = sy * logx; v_set_u32 (&cmp, lane, (as_u64_f64 (ylogx) >> 47 & 0xffff) >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 ? 
1 : v_get_u32 (cmp, lane)); /* N*x = k + r with r in [-1/2, 1/2] */ #if TOINT_INTRINSICS kd = roundtoint (ylogx); /* k */ ki = converttoint (ylogx); #else # define SHIFT 0x1.8p52 kd = eval_as_double (ylogx + SHIFT); ki = asuint64 (kd); kd -= SHIFT; #endif r = ylogx - kd; /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ t = Texp[ki % (1 << SBITS)]; t += ki << (52 - SBITS); s = as_f64_u64 (t); p = C[0]; p = __builtin_fma (p, r, C[1]); p = __builtin_fma (p, r, C[2]); p = __builtin_fma (p, s * r, s); v_set_f32 (&ret, lane, p); } if (unlikely (v_any_u32 (cmp))) return specialcase (x, y, ret, cmp); return ret; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_sin.c b/contrib/arm-optimized-routines/math/v_sin.c index 2b9ed059189c..9dbb9dec04de 100644 --- a/contrib/arm-optimized-routines/math/v_sin.c +++ b/contrib/arm-optimized-routines/math/v_sin.c @@ -1,86 +1,103 @@ /* * Double-precision vector sin function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const double Poly[] = { /* worst-case error is 3.5 ulp. abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. 
*/ -0x1.9f4a9c8b21dc9p-41, 0x1.60e88a10163f2p-33, -0x1.ae6361b7254e7p-26, 0x1.71de382e8d62bp-19, -0x1.a01a019aeb4ffp-13, 0x1.111111110b25ep-7, -0x1.55555555554c3p-3, }; #define C7 v_f64 (Poly[0]) #define C6 v_f64 (Poly[1]) #define C5 v_f64 (Poly[2]) #define C4 v_f64 (Poly[3]) #define C3 v_f64 (Poly[4]) #define C2 v_f64 (Poly[5]) #define C1 v_f64 (Poly[6]) #define InvPi v_f64 (0x1.45f306dc9c883p-2) #define Pi1 v_f64 (0x1.921fb54442d18p+1) #define Pi2 v_f64 (0x1.1a62633145c06p-53) #define Pi3 v_f64 (0x1.c1cd129024e09p-106) #define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) #define AbsMask v_u64 (0x7fffffffffffffff) +#if WANT_SIMD_EXCEPT +#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */ +#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */ +#else +#define RangeVal v_f64 (0x1p23) +#endif + VPCS_ATTR __attribute__ ((noinline)) static v_f64_t specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) { return v_call_f64 (sin, x, y, cmp); } VPCS_ATTR v_f64_t V_NAME(sin) (v_f64_t x) { v_f64_t n, r, r2, y; - v_u64_t sign, odd, cmp; + v_u64_t sign, odd, cmp, ir; - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); + ir = v_as_u64_f64 (x) & AbsMask; + r = v_as_f64_u64 (ir); sign = v_as_u64_f64 (x) & ~AbsMask; - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by specialcase later. */ + cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh); + if (unlikely (v_any_u64 (cmp))) + r = v_sel_f64 (cmp, v_f64 (1), r); +#else + cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal)); +#endif /* n = rint(|x|/pi). */ n = v_fma_f64 (InvPi, r, Shift); odd = v_as_u64_f64 (n) << 63; n -= Shift; /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ r = v_fma_f64 (-Pi1, n, r); r = v_fma_f64 (-Pi2, n, r); r = v_fma_f64 (-Pi3, n, r); /* sin(r) poly approx. */ r2 = r * r; y = v_fma_f64 (C7, r2, C6); y = v_fma_f64 (y, r2, C5); y = v_fma_f64 (y, r2, C4); y = v_fma_f64 (y, r2, C3); y = v_fma_f64 (y, r2, C2); y = v_fma_f64 (y, r2, C1); y = v_fma_f64 (y * r2, r, r); /* sign. */ y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); if (unlikely (v_any_u64 (cmp))) return specialcase (x, y, cmp); return y; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/v_sinf.c b/contrib/arm-optimized-routines/math/v_sinf.c index e66bfce6d8aa..ce35dacc65cf 100644 --- a/contrib/arm-optimized-routines/math/v_sinf.c +++ b/contrib/arm-optimized-routines/math/v_sinf.c @@ -1,75 +1,88 @@ /* * Single-precision vector sin function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" #if V_SUPPORTED static const float Poly[] = { /* 1.886 ulp error */ 0x1.5b2e76p-19f, -0x1.9f42eap-13f, 0x1.110df4p-7f, -0x1.555548p-3f, }; #define Pi1 v_f32 (0x1.921fb6p+1f) #define Pi2 v_f32 (-0x1.777a5cp-24f) #define Pi3 v_f32 (-0x1.ee59dap-49f) #define A3 v_f32 (Poly[3]) #define A5 v_f32 (Poly[2]) #define A7 v_f32 (Poly[1]) #define A9 v_f32 (Poly[0]) #define RangeVal v_f32 (0x1p20f) +#define TinyBound v_f32 (0x1p-61f) #define InvPi v_f32 (0x1.45f306p-2f) #define Shift v_f32 (0x1.8p+23f) #define AbsMask v_u32 (0x7fffffff) VPCS_ATTR static v_f32_t specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) { /* Fall back to scalar code. 
*/ return v_call_f32 (sinf, x, y, cmp); } VPCS_ATTR v_f32_t V_NAME(sinf) (v_f32_t x) { v_f32_t n, r, r2, y; - v_u32_t sign, odd, cmp; + v_u32_t sign, odd, cmp, ir; - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + ir = v_as_u32_f32 (x) & AbsMask; + r = v_as_f32_u32 (ir); sign = v_as_u32_f32 (x) & ~AbsMask; - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + +#if WANT_SIMD_EXCEPT + cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound) + >= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound))); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + specialcase later. */ + r = v_sel_f32 (cmp, v_f32 (1), r); +#else + cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal)); +#endif /* n = rint(|x|/pi) */ n = v_fma_f32 (InvPi, r, Shift); odd = v_as_u32_f32 (n) << 31; n -= Shift; /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ r = v_fma_f32 (-Pi1, n, r); r = v_fma_f32 (-Pi2, n, r); r = v_fma_f32 (-Pi3, n, r); /* y = sin(r) */ r2 = r * r; y = v_fma_f32 (A9, r2, A7); y = v_fma_f32 (y, r2, A5); y = v_fma_f32 (y, r2, A3); y = v_fma_f32 (y * r2, r, r); /* sign fix */ y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); if (unlikely (v_any_u32 (cmp))) return specialcase (x, y, cmp); return y; } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/math/vn_cos.c b/contrib/arm-optimized-routines/math/vn_cos.c index b57a549eba68..4b5b23718a8b 100644 --- a/contrib/arm-optimized-routines/math/vn_cos.c +++ b/contrib/arm-optimized-routines/math/vn_cos.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_cos. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) #include "v_cos.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_cosf.c b/contrib/arm-optimized-routines/math/vn_cosf.c index 6321d4620fa7..86dd26ecb3e7 100644 --- a/contrib/arm-optimized-routines/math/vn_cosf.c +++ b/contrib/arm-optimized-routines/math/vn_cosf.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_cosf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) #include "v_cosf.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_exp.c b/contrib/arm-optimized-routines/math/vn_exp.c index 06e269d41766..0d85b17de05a 100644 --- a/contrib/arm-optimized-routines/math/vn_exp.c +++ b/contrib/arm-optimized-routines/math/vn_exp.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_exp. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) #include "v_exp.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_exp2f.c b/contrib/arm-optimized-routines/math/vn_exp2f.c index db9707e86f16..da3bb40ae93f 100644 --- a/contrib/arm-optimized-routines/math/vn_exp2f.c +++ b/contrib/arm-optimized-routines/math/vn_exp2f.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_exp2f. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) #include "v_exp2f.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c b/contrib/arm-optimized-routines/math/vn_exp2f_1u.c index 17bd0abd7a60..3e3a24705614 100644 --- a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c +++ b/contrib/arm-optimized-routines/math/vn_exp2f_1u.c @@ -1,11 +1,11 @@ /* * AdvSIMD vector PCS variant of __v_exp2f_1u. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #include "v_exp2f_1u.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_expf.c b/contrib/arm-optimized-routines/math/vn_expf.c index 0652907225d9..6e91a940bbf4 100644 --- a/contrib/arm-optimized-routines/math/vn_expf.c +++ b/contrib/arm-optimized-routines/math/vn_expf.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_expf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) #include "v_expf.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_expf_1u.c b/contrib/arm-optimized-routines/math/vn_expf_1u.c index 3be776814822..57ae6a315b9b 100644 --- a/contrib/arm-optimized-routines/math/vn_expf_1u.c +++ b/contrib/arm-optimized-routines/math/vn_expf_1u.c @@ -1,11 +1,11 @@ /* * AdvSIMD vector PCS variant of __v_expf_1u. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #include "v_expf_1u.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_log.c b/contrib/arm-optimized-routines/math/vn_log.c index b58fe8ff820a..902bff1fcd4e 100644 --- a/contrib/arm-optimized-routines/math/vn_log.c +++ b/contrib/arm-optimized-routines/math/vn_log.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_log. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) #include "v_log.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_logf.c b/contrib/arm-optimized-routines/math/vn_logf.c index cc5b8ae3ed55..07e493685b4d 100644 --- a/contrib/arm-optimized-routines/math/vn_logf.c +++ b/contrib/arm-optimized-routines/math/vn_logf.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_logf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) #include "v_logf.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_pow.c b/contrib/arm-optimized-routines/math/vn_pow.c index 260950113b04..1a980ff6bf2f 100644 --- a/contrib/arm-optimized-routines/math/vn_pow.c +++ b/contrib/arm-optimized-routines/math/vn_pow.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_pow. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) #include "v_pow.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_powf.c b/contrib/arm-optimized-routines/math/vn_powf.c index 095d07e337ad..a42ade371adc 100644 --- a/contrib/arm-optimized-routines/math/vn_powf.c +++ b/contrib/arm-optimized-routines/math/vn_powf.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_powf. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) #include "v_powf.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_sin.c b/contrib/arm-optimized-routines/math/vn_sin.c index 905c79623350..64b05c8ca0eb 100644 --- a/contrib/arm-optimized-routines/math/vn_sin.c +++ b/contrib/arm-optimized-routines/math/vn_sin.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_sin. * * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) #include "v_sin.c" #endif diff --git a/contrib/arm-optimized-routines/math/vn_sinf.c b/contrib/arm-optimized-routines/math/vn_sinf.c index 1214e1a55638..6e880c60dc39 100644 --- a/contrib/arm-optimized-routines/math/vn_sinf.c +++ b/contrib/arm-optimized-routines/math/vn_sinf.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_sinf. * * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) #include "v_sinf.c" #endif diff --git a/contrib/arm-optimized-routines/networking/Dir.mk b/contrib/arm-optimized-routines/networking/Dir.mk index b49610341171..2589e0a1f91c 100644 --- a/contrib/arm-optimized-routines/networking/Dir.mk +++ b/contrib/arm-optimized-routines/networking/Dir.mk @@ -1,76 +1,76 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/networking B := build/networking ifeq ($(ARCH),) all-networking check-networking install-networking clean-networking: @echo "*** Please set ARCH in config.mk. ***" @exit 1 else networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS]) networking-test-srcs := $(wildcard $(S)/test/*.c) networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) networking-libs := \ build/lib/libnetworking.so \ build/lib/libnetworking.a \ networking-tools := \ build/bin/test/chksum networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs))) networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs))) networking-objs := \ $(networking-lib-objs) \ $(networking-lib-objs:%.o=%.os) \ $(networking-test-objs) \ networking-files := \ $(networking-objs) \ $(networking-libs) \ $(networking-tools) \ $(networking-includes) \ all-networking: $(networking-libs) $(networking-tools) $(networking-includes) $(networking-objs): $(networking-includes) $(networking-objs): CFLAGS_ALL += $(networking-cflags) build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os) $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ build/lib/libnetworkinglib.a: $(networking-lib-objs) rm -f $@ $(AR) rc $@ $^ $(RANLIB) $@ 
build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/include/%.h: $(S)/include/%.h cp $< $@ build/bin/%.sh: $(S)/test/%.sh cp $< $@ check-networking: $(networking-tools) $(EMULATOR) build/bin/test/chksum -i simple $(EMULATOR) build/bin/test/chksum -i scalar $(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available install-networking: \ $(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ $(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%) clean-networking: rm -f $(networking-files) endif .PHONY: all-networking check-networking install-networking clean-networking diff --git a/contrib/arm-optimized-routines/networking/aarch64/chksum_simd.c b/contrib/arm-optimized-routines/networking/aarch64/chksum_simd.c index 6d5be58b1f32..90c00eb7cabe 100644 --- a/contrib/arm-optimized-routines/networking/aarch64/chksum_simd.c +++ b/contrib/arm-optimized-routines/networking/aarch64/chksum_simd.c @@ -1,146 +1,146 @@ /* * AArch64-specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" #include "../chksum_common.h" #ifndef __ARM_NEON #pragma GCC target("+simd") #endif #include always_inline static inline uint64_t slurp_head64(const void **pptr, uint32_t *nbytes) { Assert(*nbytes >= 8); uint64_t sum = 0; uint32_t off = (uintptr_t) *pptr % 8; if (likely(off != 0)) { /* Get rid of bytes 0..off-1 */ const unsigned char *ptr64 = align_ptr(*pptr, 8); uint64_t mask = ALL_ONES << (CHAR_BIT * off); uint64_t val = load64(ptr64) & mask; /* Fold 64-bit sum to 33 bits */ sum = val >> 32; sum += (uint32_t) val; *pptr = ptr64 + 8; *nbytes -= 8 - off; } return sum; } always_inline static inline uint64_t slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes) { Assert(nbytes < 8); if (likely(nbytes != 0)) { /* Get rid of bytes 7..nbytes */ uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes)); Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes); uint64_t val = load64(ptr) & mask; sum += val >> 32; sum += (uint32_t) val; nbytes = 0; } Assert(nbytes == 0); return sum; } unsigned short __chksum_aarch64_simd(const void *ptr, unsigned int nbytes) { bool swap = (uintptr_t) ptr & 1; uint64_t sum; if (unlikely(nbytes < 50)) { sum = slurp_small(ptr, nbytes); swap = false; goto fold; } /* 8-byte align pointer */ Assert(nbytes >= 8); sum = slurp_head64(&ptr, &nbytes); Assert(((uintptr_t) ptr & 7) == 0); const uint32_t *may_alias ptr32 = ptr; uint64x2_t vsum0 = { 0, 0 }; uint64x2_t vsum1 = { 0, 0 }; uint64x2_t vsum2 = { 0, 0 }; uint64x2_t vsum3 = { 0, 0 }; /* Sum groups of 64 bytes */ for (uint32_t i = 0; i < nbytes / 64; i++) { uint32x4_t vtmp0 = vld1q_u32(ptr32); uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4); uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8); uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12); vsum0 = vpadalq_u32(vsum0, vtmp0); vsum1 = vpadalq_u32(vsum1, vtmp1); vsum2 = vpadalq_u32(vsum2, vtmp2); vsum3 = vpadalq_u32(vsum3, vtmp3); ptr32 += 16; } 
nbytes %= 64; /* Fold vsum2 and vsum3 into vsum0 and vsum1 */ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2)); vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3)); /* Add any trailing group of 32 bytes */ if (nbytes & 32) { uint32x4_t vtmp0 = vld1q_u32(ptr32); uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4); vsum0 = vpadalq_u32(vsum0, vtmp0); vsum1 = vpadalq_u32(vsum1, vtmp1); ptr32 += 8; nbytes -= 32; } Assert(nbytes < 32); /* Fold vsum1 into vsum0 */ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1)); /* Add any trailing group of 16 bytes */ if (nbytes & 16) { uint32x4_t vtmp = vld1q_u32(ptr32); vsum0 = vpadalq_u32(vsum0, vtmp); ptr32 += 4; nbytes -= 16; } Assert(nbytes < 16); /* Add any trailing group of 8 bytes */ if (nbytes & 8) { uint32x2_t vtmp = vld1_u32(ptr32); vsum0 = vaddw_u32(vsum0, vtmp); ptr32 += 2; nbytes -= 8; } Assert(nbytes < 8); uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0)); sum += val >> 32; sum += (uint32_t) val; /* Handle any trailing 0..7 bytes */ sum = slurp_tail64(sum, ptr32, nbytes); fold: return fold_and_swap(sum, swap); } diff --git a/contrib/arm-optimized-routines/networking/arm/chksum_simd.c b/contrib/arm-optimized-routines/networking/arm/chksum_simd.c index 7f69adfc963c..ae08fe5dd056 100644 --- a/contrib/arm-optimized-routines/networking/arm/chksum_simd.c +++ b/contrib/arm-optimized-routines/networking/arm/chksum_simd.c @@ -1,149 +1,149 @@ /* * Armv7-A specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" #include "../chksum_common.h" #ifndef __ARM_NEON #pragma GCC target("+simd") #endif #include unsigned short __chksum_arm_simd(const void *ptr, unsigned int nbytes) { bool swap = (uintptr_t) ptr & 1; uint64x1_t vsum = { 0 }; if (unlikely(nbytes < 40)) { uint64_t sum = slurp_small(ptr, nbytes); return fold_and_swap(sum, false); } /* 8-byte align pointer */ /* Inline slurp_head-like code since we use NEON here */ Assert(nbytes >= 8); uint32_t off = (uintptr_t) ptr & 7; if (likely(off != 0)) { const uint64_t *may_alias ptr64 = align_ptr(ptr, 8); uint64x1_t vword64 = vld1_u64(ptr64); /* Get rid of bytes 0..off-1 */ uint64x1_t vmask = vdup_n_u64(ALL_ONES); int64x1_t vshiftl = vdup_n_s64(CHAR_BIT * off); vmask = vshl_u64(vmask, vshiftl); vword64 = vand_u64(vword64, vmask); uint32x2_t vtmp = vreinterpret_u32_u64(vword64); /* Set accumulator */ vsum = vpaddl_u32(vtmp); /* Update pointer and remaining size */ ptr = (char *) ptr64 + 8; nbytes -= 8 - off; } Assert(((uintptr_t) ptr & 7) == 0); /* Sum groups of 64 bytes */ uint64x2_t vsum0 = { 0, 0 }; uint64x2_t vsum1 = { 0, 0 }; uint64x2_t vsum2 = { 0, 0 }; uint64x2_t vsum3 = { 0, 0 }; const uint32_t *may_alias ptr32 = ptr; for (uint32_t i = 0; i < nbytes / 64; i++) { uint32x4_t vtmp0 = vld1q_u32(ptr32); uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4); uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8); uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12); vsum0 = vpadalq_u32(vsum0, vtmp0); vsum1 = vpadalq_u32(vsum1, vtmp1); vsum2 = vpadalq_u32(vsum2, vtmp2); vsum3 = vpadalq_u32(vsum3, vtmp3); ptr32 += 16; } nbytes %= 64; /* Fold vsum1/vsum2/vsum3 into vsum0 */ vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2)); vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3)); vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1)); /* Add any trailing 16-byte groups */ while (likely(nbytes >= 16)) { uint32x4_t vtmp0 = 
vld1q_u32(ptr32); vsum0 = vpadalq_u32(vsum0, vtmp0); ptr32 += 4; nbytes -= 16; } Assert(nbytes < 16); /* Fold vsum0 into vsum */ { /* 4xu32 (4x32b) -> 2xu64 (2x33b) */ vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0)); /* 4xu32 (2x(1b+32b)) -> 2xu64 (2x(0b+32b)) */ vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0)); /* 4xu32 (4x32b) -> 2xu64 (2x33b) */ Assert((vgetq_lane_u64(vsum0, 0) >> 32) == 0); Assert((vgetq_lane_u64(vsum0, 1) >> 32) == 0); uint32x2_t vtmp = vmovn_u64(vsum0); /* Add to accumulator */ vsum = vpadal_u32(vsum, vtmp); } /* Add any trailing group of 8 bytes */ if (nbytes & 8) { uint32x2_t vtmp = vld1_u32(ptr32); /* Add to accumulator */ vsum = vpadal_u32(vsum, vtmp); ptr32 += 2; nbytes -= 8; } Assert(nbytes < 8); /* Handle any trailing 1..7 bytes */ if (likely(nbytes != 0)) { Assert(((uintptr_t) ptr32 & 7) == 0); Assert(nbytes < 8); uint64x1_t vword64 = vld1_u64((const uint64_t *) ptr32); /* Get rid of bytes 7..nbytes */ uint64x1_t vmask = vdup_n_u64(ALL_ONES); int64x1_t vshiftr = vdup_n_s64(-CHAR_BIT * (8 - nbytes)); vmask = vshl_u64(vmask, vshiftr);/* Shift right */ vword64 = vand_u64(vword64, vmask); /* Fold 64-bit sum to 33 bits */ vword64 = vpaddl_u32(vreinterpret_u32_u64(vword64)); /* Add to accumulator */ vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64)); } /* Fold 64-bit vsum to 32 bits */ vsum = vpaddl_u32(vreinterpret_u32_u64(vsum)); vsum = vpaddl_u32(vreinterpret_u32_u64(vsum)); Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0); /* Fold 32-bit vsum to 16 bits */ uint32x2_t vsum32 = vreinterpret_u32_u64(vsum); vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32)); vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32)); Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 1) == 0); Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 2) == 0); Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 3) == 0); /* Convert to 16-bit scalar */ uint16_t sum = vget_lane_u16(vreinterpret_u16_u32(vsum32), 0); if (unlikely(swap))/* Odd base pointer is 
unexpected */ { sum = bswap16(sum); } return sum; } diff --git a/contrib/arm-optimized-routines/networking/chksum.c b/contrib/arm-optimized-routines/networking/chksum.c index 95ce5baa94e4..329482ffdcee 100644 --- a/contrib/arm-optimized-routines/networking/chksum.c +++ b/contrib/arm-optimized-routines/networking/chksum.c @@ -1,81 +1,81 @@ /* * Compute 16-bit sum in ones' complement arithmetic (with end-around carry). * This sum is often used as a simple checksum in networking. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" #include "chksum_common.h" always_inline static inline uint32_t slurp_head32(const void **pptr, uint32_t *nbytes) { uint32_t sum = 0; Assert(*nbytes >= 4); uint32_t off = (uintptr_t) *pptr % 4; if (likely(off != 0)) { /* Get rid of bytes 0..off-1 */ const unsigned char *ptr32 = align_ptr(*pptr, 4); uint32_t mask = ~0U << (CHAR_BIT * off); sum = load32(ptr32) & mask; *pptr = ptr32 + 4; *nbytes -= 4 - off; } return sum; } /* Additional loop unrolling would help when not auto-vectorizing */ unsigned short __chksum(const void *ptr, unsigned int nbytes) { bool swap = false; uint64_t sum = 0; if (nbytes > 300) { /* 4-byte align pointer */ swap = (uintptr_t) ptr & 1; sum = slurp_head32(&ptr, &nbytes); } /* Else benefit of aligning not worth the overhead */ /* Sum all 16-byte chunks */ const char *cptr = ptr; for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--) { uint64_t h0 = load32(cptr + 0); uint64_t h1 = load32(cptr + 4); uint64_t h2 = load32(cptr + 8); uint64_t h3 = load32(cptr + 12); sum += h0 + h1 + h2 + h3; cptr += 16; } nbytes %= 16; Assert(nbytes < 16); /* Handle any trailing 4-byte chunks */ while (nbytes >= 4) { sum += load32(cptr); cptr += 4; nbytes -= 4; } Assert(nbytes < 4); if (nbytes & 2) { sum += load16(cptr); cptr += 2; } if (nbytes & 1) { sum += *(uint8_t *)cptr; } return fold_and_swap(sum, swap); } diff --git 
a/contrib/arm-optimized-routines/networking/chksum_common.h b/contrib/arm-optimized-routines/networking/chksum_common.h index 958c8cc0742e..16f0f6c11df7 100644 --- a/contrib/arm-optimized-routines/networking/chksum_common.h +++ b/contrib/arm-optimized-routines/networking/chksum_common.h @@ -1,132 +1,132 @@ /* * Common code for checksum implementations * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef CHKSUM_COMMON_H #define CHKSUM_COMMON_H #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ #error Only little endian supported #endif #include #include #include #include /* Assertions must be explicitly enabled */ #if WANT_ASSERT #undef NDEBUG #include #define Assert(exp) assert(exp) #else #define Assert(exp) (void) (exp) #endif #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define may_alias __attribute__((__may_alias__)) #define always_inline __attribute__((always_inline)) #ifdef __clang__ #define no_unroll_loops #else #define no_unroll_loops __attribute__((optimize("no-unroll-loops"))) #endif #define bswap16(x) __builtin_bswap16((x)) #else #define likely(x) (x) #define unlikely(x) (x) #define may_alias #define always_inline #define no_unroll_loops #define bswap16(x) ((uint8_t)((x) >> 8) | ((uint8_t)(x) << 8)) #endif #define ALL_ONES ~UINT64_C(0) static inline uint64_t load64(const void *ptr) { /* GCC will optimise this to a normal load instruction */ uint64_t v; memcpy(&v, ptr, sizeof v); return v; } static inline uint32_t load32(const void *ptr) { /* GCC will optimise this to a normal load instruction */ uint32_t v; memcpy(&v, ptr, sizeof v); return v; } static inline uint16_t load16(const void *ptr) { /* GCC will optimise this to a normal load instruction */ uint16_t v; memcpy(&v, ptr, sizeof v); return v; } /* slurp_small() is for small buffers, don't waste cycles on alignment */ no_unroll_loops always_inline 
static inline uint64_t slurp_small(const void *ptr, uint32_t nbytes) { const unsigned char *cptr = ptr; uint64_t sum = 0; while (nbytes >= 4) { sum += load32(cptr); cptr += 4; nbytes -= 4; } if (nbytes & 2) { sum += load16(cptr); cptr += 2; } if (nbytes & 1) { sum += (uint8_t) *cptr; } return sum; } static inline const void * align_ptr(const void *ptr, size_t bytes) { return (void *) ((uintptr_t) ptr & -(uintptr_t) bytes); } always_inline static inline uint16_t fold_and_swap(uint64_t sum, bool swap) { /* Fold 64-bit sum to 32 bits */ sum = (sum & 0xffffffff) + (sum >> 32); sum = (sum & 0xffffffff) + (sum >> 32); Assert(sum == (uint32_t) sum); /* Fold 32-bit sum to 16 bits */ sum = (sum & 0xffff) + (sum >> 16); sum = (sum & 0xffff) + (sum >> 16); Assert(sum == (uint16_t) sum); if (unlikely(swap)) /* Odd base pointer is unexpected */ { sum = bswap16(sum); } return (uint16_t) sum; } #endif diff --git a/contrib/arm-optimized-routines/networking/include/networking.h b/contrib/arm-optimized-routines/networking/include/networking.h index a88feff88339..297dd4bfab02 100644 --- a/contrib/arm-optimized-routines/networking/include/networking.h +++ b/contrib/arm-optimized-routines/networking/include/networking.h @@ -1,14 +1,14 @@ /* * Public API. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ unsigned short __chksum (const void *, unsigned int); #if __aarch64__ && __ARM_NEON unsigned short __chksum_aarch64_simd (const void *, unsigned int); #endif #if __arm__ && __ARM_NEON unsigned short __chksum_arm_simd (const void *, unsigned int); #endif diff --git a/contrib/arm-optimized-routines/networking/test/chksum.c b/contrib/arm-optimized-routines/networking/test/chksum.c index 41b98120f275..239b5b88777b 100644 --- a/contrib/arm-optimized-routines/networking/test/chksum.c +++ b/contrib/arm-optimized-routines/networking/test/chksum.c @@ -1,381 +1,381 @@ /* * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include "../include/networking.h" #if WANT_ASSERT #undef NDEBUG #include #define Assert(exp) assert(exp) #else #define Assert(exp) (void) (exp) #endif #ifdef __GNUC__ #define may_alias __attribute__((__may_alias__)) #else #define may_alias #endif #define CACHE_LINE 64 #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) /* Reference implementation - do not modify! 
*/ static uint16_t checksum_simple(const void *ptr, uint32_t nbytes) { const uint16_t *may_alias hptr = ptr; uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */ /* Sum all halfwords, assume misaligned accesses are handled in HW */ for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--) { sum += *hptr++; } /* Add any trailing odd byte */ if ((nbytes & 0x01) != 0) { sum += *(uint8_t *) hptr; } /* Fold 64-bit sum to 32 bits */ sum = (sum & 0xffffffff) + (sum >> 32); sum = (sum & 0xffffffff) + (sum >> 32); Assert(sum == (uint32_t) sum); /* Fold 32-bit sum to 16 bits */ sum = (sum & 0xffff) + (sum >> 16); sum = (sum & 0xffff) + (sum >> 16); Assert(sum == (uint16_t) sum); return (uint16_t) sum; } static struct { uint16_t (*cksum_fp)(const void *, uint32_t); const char *name; } implementations[] = { { checksum_simple, "simple"}, { __chksum, "scalar"}, #if __arm__ { __chksum_arm_simd, "simd" }, #elif __aarch64__ { __chksum_aarch64_simd, "simd" }, #endif { NULL, NULL} }; static int find_impl(const char *name) { for (int i = 0; implementations[i].name != NULL; i++) { if (strcmp(implementations[i].name, name) == 0) { return i; } } return -1; } static uint16_t (*CKSUM_FP)(const void *, uint32_t); static volatile uint16_t SINK; static bool verify(const void *data, uint32_t offset, uint32_t size) { uint16_t csum_expected = checksum_simple(data, size); uint16_t csum_actual = CKSUM_FP(data, size); if (csum_actual != csum_expected) { fprintf(stderr, "\nInvalid checksum for offset %u size %u: " "actual %04x expected %04x (valid)", offset, size, csum_actual, csum_expected); if (size < 65536) { /* Fatal error */ exit(EXIT_FAILURE); } /* Else some implementations only support sizes up to 2^16 */ return false; } return true; } static uint64_t clock_get_ns(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; } static void benchmark(const uint8_t *base, size_t poolsize, uint32_t blksize, uint32_t numops, 
uint64_t cpufreq) { printf("%11u ", (unsigned int) blksize); fflush(stdout); uint64_t start = clock_get_ns(); for (uint32_t i = 0; i < numops; i ++) { /* Read a random value from the pool */ uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)]; /* Generate a random starting address */ const void *data = &base[random % (poolsize - blksize)]; SINK = CKSUM_FP(data, blksize); } uint64_t end = clock_get_ns(); #define MEGABYTE 1000000 /* Decimal megabyte (MB) */ uint64_t elapsed_ns = end - start; uint64_t elapsed_ms = elapsed_ns / 1000000; uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000); uint64_t accbytes = (uint64_t) numops * blksize; printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE); unsigned int cyc_per_blk = cpufreq / blks_per_s; printf("%11u ", cyc_per_blk); if (blksize != 0) { unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize; printf("%7u.%03u ", cyc_per_byte / 1000, cyc_per_byte % 1000); } printf("\n"); } int main(int argc, char *argv[]) { int c; bool DUMP = false; uint32_t IMPL = 0;/* Simple implementation */ uint64_t CPUFREQ = 0; uint32_t BLKSIZE = 0; uint32_t NUMOPS = 1000000; uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */ setvbuf(stdout, NULL, _IOLBF, 160); while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1) { switch (c) { case 'b' : { int blksize = atoi(optarg); if (blksize < 1 || blksize > POOLSIZE / 2) { fprintf(stderr, "Invalid block size %d\n", blksize); exit(EXIT_FAILURE); } BLKSIZE = (unsigned) blksize; break; } case 'd' : DUMP = true; break; case 'f' : { int64_t cpufreq = atoll(optarg); if (cpufreq < 1) { fprintf(stderr, "Invalid CPU frequency %"PRId64"\n", cpufreq); exit(EXIT_FAILURE); } CPUFREQ = cpufreq; break; } case 'i' : { int impl = find_impl(optarg); if (impl < 0) { fprintf(stderr, "Invalid implementation %s\n", optarg); goto usage; } IMPL = (unsigned) impl; break; } case 'n' : { int numops = atoi(optarg); if (numops < 1) { fprintf(stderr, "Invalid number of operations %d\n", 
numops); exit(EXIT_FAILURE); } NUMOPS = (unsigned) numops; break; } case 'p' : { int poolsize = atoi(optarg); if (poolsize < 4096) { fprintf(stderr, "Invalid pool size %d\n", poolsize); exit(EXIT_FAILURE); } char c = optarg[strlen(optarg) - 1]; if (c == 'M') { POOLSIZE = (unsigned) poolsize * 1024 * 1024; } else if (c == 'K') { POOLSIZE = (unsigned) poolsize * 1024; } else { POOLSIZE = (unsigned) poolsize; } break; } default : usage : fprintf(stderr, "Usage: checksum \n" "-b Block size\n" "-d Dump first 96 bytes of data\n" "-f CPU frequency (Hz)\n" "-i Implementation\n" "-n Number of operations\n" "-p Pool size (K or M suffix)\n" ); printf("Implementations:"); for (int i = 0; implementations[i].name != NULL; i++) { printf(" %s", implementations[i].name); } printf("\n"); exit(EXIT_FAILURE); } } if (optind > argc) { goto usage; } CKSUM_FP = implementations[IMPL].cksum_fp; POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE); uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (base == MAP_FAILED) { perror("aligned_alloc"), exit(EXIT_FAILURE); } for (size_t i = 0; i < POOLSIZE / 4; i++) { ((uint32_t *) base)[i] = rand(); } printf("Implementation: %s\n", implementations[IMPL].name); printf("numops %u, poolsize ", NUMOPS); if (POOLSIZE % (1024 * 1024) == 0) { printf("%uMiB", POOLSIZE / (1024 * 1024)); } else if (POOLSIZE % 1024 == 0) { printf("%uKiB", POOLSIZE / 1024); } else { printf("%uB", POOLSIZE); } printf(", blocksize %u, CPU frequency %juMHz\n", BLKSIZE, (uintmax_t) (CPUFREQ / 1000000)); #if WANT_ASSERT printf("Warning: assertions are enabled\n"); #endif if (DUMP) { /* Print out first 96 bytes of data for human debugging */ for (int i = 0; i < 96; i++) { if (i % 8 == 0) printf("%2u:", i); printf(" %02x", base[i]); if (i % 8 == 7) printf("\n"); } } /* Verify that chosen algorithm handles all combinations of offsets and sizes */ printf("Verifying..."); fflush(stdout); bool success = true; /* Check all (relevant) combinations of size and 
offset */ for (int size = 0; size <= 256; size++) { for (int offset = 0; offset < 255; offset++) { /* Check at start of mapped memory */ success &= verify(&base[offset], offset, size); /* Check at end of mapped memory */ uint8_t *p = base + POOLSIZE - (size + offset); success &= verify(p, (uintptr_t) p % 64, size); } } /* Check increasingly larger sizes */ for (size_t size = 1; size < POOLSIZE; size *= 2) { success &= verify(base, 0, size); } /* Check the full size, this can detect accumulator overflows */ success &= verify(base, 0, POOLSIZE); printf("%s\n", success ? "OK" : "failure"); /* Print throughput in decimal megabyte (1000000B) per second */ if (CPUFREQ != 0) { printf("%11s %11s %11s %11s\n", "block size", "MB/s", "cycles/blk", "cycles/byte"); } else { printf("%11s %11s %11s %11s\n", "block size", "MB/s", "ns/blk", "ns/byte"); CPUFREQ = 1000000000; } if (BLKSIZE != 0) { benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ); } else { static const uint16_t sizes[] = { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 }; for (int i = 0; sizes[i] != 0; i++) { uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]); benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ); } } if (munmap(base, POOLSIZE) != 0) { perror("munmap"), exit(EXIT_FAILURE); } return success ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/contrib/arm-optimized-routines/pl/Dir.mk b/contrib/arm-optimized-routines/pl/Dir.mk new file mode 100644 index 000000000000..2d007790d241 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/Dir.mk @@ -0,0 +1,21 @@ +# Makefile fragment - requires GNU make +# +# Copyright (c) 2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +# These targets are defined if we prescribe pl in SUBS. +# It requires PLSUBS to be set. 
+ +$(foreach sub,$(PLSUBS),$(eval include $(srcdir)/pl/$(sub)/Dir.mk)) + +pl-files := $($(PLSUBS:%=pl/%-files)) + +all-pl: $(PLSUBS:%=all-pl/%) + +check-pl: $(PLSUBS:%=check-pl/%) + +install-pl: $(PLSUBS:%=install-pl/%) + +clean-pl: $(PLSUBS:%=clean-pl/%) + +.PHONY: all-pl check-pl install-pl clean-pl diff --git a/contrib/arm-optimized-routines/pl/README.contributors b/contrib/arm-optimized-routines/pl/README.contributors new file mode 100644 index 000000000000..3af9b1fc7741 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/README.contributors @@ -0,0 +1,23 @@ +Code in this sub-directory should follow the GNU Coding Standard, but it is +not expected to be upstreamed into glibc without modification, so +glibc-specific conventions need not be followed. + +The requirements for portable code apply to non-portable code with the +following differences: + + +1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There + are no specific restrictions on acceptable ULP error, but if functions + provide significantly less accuracy than portable equivalents then a clear + justification for inclusion should be stated in comments at the top of the + source file. Error bounds of the approximation should be clearly documented + in comments. + +2. Functions are assumed to support round-to-nearest mode by default, unless + stated; other rounding modes are not required to be provided. + +3. Handling of special cases may be relaxed for vector functions. Checking + whether each vector lane contains special values such as NaN, Inf or + denormal numbers can prove too costly for vector functions. This is often + not required since vector functions are typically used along with aggressive + compiler optimization flags. 
diff --git a/contrib/arm-optimized-routines/pl/math/Dir.mk b/contrib/arm-optimized-routines/pl/math/Dir.mk new file mode 100644 index 000000000000..be65344572a8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/Dir.mk @@ -0,0 +1,229 @@ +# Makefile fragment - requires GNU make +# +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +PLM := $(srcdir)/pl/math +AOR := $(srcdir)/math +B := build/pl/math + +math-lib-srcs := $(wildcard $(PLM)/*.[cS]) +math-test-srcs := \ + $(AOR)/test/mathtest.c \ + $(AOR)/test/mathbench.c \ + $(AOR)/test/ulp.c \ + +math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS]) + +math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) +math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) + +math-libs := \ + build/pl/lib/libmathlib.so \ + build/pl/lib/libmathlib.a \ + +math-tools := \ + build/pl/bin/mathtest \ + build/pl/bin/mathbench \ + build/pl/bin/mathbench_libc \ + build/pl/bin/runulp.sh \ + build/pl/bin/ulp \ + +math-host-tools := \ + build/pl/bin/rtest \ + +math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs))) +math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs))) +math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) +math-target-objs := $(math-lib-objs) $(math-test-objs) +math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) + +pl/math-files := \ + $(math-objs) \ + $(math-libs) \ + $(math-tools) \ + $(math-host-tools) \ + $(math-includes) \ + $(math-test-includes) \ + +all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) + +$(math-objs): $(math-includes) $(math-test-includes) +$(math-objs): CFLAGS_PL += $(math-cflags) +$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno +$(math-host-objs): CC = $(HOST_CC) +$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) + 
+build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs) + # Replace PL_SIG + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@ + +build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs) + # Replace PL_SIG macros with mathbench func entries + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@ + +build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs) + # Replace PL_SIG macros with ULP wrapper declarations + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@ + +$(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h +$(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test + +$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h +$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test + +build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) + $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^ + +build/pl/lib/libmathlib.a: $(math-lib-objs) + rm -f $@ + $(AR) rc $@ $^ + $(RANLIB) $@ + +$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc +$(math-tools): LDLIBS += $(math-ldlibs) -lm + +# Some targets to build pl/math/test from math/test sources +build/pl/math/test/%.o: $(srcdir)/math/test/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.o: $(srcdir)/math/test/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.os: $(srcdir)/math/test/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.os: $(srcdir)/math/test/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +# Some targets to build pl/ sources using appropriate flags +build/pl/%.o: $(srcdir)/pl/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.o: $(srcdir)/pl/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.os: $(srcdir)/pl/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.os: $(srcdir)/pl/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/bin/rtest: $(math-host-objs) + 
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) + +build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +# This is not ideal, but allows custom symbols in mathbench to get resolved. +build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm + +build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/pl/include/%.h: $(PLM)/include/%.h + cp $< $@ + +build/pl/include/test/%.h: $(PLM)/test/%.h + cp $< $@ + +build/pl/bin/%.sh: $(PLM)/test/%.sh + cp $< $@ + +pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) +pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) + +check-pl/math-test: $(math-tools) + cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + +check-pl/math-rtest: $(math-host-tools) $(math-tools) + cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + +ulp-input-dir=$(B)/test/inputs + +math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) +math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) +math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs))) +math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs))) + +ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs) + +$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) + +$(ulp-input-dir)/%.ulp: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@ + 
+$(ulp-input-dir)/%.alias: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@ + +$(ulp-input-dir)/%.fenv: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ + +$(ulp-input-dir)/%.itv: $(PLM)/%.c + mkdir -p $(dir $@) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@ + +ulp-lims := $(ulp-input-dir)/limits +$(ulp-lims): $(math-lib-lims) + cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ + +ulp-aliases := $(ulp-input-dir)/aliases +$(ulp-aliases): $(math-lib-aliases) + cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@ + +fenv-exps := $(ulp-input-dir)/fenv +$(fenv-exps): $(math-lib-fenvs) + cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ + +ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias +$(ulp-itvs-noalias): $(math-lib-itvs) + cat $^ > $@ + +rename-aliases := $(ulp-input-dir)/rename_alias.sed +$(rename-aliases): $(ulp-aliases) + # Build sed script for replacing aliases from generated alias file + cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@ + +ulp-itvs-alias := $(ulp-input-dir)/itvs_alias +$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases) + cat $< | sed -f $(rename-aliases) > $@ + +ulp-itvs := $(ulp-input-dir)/intervals +$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias) + cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ + +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs) + WANT_SVE_MATH=$(WANT_SVE_MATH) \ + ULPFLAGS="$(math-ulpflags)" \ + LIMITS=../../../$(ulp-lims) \ + ALIASES=../../../$(ulp-aliases) \ + INTERVALS=../../../$(ulp-itvs) \ + FENV=../../../$(fenv-exps) \ + build/pl/bin/runulp.sh $(EMULATOR) + +check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp + +$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so + $(INSTALL) -D $< 
$@ + +$(DESTDIR)$(libdir)/pl/%: build/pl/lib/% + $(INSTALL) -m 644 -D $< $@ + +$(DESTDIR)$(includedir)/pl/%: build/pl/include/% + $(INSTALL) -m 644 -D $< $@ + +install-pl/math: \ + $(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \ + $(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%) + +clean-pl/math: + rm -f $(pl/math-files) + +.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math diff --git a/contrib/arm-optimized-routines/pl/math/acosh_3u.c b/contrib/arm-optimized-routines/pl/math/acosh_3u.c new file mode 100644 index 000000000000..4e2cb6737ba8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/acosh_3u.c @@ -0,0 +1,66 @@ +/* + * Double-precision acosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2 (0x1.62e42fefa39efp-1) +#define MinusZero (0x8000000000000000) +#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */ +#define Two (0x4000000000000000) /* asuint64(2.0). */ + +double +optr_aor_log_f64 (double); + +double +log1p (double); + +/* acosh approximation using a variety of approaches on different intervals: + + acosh(x) = ln(x + sqrt(x * x - 1)). + + x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is + close enough to x that we can calculate the result by ln(2x) == ln(x) + + ln(2). The greatest observed error in this region is 0.98 ULP: + acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9 + want 0x1.28066a11a7c8p+9. + + x > 2: Calculate the result directly using definition of acosh(x). Greatest + observed error in this region is 1.33 ULP: + acosh(0x1.1e45d14bfcfa2p+1) got 0x1.71a06f50c34b5p+0 + want 0x1.71a06f50c34b6p+0. + + 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is + undefined. 
For 1 <= x <= 2, the largest observed error is 2.69 ULP: + acosh(0x1.073528248093p+0) got 0x1.e4d9bd20684f3p-3 + want 0x1.e4d9bd20684f6p-3. */ +double +acosh (double x) +{ + uint64_t ix = asuint64 (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalid (x); + + if (unlikely (ix >= SquareLim)) + return optr_aor_log_f64 (x) + Ln2; + + if (ix >= Two) + return optr_aor_log_f64 (x + sqrt (x * x - 1)); + + double xm1 = x - 1; + return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)); +} + +PL_SIG (S, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (acosh, 2.19) +PL_TEST_INTERVAL (acosh, 0, 1, 10000) +PL_TEST_INTERVAL (acosh, 1, 2, 100000) +PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000) +PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000) +PL_TEST_INTERVAL (acosh, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/acoshf_2u8.c b/contrib/arm-optimized-routines/pl/math/acoshf_2u8.c new file mode 100644 index 000000000000..c9cded7fd2ff --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/acoshf_2u8.c @@ -0,0 +1,63 @@ +/* + * Single-precision acosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2 (0x1.62e4p-1f) +#define MinusZero 0x80000000 +#define SquareLim 0x5f800000 /* asuint(0x1p64). */ +#define Two 0x40000000 + +/* Single-precision log from math/. */ +float +optr_aor_log_f32 (float); + +/* Single-precision log(1+x) from pl/math. */ +float +log1pf (float); + +/* acoshf approximation using a variety of approaches on different intervals: + + x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is + close enough to x that we can calculate the result by ln(2x) == ln(x) + + ln(2). The greatest error in the region is 0.94 ULP: + acoshf(0x1.15f706p+92) got 0x1.022e14p+6 want 0x1.022e16p+6. + + x > 2: Calculate the result directly using definition of acosh(x) = ln(x + + sqrt(x*x - 1)).
Greatest error in this region is 1.30 ULP: + acoshf(0x1.249d8p+1) got 0x1.77e1aep+0 want 0x1.77e1bp+0. + + 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is + undefined. For 1 <= x <= 2, the greatest error is 2.78 ULP: + acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 want 0x1.ef9ea2p-3. */ +float +acoshf (float x) +{ + uint32_t ix = asuint (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalidf (x); + + if (unlikely (ix >= SquareLim)) + return optr_aor_log_f32 (x) + Ln2; + + if (ix > Two) + return optr_aor_log_f32 (x + sqrtf (x * x - 1)); + + float xm1 = x - 1; + return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1)); +} + +PL_SIG (S, F, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (acoshf, 2.30) +PL_TEST_INTERVAL (acoshf, 0, 1, 100) +PL_TEST_INTERVAL (acoshf, 1, 2, 10000) +PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000) +PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000) +PL_TEST_INTERVAL (acoshf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/asinh_2u5.c b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c new file mode 100644 index 000000000000..f1679556d5f8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c @@ -0,0 +1,86 @@ +/* + * Double-precision asinh(x) function + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ +#define One 0x3ff0000000000000 /* asuint64(1.0). */ +#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ +#define Ln2 0x1.62e42fefa39efp-1 + +double +optr_aor_log_f64 (double); + +/* Scalar double-precision asinh implementation. This routine uses different + approaches on different intervals: + + |x| < 2^-26: Return x. Function is exact in this region. + + |x| < 1: Use custom order-17 polynomial. This is least accurate close to 1. 
+ The largest observed error in this region is 1.47 ULPs: + asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. + + |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate + the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)). + The largest observed error in this region is 2.03 ULPs: + asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1 + want -0x1.c3508eb6a682p-1. + + |x| >= 2^511: We cannot square x without overflow at a low + cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot + even double x without overflow, so calculate this as ln(x) + + ln(2). The largest observed error in this region is 0.98 ULPs at many + values, for instance: + asinh(0x1.5255a4cf10319p+975) got 0x1.52652f4cb26cbp+9 + want 0x1.52652f4cb26ccp+9. */ +double +asinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + if (ia < ExpM26) + { + return x; + } + + if (ia < One) + { + double x2 = x * x; + double z2 = x2 * x2; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) __asinh_data.poly[i] + double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + double y = fma (p, x2 * ax, ax); + return asdouble (asuint64 (y) | sign); + } + + if (unlikely (ia >= Exp511)) + { + return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign); + } + + return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1))) + | sign); +} + +PL_SIG (S, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (asinh, 1.54) +PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) +PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) +PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) +PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000) +PL_TEST_INTERVAL (asinh, 100.0, inf, 50000) +PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/asinh_data.c 
b/contrib/arm-optimized-routines/pl/math/asinh_data.c new file mode 100644 index 000000000000..073b19799bda --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinh_data.c @@ -0,0 +1,22 @@ +/* + * Double-precision polynomial coefficients for scalar asinh(x) + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* asinh(x) is odd, and the first term of the Taylor expansion is x, so we can + approximate the function by x + x^3 * P(x^2), where P(z) has the form: + C0 + C1 * z + C2 * z^2 + C3 * z^3 + ... + Note P is evaluated on even powers of x only. See tools/asinh.sollya for the + algorithm used to generate these coefficients. */ +const struct asinh_data __asinh_data + = {.poly + = {-0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, + 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, + -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, + 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, + -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, + 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18}}; diff --git a/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c new file mode 100644 index 000000000000..2b2c55db56dc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c @@ -0,0 +1,78 @@ +/* + * Single-precision asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrinf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffff) +#define SqrtFltMax (0x1.749e96p+10f) +#define Ln2 (0x1.62e4p-1f) +#define One (0x3f8) +#define ExpM12 (0x398) + +#define C(i) __asinhf_data.coeffs[i] + +float +optr_aor_log_f32 (float); + +/* asinhf approximation using a variety of approaches on different intervals: + + |x| < 2^-12: Return x. Function is exactly rounded in this region. + + |x| < 1.0: Use custom order-8 polynomial. The largest observed + error in this region is 1.3ulps: + asinhf(0x1.f0f74cp-1) got 0x1.b88de4p-1 want 0x1.b88de2p-1. + + |x| <= SqrtFltMax: Calculate the result directly using the + definition of asinh(x) = ln(x + sqrt(x*x + 1)). The largest + observed error in this region is 1.99ulps. + asinhf(0x1.00e358p+0) got 0x1.c4849ep-1 want 0x1.c484a2p-1. + + |x| > SqrtFltMax: We cannot square x without overflow at a low + cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot + even double x without overflow, so calculate this as ln(x) + + ln(2). This largest observed error in this region is 3.39ulps. + asinhf(0x1.749e9ep+10) got 0x1.fffff8p+2 want 0x1.fffffep+2. 
 */ +float +asinhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + float ax = asfloat (ia); + uint32_t sign = ix & ~AbsMask; + + if (unlikely (ia12 < ExpM12 || ia == 0x7f800000)) + return x; + + if (unlikely (ia12 >= 0x7f8)) + return __math_invalidf (x); + + if (ia12 < One) + { + float x2 = ax * ax; + float p = ESTRIN_7 (ax, x2, x2 * x2, C); + float y = fmaf (x2, p, ax); + return asfloat (asuint (y) | sign); + } + + if (unlikely (ax > SqrtFltMax)) + { + return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign); + } + + return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign); +} + +PL_SIG (S, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (asinhf, 2.9) +PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000) +PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000) +PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/asinhf_data.c b/contrib/arm-optimized-routines/pl/math/asinhf_data.c new file mode 100644 index 000000000000..cd1ef16b3b6a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinhf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients for single-precision asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya for how + these coeffs were generated.
*/ +const struct asinhf_data __asinhf_data + = {.coeffs + = {-0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, 0x1.3a81dcp-4f, + 0x1.65bbaap-10f, -0x1.057f1p-4f, 0x1.6c1d46p-5f, -0x1.4cafe8p-7f}}; diff --git a/contrib/arm-optimized-routines/pl/math/atan2_2u5.c b/contrib/arm-optimized-routines/pl/math/atan2_2u5.c new file mode 100644 index 000000000000..c909ac99fa22 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan2_2u5.c @@ -0,0 +1,159 @@ +/* + * Double-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include + +#include "atan_common.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Pi (0x1.921fb54442d18p+1) +#define PiOver2 (0x1.921fb54442d18p+0) +#define PiOver4 (0x1.921fb54442d18p-1) +#define SignMask (0x8000000000000000) +#define ExpMask (0x7ff0000000000000) + +/* We calculate atan2 by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8, + which may underflow if n and d have very different magnitude. + POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n + and d for which P underflows, and is used to special-case such inputs. */ +#define POW8_EXP_UFLOW_BOUND 62 + +static inline int64_t +biased_exponent (double f) +{ + uint64_t fi = asuint64 (f); + return (fi & ExpMask) >> 52; +} + +/* Fast implementation of scalar atan2. Largest errors are when y and x are + close together. The greatest observed error is 2.28 ULP: + atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. 
*/ +double +atan2 (double y, double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + + uint64_t sign_x = ix & SignMask; + uint64_t sign_y = iy & SignMask; + + uint64_t iax = ix & ~SignMask; + uint64_t iay = iy & ~SignMask; + + bool xisnan = isnan (x); + if (unlikely (isnan (y) && !xisnan)) + return __math_invalid (y); + if (unlikely (xisnan)) + return __math_invalid (x); + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2); + + int64_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* y = 0. */ + if (iay == 0) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. Either x = + 0, or y is much larger than x (difference in exponents >= + POW8_EXP_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* Special case for either x is INF or (x, y) is very close to x axis and x is + negative. */ + if (unlikely (iax == 0x7ff0000000000000 + || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) + { + if (iay == 0x7ff0000000000000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0 * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0 * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0; /* atan(+...,+INF). */ + case 1: + return -0.0; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7ff0000000000000) + return sign_y ? 
-PiOver2 : PiOver2; + + uint64_t sign_xy = sign_x ^ sign_y; + + double ax = asdouble (iax); + double ay = asdouble (iay); + uint64_t pred_aygtax = (ay > ax); + + /* Set up z for call to atan. */ + double n = pred_aygtax ? -ax : ay; + double d = pred_aygtax ? ay : ax; + double z = n / d; + + double ret; + if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + double shift = sign_x ? -2.0 : 0.0; + shift = pred_aygtax ? shift + 1.0 : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. */ + return asdouble (asuint64 (ret) ^ sign_xy); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (S, D, 2, atan2) +PL_TEST_ULP (atan2, 1.78) +PL_TEST_INTERVAL (atan2, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (atan2, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (atan2, 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/atan2f_3u.c new file mode 100644 index 000000000000..38e1df59c102 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan2f_3u.c @@ -0,0 +1,167 @@ +/* + * Single-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include + +#include "atanf_common.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Pi (0x1.921fb6p+1f) +#define PiOver2 (0x1.921fb6p+0f) +#define PiOver4 (0x1.921fb6p-1f) +#define SignMask (0x80000000) + +/* We calculate atan2f by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. The polynomial may underflow. 
+ POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and d + for which P underflows, and is used to special-case such inputs. */ +#define POLY_UFLOW_BOUND 24 + +static inline int32_t +biased_exponent (float f) +{ + uint32_t fi = asuint (f); + int32_t ex = (int32_t) ((fi & 0x7f800000) >> 23); + if (unlikely (ex == 0)) + { + /* Subnormal case - we still need to get the exponent right for subnormal + numbers as division may take us back inside the normal range. */ + return ex - __builtin_clz (fi << 9); + } + return ex; +} + +/* Fast implementation of scalar atan2f. Largest observed error is + 2.88ulps in [99.0, 101.0] x [99.0, 101.0]: + atan2f(0x1.9332d8p+6, 0x1.8cb6c4p+6) got 0x1.964646p-1 + want 0x1.964640p-1. */ +float +atan2f (float y, float x) +{ + uint32_t ix = asuint (x); + uint32_t iy = asuint (y); + + uint32_t sign_x = ix & SignMask; + uint32_t sign_y = iy & SignMask; + + uint32_t iax = ix & ~SignMask; + uint32_t iay = iy & ~SignMask; + + /* x or y is NaN. */ + if ((iax > 0x7f800000) || (iay > 0x7f800000)) + return x + y; + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 31) & 1) | ((ix >> 30) & 2); + + /* The following follows glibc ieee754 implementation, except + that we do not use +-tiny shifts (non-nearest rounding mode). */ + + int32_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* Special case for (x, y) either on or very close to the x axis. Either y = + 0, or y is tiny and x is huge (difference in exponents >= + POLY_UFLOW_BOUND). In the second case, we only want to use this special + case when x is negative (i.e. quadrants 2 or 3). */ + if (unlikely (iay == 0 || (exp_diff >= POLY_UFLOW_BOUND && m >= 2))) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. 
Either x = + 0, or x is tiny and y is huge (difference in exponents >= + POLY_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POLY_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* x is INF. */ + if (iax == 0x7f800000) + { + if (iay == 0x7f800000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0f * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0f * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0f; /* atan(+...,+INF). */ + case 1: + return -0.0f; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7f800000) + return sign_y ? -PiOver2 : PiOver2; + + uint32_t sign_xy = sign_x ^ sign_y; + + float ax = asfloat (iax); + float ay = asfloat (iay); + + bool pred_aygtax = (ay > ax); + + /* Set up z for call to atanf. */ + float n = pred_aygtax ? -ax : ay; + float d = pred_aygtax ? ay : ax; + float z = n / d; + + float ret; + if (unlikely (m < 2 && exp_diff >= POLY_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + float shift = sign_x ? -2.0f : 0.0f; + shift = pred_aygtax ? shift + 1.0f : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. */ + return asfloat (asuint (ret) ^ sign_xy); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (S, F, 2, atan2) +PL_TEST_ULP (atan2f, 2.4) +PL_TEST_INTERVAL (atan2f, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (atan2f, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2f, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2f, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (atan2f, 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/atan_2u5.c b/contrib/arm-optimized-routines/pl/math/atan_2u5.c new file mode 100644 index 000000000000..ee4770101758 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan_2u5.c @@ -0,0 +1,73 @@ +/* + * Double-precision atan(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "pl_sig.h" +#include "pl_test.h" +#include "atan_common.h" + +#define AbsMask 0x7fffffffffffffff +#define PiOver2 0x1.921fb54442d18p+0 +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ +#define OneTop 0x3ff + +/* Fast implementation of double-precision atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +double +atan (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 52; + + if (unlikely (ia12 >= BigBound || ia12 < TinyBound)) + { + if (ia12 < TinyBound) + /* Avoid underflow by returning x. */ + return x; + if (ia > 0x7ff0000000000000) + /* Propagate NaN. */ + return __math_invalid (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asdouble (asuint64 (PiOver2) ^ sign); + } + + double z, az, shift; + if (ia12 >= OneTop) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0 / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). 
*/ + az = -fabs (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + shift = 0; + az = asdouble (ia); + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + double y = eval_poly (z, az, shift); + /* Copy sign. */ + return asdouble (asuint64 (y) ^ sign); +} + +PL_SIG (S, D, 1, atan, -10.0, 10.0) +PL_TEST_ULP (atan, 1.78) +PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000) +PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/atan_common.h b/contrib/arm-optimized-routines/pl/math/atan_common.h new file mode 100644 index 000000000000..da0da6436854 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan_common.h @@ -0,0 +1,49 @@ +/* + * Double-precision polynomial evaluation function for scalar and vector atan(x) + * and atan2(y,x). + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "estrin.h" + +#if V_SUPPORTED + +#include "v_math.h" + +#define DBL_T v_f64_t +#define P(i) v_f64 (__atan_poly_data.poly[i]) + +#else + +#define DBL_T double +#define P(i) __atan_poly_data.poly[i] + +#endif + +/* Polynomial used in fast atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline DBL_T +eval_poly (DBL_T z, DBL_T az, DBL_T shift) +{ + /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. */ + DBL_T z2 = z * z; + DBL_T x2 = z2 * z2; + DBL_T x4 = x2 * x2; + DBL_T x8 = x4 * x4; + DBL_T y + = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P)); + + /* Finalize. y = shift + z + z^3 * P(z^2). 
*/ + y = FMA (y, z2 * az, az); + y = y + shift; + + return y; +} + +#undef DBL_T +#undef FMA +#undef P diff --git a/contrib/arm-optimized-routines/pl/math/atan_data.c b/contrib/arm-optimized-routines/pl/math/atan_data.c new file mode 100644 index 000000000000..91d0f61d2eaf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan_data.c @@ -0,0 +1,20 @@ +/* + * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct atan_poly_data __atan_poly_data = { + .poly = {/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. See atan.sollya for details of how these were + generated. */ + -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16}}; diff --git a/contrib/arm-optimized-routines/pl/math/atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c new file mode 100644 index 000000000000..9d17f252b8b9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c @@ -0,0 +1,76 @@ +/* + * Single-precision atan(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "atanf_common.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define PiOver2 0x1.921fb6p+0f +#define AbsMask 0x7fffffff +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). 
*/ +#define One 0x3f800000 + +/* Approximation of single-precision atan(x) based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. + Maximum error is 2.88 ulps: + atanf(0x1.0565ccp+0) got 0x1.97771p-1 + want 0x1.97770ap-1. */ +float +atanf (float x) +{ + uint32_t ix = asuint (x); + uint32_t sign = ix & ~AbsMask; + uint32_t ia = ix & AbsMask; + + if (unlikely (ia < TinyBound)) + /* Avoid underflow by returning x. */ + return x; + + if (unlikely (ia > BigBound)) + { + if (ia > 0x7f800000) + /* Propagate NaN. */ + return __math_invalidf (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asfloat (asuint (PiOver2) ^ sign); + } + + float z, az, shift; + if (ia > One) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0f / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). */ + az = -fabsf (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + az = asfloat (ia); + shift = 0; + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + float y = eval_poly (z, az, shift); + /* Copy sign. */ + return asfloat (asuint (y) ^ sign); +} + +PL_SIG (S, F, 1, atan, -10.0, 10.0) +PL_TEST_ULP (atanf, 2.38) +PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000) +PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000) +PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000) +PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000) +PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000) +PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000) +PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000) +PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/atanf_common.h b/contrib/arm-optimized-routines/pl/math/atanf_common.h new file mode 100644 index 000000000000..37ca76dee2f7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanf_common.h @@ -0,0 +1,51 @@ +/* + * Single-precision polynomial evaluation function for scalar and vector + * atan(x) and atan2(y,x). 
+ * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_ATANF_COMMON_H +#define PL_MATH_ATANF_COMMON_H + +#include "math_config.h" +#include "estrinf.h" + +#if V_SUPPORTED + +#include "v_math.h" + +#define FLT_T v_f32_t +#define P(i) v_f32 (__atanf_poly_data.poly[i]) + +#else + +#define FLT_T float +#define P(i) __atanf_poly_data.poly[i] + +#endif + +/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations + The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline FLT_T +eval_poly (FLT_T z, FLT_T az, FLT_T shift) +{ + /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + FLT_T z2 = z * z; + FLT_T z4 = z2 * z2; + + /* Then assemble polynomial. */ + FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P)); + + /* Finalize: + y = shift + z * P(z^2). */ + return FMA (y, z2 * az, az) + shift; +} + +#endif // PL_MATH_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/atanf_data.c b/contrib/arm-optimized-routines/pl/math/atanf_data.c new file mode 100644 index 000000000000..c4cba2378cea --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanf_data.c @@ -0,0 +1,15 @@ +/* + * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. + */ +const struct atanf_poly_data __atanf_poly_data = { + .poly = {/* See atanf.sollya for details of how these were generated. 
*/ + -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f}}; diff --git a/contrib/arm-optimized-routines/pl/math/atanh_3u.c b/contrib/arm-optimized-routines/pl/math/atanh_3u.c new file mode 100644 index 000000000000..a168cd555ff6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanh_3u.c @@ -0,0 +1,86 @@ +/* + * Double-precision atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define BottomMask 0xffffffff +#define C(i) __log1p_data.coeffs[i] + +static inline double +log1p_inline (double x) +{ + /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced + interval. Copied from log1p_2u.c, with no special-case handling. See that + file for details of the algorithm. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + + /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in + [sqrt(2)/2, sqrt(2)]. */ + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Correction term for round-off in f. */ + double cm = (x - (m - 1)) / m; + + /* Approximate log1p(f) with polynomial. 
*/ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f); + + /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */ + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} + +/* Approximation for double-precision inverse tanh(x), using a simplified + version of log1p. Greatest observed error is 3.00 ULP: + atanh(0x1.e58f3c108d714p-4) got 0x1.e7da77672a647p-4 + want 0x1.e7da77672a64ap-4. */ +double +atanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + + if (unlikely (ia == One)) + return __math_divzero (sign >> 32); + + if (unlikely (ia > One)) + return __math_invalid (x); + + double halfsign = asdouble (Half | sign); + double ax = asdouble (ia); + return halfsign * log1p_inline ((2 * ax) / (1 - ax)); +} + +PL_SIG (S, D, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (atanh, 3.00) +PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000) +PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000) +PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000) +PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000) +PL_TEST_INTERVAL (atanh, 1, inf, 100) +PL_TEST_INTERVAL (atanh, -1, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c new file mode 100644 index 000000000000..fb90aa29c7a3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c @@ -0,0 +1,88 @@ +/* + * Single-precision atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. 
*/ + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + return fmaf (m4 * p_79, m4, p_06); +} + +static inline float +log1pf_inline (float x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + float m = x + 1.0f; + int k = (asuint (m) - 0x3f400000) & 0xff800000; + float s = asfloat (Four - k); + float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f); + float p = eval_poly (m_scale); + float scale_back = (float) k * 0x1.0p-23f; + return fmaf (scale_back, Ln2, p); +} + +/* Approximation for single-precision inverse tanh(x), using a simplified + version of log1p. Maximum error is 3.08 ULP: + atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5 + want 0x1.ffb76ep-5. 
*/ +float +atanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax < TinyBound)) + return x; + + if (iax == One) + return __math_divzero (sign); + + if (unlikely (iax > One)) + return __math_invalidf (x); + + float halfsign = asfloat (Half | sign); + float ax = asfloat (iax); + return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); +} + +PL_SIG (S, F, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (atanhf, 2.59) +PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500) +PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000) +PL_TEST_INTERVAL (atanhf, 1, inf, 1000) +PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500) +PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000) +PL_TEST_INTERVAL (atanhf, -1, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c new file mode 100644 index 000000000000..83715dd18a3e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c @@ -0,0 +1,70 @@ +/* + * Double-precision cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +PL_SIG (S, D, 1, cbrt, -10.0, 10.0) + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds 0x1.5555555555555p-1 + +#define C(i) __cbrt_data.poly[i] +#define T(i) __cbrt_data.table[i] + +/* Approximation for double-precision cbrt(x), using low-order polynomial and + two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. 
*/ +double +cbrt (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (iax == 0 || iax == 0x7f80000000000000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexp. */ + int e; + double m = frexp (asdouble (iax), &e); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + double p_01 = fma (C (1), m, C (0)); + double p_23 = fma (C (3), m, C (2)); + double p = fma (p_23, m * m, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + double m_by_3 = m / 3; + double a = fma (TwoThirds, p, m_by_3 / (p * p)); + a = fma (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexp. */ + return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); +} + +PL_TEST_ULP (cbrt, 1.30) +PL_TEST_INTERVAL (cbrt, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrt_data.c b/contrib/arm-optimized-routines/pl/math/cbrt_data.c new file mode 100644 index 000000000000..3d484c2779e2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrt_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for double-precision cbrt(x). + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrt_data __cbrt_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. 
+ See cbrt.sollya for details of generation. */ + 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}}; diff --git a/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c new file mode 100644 index 000000000000..adc591786a6a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c @@ -0,0 +1,67 @@ +/* + * Single-precision cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrinf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define SignMask 0x80000000 +#define TwoThirds 0x1.555556p-1f + +#define C(i) __cbrtf_data.poly[i] +#define T(i) __cbrtf_data.table[i] + +/* Approximation for single-precision cbrt(x), using low-order polynomial and + one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This + is observed for every value where the mantissa is 0x1.81410e and the exponent + is a multiple of 3, for example: + cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +float +cbrtf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & SignMask; + + if (unlikely (iax == 0 || iax == 0x7f800000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexpf. */ + int e; + float m = frexpf (asfloat (iax), &e); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float p = ESTRIN_3 (m, m * m, C); + /* One iteration of Newton's method for iteratively approximating cbrt. 
*/ + float m_by_3 = m / 3; + float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexpf. */ + return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); +} + +PL_SIG (S, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (cbrtf, 1.03) +PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrtf_data.c b/contrib/arm-optimized-routines/pl/math/cbrtf_data.c new file mode 100644 index 000000000000..c6cdb4de0d65 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrtf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for single-precision cbrt(x). + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrtf_data __cbrtf_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrtf.sollya for details of generation. */ + 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}}; diff --git a/contrib/arm-optimized-routines/pl/math/cosh_2u.c b/contrib/arm-optimized-routines/pl/math/cosh_2u.c new file mode 100644 index 000000000000..5d1df0717453 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cosh_2u.c @@ -0,0 +1,66 @@ +/* + * Double-precision cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define SpecialBound \ + 0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */ + +double +__exp_dd (double, double); + +static double +specialcase (double x, uint64_t iax) +{ + if (iax == 0x7ff0000000000000) + return INFINITY; + if (iax > 0x7ff0000000000000) + return __math_invalid (x); + /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated by + exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ + double t = __exp_dd (asdouble (iax) / 2, 0); + return (0.5 * t) * t; +} + +/* Approximation for double-precision cosh(x). + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the special region, 1.93 ULP: + cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.03 ULP: + cosh(0x1.502cd8e56ab3bp+0) got 0x1.fe54962842d0ep+0 + want 0x1.fe54962842d0fp+0. */ +double +cosh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + + /* exp overflows a little bit before cosh, so use special-case handler for the + gap, as well as special values. */ + if (unlikely (iax >= SpecialBound)) + return specialcase (x, iax); + + double ax = asdouble (iax); + /* Use double-precision exp helper to calculate exp(x), then: + cosh(x) = exp(|x|) / 2 + 1 / (exp(|x|) * 2). 
*/ + double t = __exp_dd (ax, 0); + return 0.5 * t + 0.5 / t; +} + +PL_SIG (S, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (cosh, 1.43) +PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000) +PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000) +PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100) +PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/coshf_1u9.c b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c new file mode 100644 index 000000000000..c125c929aa77 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c @@ -0,0 +1,71 @@ +/* + * Single-precision cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +#define SpecialBound \ + 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ + special case. */ + +float +optr_aor_exp_f32 (float); + +static NOINLINE float +specialcase (float x, uint32_t iax) +{ + if (iax == 0x7f800000) + return INFINITY; + if (iax > 0x7f800000) + return __math_invalidf (x); + if (iax <= TinyBound) + /* For tiny x, avoid underflow by just returning 1. */ + return 1; + /* Otherwise SpecialBound <= |x| < Inf. x is too large to calculate exp(x) + without overflow, so use exp(|x|/2) instead. For large x cosh(x) is + dominated by exp(x), so return: + cosh(x) ~= (exp(|x|/2))^2 / 2. */ + float t = optr_aor_exp_f32 (asfloat (iax) / 2); + return (0.5 * t) * t; +} + +/* Approximation for single-precision cosh(x) using exp. + cosh(x) = (exp(x) + exp(-x)) / 2. + The maximum error is 1.89 ULP, observed for |x| > SpecialBound: + coshf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. 
+ The maximum error observed for TinyBound < |x| < SpecialBound is 1.02 ULP: + coshf(0x1.50a3cp+0) got 0x1.ff21dcp+0 want 0x1.ff21dap+0. */ +float +coshf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + float ax = asfloat (iax); + + if (unlikely (iax <= TinyBound || iax >= SpecialBound)) + { + /* x is tiny, large or special. */ + return specialcase (x, iax); + } + + /* Compute cosh using the definition: + coshf(x) = exp(x) / 2 + exp(-x) / 2. */ + float t = optr_aor_exp_f32 (ax); + return 0.5f * t + 0.5f / t; +} + +PL_SIG (S, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (coshf, 1.89) +PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100) +PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) +PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100) +PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/erfc_4u5.c b/contrib/arm-optimized-routines/pl/math/erfc_4u5.c new file mode 100644 index 000000000000..e9af9d3bcdb4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfc_4u5.c @@ -0,0 +1,155 @@ +/* + * Double-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pairwise_horner.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffffffffffff) + +#define xint __erfc_data.interval_bounds +#define PX __erfc_data.poly + +/* Accurate exponential from optimized routines. */ +double +__exp_dd (double x, double xtail); + +static inline double +eval_poly_horner (double z, int i) +{ + double z2 = z * z; +#define C(j) PX[i][j] + return PAIRWISE_HORNER_12 (z, z2, C); +} + +/* Accurate evaluation of exp(x^2) + using compensated product (x^2 ~ x*x + e2) + and the __exp_dd(y,d) routine, that is the + computation of exp(y+d) with a small correction d< 6.0. 
*/ +static inline double +approx_erfc_hi (double x, int i) +{ + double a = fabs (x); + double z = a - xint[i]; + double p = eval_poly_horner (z, i); + double e_mx2 = eval_accurate_gaussian (a); + return p * e_mx2; +} + +static inline int +get_itv_idx (double x) +{ + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + double a = asdouble (asuint64 (x) & AbsMask); + double z = a + 1.0; + z = z * z; + z = z * z; + return (asuint64 (z) >> 52) - 1023; +} + +/* Approximation of erfc for |x| < 6.0. */ +static inline double +approx_erfc_lo (double x, uint32_t sign, int i) +{ + double a = fabs (x); + double z = a - xint[i]; + double p = eval_poly_horner (z, i); + double e_mx2 = eval_accurate_gaussian (a); + if (sign) + return fma (-p, e_mx2, 2.0); + else + return p * e_mx2; +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +abstop12 (double x) +{ + return (asuint64 (x) >> 52) & 0x7ff; +} + +/* Top 32 bits of a double. */ +static inline uint32_t +top32 (double x) +{ + return asuint64 (x) >> 32; +} + +/* Fast erfc implementation. + The approximation uses polynomial approximation of + exp(x^2) * erfc(x) with fixed orders on 20 intervals. + Maximum measured error is 4.05 ULPs:. + erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2 + want 0x1.ff84036f8f0b7p-2. */ +double +erfc (double x) +{ + /* Get top words. */ + uint32_t ix = top32 (x); /* We need to compare at most 32 bits. */ + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix >> 31; + + /* Handle special cases and small values with a single comparison: + abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) + Special cases erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2 + Errno EDOM does not have to be set in case of erfc(nan). + Only ERANGE may be set in case of underflow. 
+ Small values (|x| accurate up to 0.5 ULP (top12(0x1p-50) = 0x3c7) + |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd). */ + if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd)) + { + if (abstop12 (x) >= 0x7ff) + return (double) (sign << 1) + 1.0 / x; /* special cases. */ + else + return 1.0 - x; /* small case. */ + } + else if (ia < 0x40180000) + { /* |x| < 6.0. */ + return approx_erfc_lo (x, sign, get_itv_idx (x)); + } + else if (sign) + { /* x <= -6.0. */ + return 2.0; + } + else if (ia < 0x403c0000) + { /* 6.0 <= x < 28. */ + return approx_erfc_hi (x, get_itv_idx (x)); + } + else + { /* x > 28. */ + return __math_uflow (0); + } +} + +PL_SIG (S, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (erfc, 3.56) +PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000) +PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erfc, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfc_data.c b/contrib/arm-optimized-routines/pl/math/erfc_data.c new file mode 100644 index 000000000000..fa7184fcc871 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfc_data.c @@ -0,0 +1,145 @@ +/* + * Data used in double-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double + precision. Generated using the Remez algorithm on each interval separately + (see erfc.sollya for more detail). */ +const struct erfc_data __erfc_data = { + +/* Bounds for 20 intervals spanning [0x1.0p-50., 31.]. Interval bounds are a + logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the + exception of the first interval. */ +.interval_bounds = { + 0x1.0p-50, /* Tiny boundary. 
*/ + 0x1.837f05c490126p-3, /* 0.189. */ + 0x1.a827997709f7ap-2, /* 0.414. */ + 0x1.5d13f326fe9c8p-1, /* 0.682. */ + 0x1.0p0, /* 1.000. */ + 0x1.60dfc14636e2ap0, /* 1.378. */ + 0x1.d413cccfe779ap0, /* 1.828. */ + 0x1.2e89f995ad3adp1, /* 2.364. */ + 0x1.8p1, /* 3.000. */ + 0x1.e0dfc14636e2ap1, /* 3.757. */ + 0x1.2a09e667f3bcdp2, /* 4.657. */ + 0x1.6e89f995ad3adp2, /* 5.727. */ + 0x1.cp2, /* 7.000. */ + 0x1.106fe0a31b715p3, /* 8.514. */ + 0x1.4a09e667f3bcdp3, /* 10.31. */ + 0x1.8e89f995ad3adp3, /* 12.45. */ + 0x1.ep3, /* 15.00. */ + 0x1.206fe0a31b715p4, /* 18.03. */ + 0x1.5a09e667f3bcdp4, /* 21.63. */ + 0x1.9e89f995ad3adp4, /* 25.91. */ + 0x1.fp4 /* 31.00. */ +}, + +/* Coefficients for each order 12 polynomial on each of the 20 intervals. */ +.poly = { + {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1, + -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2, + 0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5, + -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9, + 0x1.c9bfafa73899cp-11}, + {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1, + -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3, + 0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6, + -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10, + 0x1.526a8a14e9bfcp-12}, + {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2, + -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4, + 0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7, + -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12, + 0x1.b451af7dd52fep-14}, + {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2, + -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5, + 0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9, + -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13, + 0x1.e654e67532b44p-16}, + 
{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3, + -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6, + 0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10, + -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15, + 0x1.d213a128a75c9p-18}, + {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4, + -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8, + 0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12, + -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17, + 0x1.7f90154bde15dp-20}, + {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5, + -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9, + 0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14, + -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20, + 0x1.1020f4741f79ep-22}, + {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6, + -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11, + 0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16, + -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22, + 0x1.501716d098f14p-25}, + {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6, + -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12, + 0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18, + -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25, + 0x1.6eb74e2e99662p-28}, + {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7, + -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14, + 0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20, + -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28, + 0x1.68510d1c32842p-31}, + {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8, + -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, 
-0x1.313af0bb12375p-15, + 0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23, + -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30, + 0x1.45aabbe505f6ap-34}, + {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9, + -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, -0x1.91a8207766e1ep-17, + 0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25, + -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33, + 0x1.14989aac741c2p-37}, + {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10, + -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18, + 0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27, + -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, -0x1.ee307d46e2866p-37, + 0x1.c21ba1b404f5ap-41}, + {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11, + -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20, + 0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30, + -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40, + 0x1.6487c50052867p-44}, + {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11, + -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22, + 0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32, + -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43, + 0x1.165732f1ae138p-47}, + {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12, + -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23, + 0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34, + -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46, + 0x1.b0241c6d5b761p-51}, + {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13, + -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25, + 0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37, + 
-0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49, + 0x1.4f8abb4398a0dp-54}, + {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14, + -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26, + 0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39, + -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52, + 0x1.058cd4ea9bf04p-57}, + {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15, + -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28, + 0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41, + -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55, + 0x1.9a2af47d77e44p-61}, + {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 0x1.0eec0b6357148p-15, + -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30, + 0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44, + -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58, + 0x1.43d3358c64dafp-64} +} +}; diff --git a/contrib/arm-optimized-routines/pl/math/erfcf.h b/contrib/arm-optimized-routines/pl/math/erfcf.h new file mode 100644 index 000000000000..8f1e5f4226e3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfcf.h @@ -0,0 +1,38 @@ +/* + * Shared functions for scalar and vector single-precision erfc(x) functions. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_ERFCF_H +#define PL_MATH_ERFCF_H + +#include "math_config.h" + +#define FMA fma +#include "estrin_wrap.h" + +/* Accurate exponential from optimized-routines. 
*/ +double +__exp_dd (double x, double xtail); + +static inline double +eval_poly (double z, const double *coeff) +{ + double z2 = z * z; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) coeff[i] + return ESTRIN_15 (z, z2, z4, z8, C); +#undef C +} + +static inline double +eval_exp_mx2 (double x) +{ + return __exp_dd (-(x * x), 0.0); +} + +#undef FMA +#endif // PL_MATH_ERFCF_H diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_2u.c b/contrib/arm-optimized-routines/pl/math/erfcf_2u.c new file mode 100644 index 000000000000..5a3f9b00aa5c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfcf_2u.c @@ -0,0 +1,133 @@ +/* + * Single-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "erfcf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define P(i) __erfcf_poly_data.poly[i] + +/* Approximation of erfcf for |x| > 4.0. */ +static inline float +approx_erfcf_hi (float x, uint32_t sign, const double *coeff) +{ + if (sign) + { + return 2.0f; + } + + /* Polynomial contribution. */ + double z = (double) fabs (x); + float p = (float) eval_poly (z, coeff); + /* Gaussian contribution. */ + float e_mx2 = (float) eval_exp_mx2 (z); + + return p * e_mx2; +} + +/* Approximation of erfcf for |x| < 4.0. */ +static inline float +approx_erfcf_lo (float x, uint32_t sign, const double *coeff) +{ + /* Polynomial contribution. */ + double z = (double) fabs (x); + float p = (float) eval_poly (z, coeff); + /* Gaussian contribution. */ + float e_mx2 = (float) eval_exp_mx2 (z); + + if (sign) + return fmaf (-p, e_mx2, 2.0f); + else + return p * e_mx2; +} + +/* Top 12 bits of a float (sign and exponent bits). */ +static inline uint32_t +abstop12 (float x) +{ + return (asuint (x) >> 20) & 0x7ff; +} + +/* Top 12 bits of a float. 
*/ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Fast erfcf approximation using polynomial approximation + multiplied by gaussian. + Most of the computation is carried out in double precision, + and is very sensitive to accuracy of polynomial and exp + evaluation. + Worst-case error is 1.968ulps, obtained for x = 2.0412941. + erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp + err 1.46788. */ +float +erfcf (float x) +{ + /* Get top words and sign. */ + uint32_t ix = asuint (x); /* We need to compare at most 32 bits. */ + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Handle special cases and small values with a single comparison: + abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) + + Special cases + erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2 + + Errno + EDOM does not have to be set in case of erfcf(nan). + Only ERANGE may be set in case of underflow. + + Small values (|x| accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */ + if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328)) + { + if (abstop12 (x) >= 0x7f8) + return (float) (sign << 1) + 1.0f / x; /* Special cases. */ + else + return 1.0f - x; /* Small case. */ + } + + /* Normalized numbers divided in 4 intervals + with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for + the interesting region as it is the smallest value, representable as a + 12-bit integer, for which returning 0 gives <1.5 ULP. 
*/ + if (ia12 < 0x400) + { + return approx_erfcf_lo (x, sign, P (0)); + } + if (ia12 < 0x408) + { + return approx_erfcf_lo (x, sign, P (1)); + } + if (ia12 < 0x410) + { + return approx_erfcf_hi (x, sign, P (2)); + } + if (ia12 < 0x412) + { + return approx_erfcf_hi (x, sign, P (3)); + } + if (sign) + { + return 2.0f; + } + return __math_uflowf (0); +} + +PL_SIG (S, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (erfcf, 1.5) +PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erfcf, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erfcf, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_data.c b/contrib/arm-optimized-routines/pl/math/erfcf_data.c new file mode 100644 index 000000000000..2e018c8c6710 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfcf_data.c @@ -0,0 +1,57 @@ +/* + * Data used in single-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double + precision. Generated using the Remez algorithm on each interval separately + (see erfcf.sollya for more detail). 
*/ +const struct erfcf_poly_data __erfcf_poly_data + = {.poly + = {{ +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1, + -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2, + 0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5, + -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9, + 0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17, + -0x1.32712a6275c4dp-21 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1, + -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3, + 0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8, + -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15, + 0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27, + -0x1.8ec1581647f9fp-33 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1, + -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6, + 0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13, + -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23, + 0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38, + -0x1.027034672f11cp-44 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2, + -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8, + 0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17, + -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30, + 0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46, + -0x1.45abac612344bp-53 +#endif + }}}; diff --git a/contrib/arm-optimized-routines/math/erff.c b/contrib/arm-optimized-routines/pl/math/erff_1u5.c similarity index 59% copy from contrib/arm-optimized-routines/math/erff.c copy to 
contrib/arm-optimized-routines/pl/math/erff_1u5.c index a58e82565dc3..1a69872c43e5 100644 --- a/contrib/arm-optimized-routines/math/erff.c +++ b/contrib/arm-optimized-routines/pl/math/erff_1u5.c @@ -1,104 +1,108 @@ /* * Single-precision erf(x) function. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ - -#include -#include +#include "estrinf.h" +#include "hornerf.h" #include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A #define B __erff_data.erff_poly_B /* Top 12 bits of a float. */ static inline uint32_t top12 (float x) { return asuint (x) >> 20; } -/* Efficient implementation of erff - using either a pure polynomial approximation or - the exponential of a polynomial. - Worst-case error is 1.09ulps at 0x1.c111acp-1. */ +/* Efficient implementation of erff using either a pure polynomial approximation + or the exponential of a polynomial. Worst-case error is 1.09ulps at + 0x1.c111acp-1. */ float erff (float x) { - float r, x2, u; + float r, x2; /* Get top word. */ uint32_t ix = asuint (x); uint32_t sign = ix >> 31; uint32_t ia12 = top12 (x) & 0x7ff; /* Limit of both intervals is 0.875 for performance reasons but coefficients computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy from 0.94 to 1.1ulps. */ if (ia12 < 0x3f6) { /* a = |x| < 0.875. */ /* Tiny and subnormal cases. */ if (unlikely (ia12 < 0x318)) { /* |x| < 2^(-28). */ if (unlikely (ia12 < 0x040)) { /* |x| < 2^(-119). */ float y = fmaf (TwoOverSqrtPiMinusOne, x, x); return check_uflowf (y); } return x + TwoOverSqrtPiMinusOne * x; } x2 = x * x; - /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). 
*/ - r = A[5]; - r = fmaf (r, x2, A[4]); - r = fmaf (r, x2, A[3]); - r = fmaf (r, x2, A[2]); - r = fmaf (r, x2, A[1]); - r = fmaf (r, x2, A[0]); - r = fmaf (r, x, x); + /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). + */ +#define C(i) A[i] + r = fmaf (HORNER_5 (x2, C), x, x); +#undef C } else if (ia12 < 0x408) { /* |x| < 4.0 - Use a custom Estrin scheme. */ float a = fabsf (x); - /* Start with Estrin scheme on high order (small magnitude) coefficients. */ - r = fmaf (B[6], a, B[5]); - u = fmaf (B[4], a, B[3]); - x2 = x * x; - r = fmaf (r, x2, u); + /* Use Estrin scheme on high order (small magnitude) coefficients. */ +#define C(i) B[i] + r = ESTRIN_3_ (a, x * x, C, 3); +#undef C /* Then switch to pure Horner scheme. */ r = fmaf (r, a, B[2]); r = fmaf (r, a, B[1]); r = fmaf (r, a, B[0]); r = fmaf (r, a, a); - /* Single precision exponential with ~0.5ulps, - ensures erff has max. rel. error - < 1ulp on [0.921875, 4.0], - < 1.1ulps on [0.875, 4.0]. */ + /* Single precision exponential with ~0.5ulps ensures erff has maximum + relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on + [0.875, 4.0]. */ r = expf (-r); /* Explicit copysign (calling copysignf increases latency). */ if (sign) r = -1.0f + r; else r = 1.0f - r; } else { /* |x| >= 4.0. */ /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ if (unlikely (ia12 >= 0x7f8)) return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; /* Explicit copysign (calling copysignf increases latency). 
*/ if (sign) r = -1.0f; else r = 1.0f; } return r; } + +PL_SIG (S, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (erff, 0.6) +PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erff, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erff_data.c b/contrib/arm-optimized-routines/pl/math/erff_data.c new file mode 100644 index 000000000000..2352baefd35f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erff_data.c @@ -0,0 +1,16 @@ +/* + * Data for approximation of erff. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Minimax approximation of erff. */ +const struct erff_data __erff_data + = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, + -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f}, + .erff_poly_B + = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f, + 0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}}; diff --git a/contrib/arm-optimized-routines/pl/math/estrin.h b/contrib/arm-optimized-routines/pl/math/estrin.h new file mode 100644 index 000000000000..f967fb0475b0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/estrin.h @@ -0,0 +1,16 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/estrin_wrap.h b/contrib/arm-optimized-routines/pl/math/estrin_wrap.h new file mode 100644 index 000000000000..2ae07001f2cf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/estrin_wrap.h @@ -0,0 +1,48 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i)) +#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_15_(x, x2, 
x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) + +#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0) +#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0) +#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0) +#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0) +#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0) +#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0) +#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0) +#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0) +#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0) +#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0) +#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0) +#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0) +#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0) +#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0) +#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0) +#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0) +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/estrinf.h b/contrib/arm-optimized-routines/pl/math/estrinf.h new file mode 100644 index 000000000000..175233c6c799 --- /dev/null +++ 
b/contrib/arm-optimized-routines/pl/math/estrinf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/math/exp.c b/contrib/arm-optimized-routines/pl/math/exp.c similarity index 94% copy from contrib/arm-optimized-routines/math/exp.c copy to contrib/arm-optimized-routines/pl/math/exp.c index 7f5024cd8792..90253b68875d 100644 --- a/contrib/arm-optimized-routines/math/exp.c +++ b/contrib/arm-optimized-routines/pl/math/exp.c @@ -1,176 +1,163 @@ /* * Double-precision e^x function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" #define N (1 << EXP_TABLE_BITS) #define InvLn2N __exp_data.invln2N #define NegLn2hiN __exp_data.negln2hiN #define NegLn2loN __exp_data.negln2loN #define Shift __exp_data.shift #define T __exp_data.tab #define C2 __exp_data.poly[5 - EXP_POLY_ORDER] #define C3 __exp_data.poly[6 - EXP_POLY_ORDER] #define C4 __exp_data.poly[7 - EXP_POLY_ORDER] #define C5 __exp_data.poly[8 - EXP_POLY_ORDER] #define C6 __exp_data.poly[9 - EXP_POLY_ORDER] /* Handle cases that may overflow or underflow when computing the result that is scale*(1+TMP) without intermediate rounding. The bit representation of scale is in SBITS, however it has a computed exponent that may have overflown into the sign bit so that needs to be adjusted before using it as a double. (int32_t)KI is the k used in the argument reduction and exponent adjustment of scale, positive k here means the result may overflow and negative k means the result may underflow. 
*/ static inline double specialcase (double_t tmp, uint64_t sbits, uint64_t ki) { double_t scale, y; if ((ki & 0x80000000) == 0) { /* k > 0, the exponent of scale might have overflowed by <= 460. */ sbits -= 1009ull << 52; scale = asdouble (sbits); y = 0x1p1009 * (scale + scale * tmp); return check_oflow (eval_as_double (y)); } /* k < 0, need special care in the subnormal range. */ sbits += 1022ull << 52; scale = asdouble (sbits); y = scale + scale * tmp; if (y < 1.0) { /* Round y to the right precision before scaling it into the subnormal range to avoid double rounding that can cause 0.5+E/2 ulp error where E is the worst-case ulp error outside the subnormal range. So this is only useful if the goal is better than 1 ulp worst-case error. */ double_t hi, lo; lo = scale - y + scale * tmp; hi = 1.0 + y; lo = 1.0 - hi + y + lo; y = eval_as_double (hi + lo) - 1.0; /* Avoid -0.0 with downward rounding. */ if (WANT_ROUNDING && y == 0.0) y = 0.0; /* The underflow exception needs to be signaled explicitly. */ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); } y = 0x1p-1022 * y; return check_uflow (eval_as_double (y)); } /* Top 12 bits of a double (sign and exponent bits). */ static inline uint32_t top12 (double x) { return asuint64 (x) >> 52; } /* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. If hastail is 0 then xtail is assumed to be 0 too. */ static inline double exp_inline (double x, double xtail, int hastail) { uint32_t abstop; uint64_t ki, idx, top, sbits; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, z, r, r2, scale, tail, tmp; abstop = top12 (x) & 0x7ff; if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) { if (abstop - top12 (0x1p-54) >= 0x80000000) /* Avoid spurious underflow for tiny x. */ /* Note: 0 is common input. */ return WANT_ROUNDING ? 
1.0 + x : 1.0; if (abstop >= top12 (1024.0)) { if (asuint64 (x) == asuint64 (-INFINITY)) return 0.0; if (abstop >= top12 (INFINITY)) return 1.0 + x; if (asuint64 (x) >> 63) return __math_uflow (0); else return __math_oflow (0); } /* Large x is special cased below. */ abstop = 0; } /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ z = InvLn2N * x; #if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); #elif EXP_USE_TOINT_NARROW /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ kd = eval_as_double (z + Shift); ki = asuint64 (kd) >> 16; kd = (double_t) (int32_t) ki; #else /* z - kd is in [-1, 1] in non-nearest rounding modes. */ kd = eval_as_double (z + Shift); ki = asuint64 (kd); kd -= Shift; #endif r = x + kd * NegLn2hiN + kd * NegLn2loN; /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ if (hastail) r += xtail; /* 2^(k/N) ~= scale * (1 + tail). */ idx = 2 * (ki % N); top = ki << (52 - EXP_TABLE_BITS); tail = asdouble (T[idx]); /* This is only a valid scale when -1023*N < k < 1024*N. */ sbits = T[idx + 1] + top; /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ /* Evaluation is optimized assuming superscalar pipelined execution. */ r2 = r * r; /* Without fma the worst case error is 0.25/N ulp larger. */ /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */ #if EXP_POLY_ORDER == 4 tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4); #elif EXP_POLY_ORDER == 5 tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); #elif EXP_POLY_ORDER == 6 tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); #endif if (unlikely (abstop == 0)) return specialcase (tmp, sbits, ki); scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there is no spurious underflow here even without fma. 
*/ return eval_as_double (scale + scale * tmp); } -double -exp (double x) -{ - return exp_inline (x, 0, 0); -} - /* May be useful for implementing pow where more than double precision input is needed. */ double __exp_dd (double x, double xtail) { return exp_inline (x, xtail, 1); } -#if USE_GLIBC_ABI -strong_alias (exp, __exp_finite) -hidden_alias (exp, __ieee754_exp) -hidden_alias (__exp_dd, __exp1) -# if LDBL_MANT_DIG == 53 -long double expl (long double x) { return exp (x); } -# endif -#endif + diff --git a/contrib/arm-optimized-routines/math/exp_data.c b/contrib/arm-optimized-routines/pl/math/exp_data.c similarity index 99% copy from contrib/arm-optimized-routines/math/exp_data.c copy to contrib/arm-optimized-routines/pl/math/exp_data.c index cba76832566f..2354be76cfab 100644 --- a/contrib/arm-optimized-routines/math/exp_data.c +++ b/contrib/arm-optimized-routines/pl/math/exp_data.c @@ -1,1120 +1,1120 @@ /* * Shared data between exp, exp2 and pow. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << EXP_TABLE_BITS) const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, .negln2loN = -0x1.cf79abc9e3b3ap-46, #elif N == 128 .negln2hiN = -0x1.62e42fefa0000p-8, .negln2loN = -0x1.cf79abc9e3b3ap-47, #elif N == 256 .negln2hiN = -0x1.62e42fefc0000p-9, .negln2loN = 0x1.c610ca86c3899p-45, #elif N == 512 .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, #else .shift = 0x1.8p52, #endif // exp polynomial coefficients. 
.poly = { #if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE // abs error: 1.5543*2^-60 // ulp error: 0.529 (0.533 without fma) // if |x| < ln2/128+eps // abs error if |x| < ln2/64: 1.7157*2^-50 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, 0x1.1111266d28935p-7, #elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE // abs error: 1.6735*2^-64 // ulp error: 0.518 (0.522 without fma) // if |x| < ln2/64 0x1.5555555548f9ap-3, 0x1.555555554bf5dp-5, 0x1.11115b75f0f4dp-7, 0x1.6c171a6b6303ep-10, #elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE // abs error: 1.555*2^-66 // ulp error: 0.509 (0.511 without fma) // if |x| < ln2/256+eps // abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 // abs error if |x| < ln2/128: 1.7145*2^-56 0x1.ffffffffffdbdp-2, 0x1.555555555543cp-3, 0x1.55555cf172b91p-5, 0x1.1111167a4d017p-7, #elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE // abs error: 1.5542*2^-60 // ulp error: 0.521 (0.523 without fma) // if |x| < ln2/128 0x1.fffffffffdbcep-2, 0x1.55555555543c2p-3, 0x1.555573c64f2e3p-5, 0x1.111126b4eff73p-7, #elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE // abs error: 1.6861*2^-71 // ulp error: 0.509 (0.511 without fma) // if |x| < ln2/128 0x1.55555555548fdp-3, 0x1.555555555658fp-5, 0x1.111123a859bb6p-7, 0x1.6c16ba6920cabp-10, #elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE // abs error: 1.43*2^-58 // ulp error: 0.549 (0.550 without fma) // if |x| < ln2/512 0x1p0, // unused 0x1.fffffffffffd4p-2, 0x1.5555571d6ef9p-3, 0x1.5555576a5adcep-5, #elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE // abs error: 1.5547*2^-66 // ulp error: 0.505 (0.506 without fma) // if |x| < ln2/256 0x1.ffffffffffdbdp-2, 0x1.555555555543cp-3, 0x1.55555cf16e1edp-5, 0x1.1111167a4b553p-7, #elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE // abs error: 1.4300*2^-63 // ulp error: 0.504 // if |x| < ln2/1024 // abs error if |x| < ln2/512: 1.0689*2^-55 0x1p0, // unused 0x1.ffffffffffffdp-2, 0x1.555555c75bb6p-3, 
0x1.555555dec04a8p-5, #endif }, .exp2_shift = 0x1.8p52 / N, // exp2 polynomial coefficients. .exp2_poly = { #if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE // abs error: 1.3054*2^-63 // ulp error: 0.515 // if |x| < 1/64 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c58fp-3, 0x1.c6b08d7045cf1p-5, 0x1.3b2ab6fb8fd0ep-7, 0x1.5d884afec48d7p-10, 0x1.43097dc684ae1p-13, #elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE // abs error: 1.2195*2^-65 // ulp error: 0.507 (0.511 without fma) // if |x| < 1/256 // abs error if |x| < 1/128: 1.9941*2^-56 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c424p-3, 0x1.c6b08d70cf4b5p-5, 0x1.3b2abd24650ccp-7, 0x1.5d7e09b4e3a84p-10, #elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE // abs error: 1.2195*2^-65 // ulp error: 0.504 (0.508 without fma) // if |x| < 1/256 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c424p-3, 0x1.c6b08d70cf4b5p-5, 0x1.3b2abd24650ccp-7, 0x1.5d7e09b4e3a84p-10, #elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE // abs error: 1.4411*2^-64 // ulp error: 0.5024 (0.5063 without fma) // if |x| < 1/1024 // abs error if |x| < 1/512: 1.9430*2^-56 0x1.62e42fefa39ecp-1, 0x1.ebfbdff82c58bp-3, 0x1.c6b08e46de41fp-5, 0x1.3b2ab786ee1dap-7, #endif }, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N .tab = { #if N == 64 0x0, 0x3ff0000000000000, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c834d754db0abb6, 
0x3fef06fe0a31b715, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0xbc93cedd78565858, 0x3feea23882552225, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0xbc87c50422622263, 0x3feecc667b5de565, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, #elif N == 128 0x0, 0x3ff0000000000000, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0xbc7160139cd8dc5d, 
0x3fefec9a3e778061, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc78dec6bd0f385f, 
0x3feeb42b569d4f82, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc93cedd78565858, 0x3feea23882552225, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0x3c74ffd70a5fddcd, 
0x3fef0c1e904bc1d2, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0xbc82919e2040220f, 0x3fef60e316c98398, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, #elif N == 256 0x0, 0x3ff0000000000000, 0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0xbc82985dd8521d32, 0x3feff168143b0281, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c845fad437fa426, 0x3fefde5f72f654b1, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0xbc954529642b232f, 0x3fefd50a0e3c1f89, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0x3c8293708ef5c32e, 0x3fefcbd42b72a836, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0x3c9407fb30d06420, 0x3fefb0f145e46c85, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0x3c8b8268b04ef0a5, 0x3fef973028d7233e, 0xbc9556522a2fbd0e, 
0x3fef9301d0125b51, 0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc65704e90c9f860, 0x3fef86a814f204ab, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc897cea57e46280, 0x3fef7e95934f312e, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c6e653b2459034b, 0x3fef6ed48695bbc0, 0xbc8f1ff055de323d, 0x3fef6af9388c8dea, 0x3c92cc7ea345b7dc, 0x3fef672658375d2f, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c8cdc1873af2155, 0x3fef582f95281c6b, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc9493684653a131, 0x3fef50e75eb44027, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0xbc98e2899077520a, 0x3fef49c18438ce4d, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0x3c9120fcd4f59273, 0x3fef42be3578a819, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c89b788c188c9b8, 0x3fef3bdda27912d1, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0x3c877afbca90ef84, 0x3fef351ffb82140a, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c91512f082876ee, 0x3fef2e85711ece75, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0xbc803297e78260bf, 0x3fef21ba7591bb70, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0xbc91e75c40b4251e, 0x3fef157e39771b2f, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c98a911f1f7785a, 0x3fef0f961f641589, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c85425c11faadf4, 0x3fef0432edeeb2fd, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0xbc979517a03e2847, 0x3feefeb83ba8ea32, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc800e2a46da4bee, 0x3feef96266e3fa2d, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0xbc87430803972b34, 0x3feef431a2de883b, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc954de30ae02d94, 0x3feeef26231e754a, 0xbc9907f81b512d8e, 
0x3feeecae6d05d866, 0xbc94f2487e1c03ec, 0x3feeea401b7140ef, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c79c3bba5562a2f, 0x3feee0e544ede173, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc85a71612e21658, 0x3feedc70df1c5175, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0x3c86421f6f1d24d6, 0x3feed822c367a024, 0xbc58a78f4817895b, 0x3feed60a21f72e2a, 0xbc9348a6815fce65, 0x3feed3fb2709468a, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0x3c835c43984d9871, 0x3feecffa3f84b9d4, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0xbc632afc8d9473a0, 0x3feecc2042a7d232, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0xbc95fc5e44de020e, 0x3feec86d668b3237, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0xbc7ea0148327c42f, 0x3feec4e1e192aed2, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0xbc7a843ad1a88022, 0x3feec17dea6db7d7, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c892ca3bf144e63, 0x3feebe41b817c114, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0x3c73e34f67e67118, 0x3feeb8417f4531ee, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0xbc896be8ae89ef8f, 0x3feeb070dde910d2, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0xbc943a3540d1898a, 0x3feeaa11fba87a03, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0xbc951f58ddaa8090, 0x3feea84590998b93, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0xbc82e1648e50a17c, 0x3feea6a320dceb71, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0x3c917ecda8a72159, 0x3feea3dd1d1929fd, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, 0xbc93cedd78565858, 
0x3feea23882552225, 0xbc85c33fdf910406, 0x3feea1c1c70833f6, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0x3c81079ab5789604, 0x3feea0f4b19e9538, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0x3c727df161cd7778, 0x3feea052fa75173e, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0x3c93f9924a05b767, 0x3fee9fdcddd47645, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc87557939a8b5ef, 0x3fee9f9298593ae5, 0xbc80dc3d54e08851, 0x3fee9f7df9519484, 0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc88e67a9006c909, 0x3fee9f8286ead08a, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0x3c87d51410fd15c2, 0x3feea26a62ff86f0, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0xbc760a3629969871, 0x3feea3878491c491, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c7c88549b958471, 0x3feea9cad931a436, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c931143962f7877, 0x3feeabd0a478580f, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c93e9e96f112479, 0x3feeae05bad61778, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0xbc51669428996971, 0x3feebbdd9a7670b3, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, 0xbc85ca6cd7668e4b, 
0x3feec0f170ca07ba, 0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0xbc8a1e58414c07d3, 0x3feec674194bb8d5, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0xbc79740b58a20091, 0x3feeca5e8d07f29e, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0xbc903d5cbe27874b, 0x3feed2c980460ad8, 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, 0x3c5986178980fce0, 0x3feed74a8af46052, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0xbc89472975b1f2a5, 0x3feedbfe53c12e59, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c7d8157a34b7e7f, 0x3feee0e521356eba, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc7274aedac8ff80, 0x3feef68415b749b1, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0x3c85c620ce76df06, 0x3feefc6e29f1c52a, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc8fda52e1b51e41, 0x3fef028cf22749e4, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc302899507554e5, 0x3fef0f69c3f3a207, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0xbc80dda2d4c0010c, 0x3fef16286141b33d, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0x3c836909391181d3, 0x3fef244778fafb22, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0xbc811cd7dbdf9547, 0x3fef2ba88988c933, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0xbc7ac28b7bef6621, 0x3fef33405751c4db, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0xbc8cc734592af7fc, 0x3fef43155b5bab74, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c87752a44f587e8, 0x3fef4b532b08c968, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, 0x3c74a385a63d07a7, 
0x3fef5818dcfba487, 0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, 0xbc82919e2040220f, 0x3fef60e316c98398, 0x3c8c254d16117a68, 0x3fef655d71ff6075, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, 0xbc892ab93b470dc9, 0x3fef864614f5a129, 0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc776caa4c2ff1cf, 0x3fef953924676d76, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc81d5fc525d9940, 0x3fef9f7977cdb740, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0xbc69fa74878ba7c7, 0x3fefcac948dd7274, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c901f3a75ee0efe, 0x3fefd632798844f8, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0xbc699c7db2effc76, 0x3fefedba3692d514, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, 0x3c64b458677f9840, 0x3feff9d96b2a23d9, #elif N == 512 0x0, 0x3ff0000000000000, 0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a, 0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, 0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11, 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, 0x3c75c18e5ae0563a, 0x3feff3d1e77170b4, 0xbc82985dd8521d32, 0x3feff168143b0281, 0xbc705b1125cf49a5, 0x3fefef003103b10e, 0xbc7160139cd8dc5d, 0x3fefec9a3e778061, 0x3c9f879abbff3f87, 0x3fefea363d42b027, 0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, 0x3c9b14003824712a, 0x3fefe57411915a8a, 0xbc905e7a108766d1, 0x3fefe315e86e7f85, 0x3c61cbf0f38af658, 0x3fefe0b9b35659d8, 0x3c845fad437fa426, 0x3fefde5f72f654b1, 0xbc9a3316383dcbc5, 0x3fefdc0727fc1762, 0x3c8cd2523567f613, 0x3fefd9b0d3158574, 0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2, 0xbc954529642b232f, 
0x3fefd50a0e3c1f89, 0xbc89b3236d111646, 0x3fefd2b99fa6407c, 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, 0xbc8cb191be99b1b0, 0x3fefce1ead925493, 0x3c8293708ef5c32e, 0x3fefcbd42b72a836, 0xbc9acb71e83765b7, 0x3fefc98ba42e7d30, 0x3c60f74e61e6c861, 0x3fefc74518759bc8, 0x3c5cd3e58b03697e, 0x3fefc50088f8093f, 0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, 0xbc8bfb07d4755452, 0x3fefc07d61701716, 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, 0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715, 0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, 0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52, 0x3c979aa65d837b6d, 0x3fefb5586cf9890f, 0xbc85b9eb0402507b, 0x3fefb323d833d93f, 0x3c9407fb30d06420, 0x3fefb0f145e46c85, 0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53, 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, 0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba, 0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, 0x3c9c8be44bf4cde8, 0x3fefa612a7b26300, 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, 0x3c820c5444c93c44, 0x3fefa1c7c55189c6, 0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, 0xbc84c6baeb580d7a, 0x3fef9d8503328e6d, 0xbc6a033489906e0b, 0x3fef9b66affed31b, 0x3c8657aa1b0d9f83, 0x3fef994a66f951ce, 0x3c8b8268b04ef0a5, 0x3fef973028d7233e, 0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1, 0xbc9556522a2fbd0e, 0x3fef9301d0125b51, 0xbc6b0b2789925e90, 0x3fef90edb6db2dc1, 0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, 0xbc93aad17d197fae, 0x3fef8ccbae51a5c8, 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, 0xbc989c464a07ad70, 0x3fef88b1e264a0e9, 0xbc65704e90c9f860, 0x3fef86a814f204ab, 0xbc72c338fce197f4, 0x3fef84a058cbae1e, 0xbc91c923b9d5f416, 0x3fef829aaea92de0, 0xbc6dca724cea0eb6, 0x3fef809717425438, 0xbc897cea57e46280, 0x3fef7e95934f312e, 0x3c464770b955d34d, 0x3fef7c962388149e, 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, 0xbc962811c114424f, 0x3fef789d83606e12, 0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, 0x3c8ec58e74904dd4, 0x3fef74ad3c92df73, 0xbc801b15eaa59348, 0x3fef72b83c7d517b, 0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89, 0x3c6e653b2459034b, 0x3fef6ed48695bbc0, 0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9, 0xbc8f1ff055de323d, 
0x3fef6af9388c8dea, 0x3c8bda920de0f6e2, 0x3fef690eba4df41f, 0x3c92cc7ea345b7dc, 0x3fef672658375d2f, 0xbc9a597f9a5ff71c, 0x3fef654013041dc2, 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, 0x3c50835b125aa573, 0x3fef6179e2363cf8, 0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, 0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0, 0xbc96d99c7611eb26, 0x3fef5be084045cd4, 0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f, 0x3c8cdc1873af2155, 0x3fef582f95281c6b, 0xbc6817fd6a313e3e, 0x3fef565a51860746, 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, 0xbc96236af85fd26a, 0x3fef52b6358e15e8, 0xbc9493684653a131, 0x3fef50e75eb44027, 0x3c7795eb4523abe7, 0x3fef4f1aad999e82, 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, 0x3c8fe58b91b40095, 0x3fef4b87bf9cda38, 0xbc98e2899077520a, 0x3fef49c18438ce4d, 0x3c91ecaa860c614a, 0x3fef47fd7190241e, 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, 0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18, 0x3c9120fcd4f59273, 0x3fef42be3578a819, 0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9, 0x3c807a05b0e4047d, 0x3fef3f49917ddc96, 0x3c87f1c7350e256d, 0x3fef3d9282fc1f27, 0x3c89b788c188c9b8, 0x3fef3bdda27912d1, 0x3c420dac6c124f4f, 0x3fef3a2af0b63bff, 0x3c968efde3a8a894, 0x3fef387a6e756238, 0xbc99501d09bc09fd, 0x3fef36cc1c78903a, 0x3c877afbca90ef84, 0x3fef351ffb82140a, 0x3c73baf864dc8675, 0x3fef33760c547f15, 0x3c875e18f274487d, 0x3fef31ce4fb2a63f, 0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff, 0x3c91512f082876ee, 0x3fef2e85711ece75, 0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82, 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, 0xbc7548165d85ed32, 0x3fef29a8b16f0a30, 0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, 0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98, 0xbc96b87b3f71085e, 0x3fef24dfe1f56381, 0xbc93a255f697ecfe, 0x3fef234c0ea83f36, 0xbc803297e78260bf, 0x3fef21ba7591bb70, 0x3c8d2d19edc1e550, 0x3fef202b17779965, 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, 0xbc76b2173113dd8c, 0x3fef1d130f50d65c, 0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, 0x3c811aa5f853590b, 0x3fef1a03fc675d1f, 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, 0x3c61d61a34c8aa02, 0x3fef16fde4f2e280, 0xbc91e75c40b4251e, 
0x3fef157e39771b2f, 0xbc91f892bf6b286d, 0x3fef1400cf2f6c18, 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, 0x3c7590c65c20e680, 0x3fef110cc15d5346, 0x3c98a911f1f7785a, 0x3fef0f961f641589, 0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833, 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, 0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2, 0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 0x3c7b3bf786a54a87, 0x3fef08670653dfe4, 0x3c834d754db0abb6, 0x3fef06fe0a31b715, 0x3c74bb6c41732885, 0x3fef05975721b004, 0x3c85425c11faadf4, 0x3fef0432edeeb2fd, 0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac, 0x3c864201e2ac744c, 0x3fef0170fc4cd831, 0xbc5451d60c6ac9eb, 0x3fef001375752b40, 0xbc979517a03e2847, 0x3feefeb83ba8ea32, 0x3c8787a210ceafd9, 0x3feefd5f4fb45e20, 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, 0xbc888d1e4629943d, 0x3feefab46484ebb4, 0xbc800e2a46da4bee, 0x3feef96266e3fa2d, 0xbc93369c544088b6, 0x3feef812ba4ea77d, 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, 0x3c85373ce4eb6dfb, 0x3feef57a577dd72b, 0xbc87430803972b34, 0x3feef431a2de883b, 0x3c83adec8265a67f, 0x3feef2eb428335b4, 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, 0xbc835388bcac6bc5, 0x3feef06581d3f669, 0xbc954de30ae02d94, 0x3feeef26231e754a, 0x3c727cdb4e4b6640, 0x3feeede91be9c811, 0xbc9907f81b512d8e, 0x3feeecae6d05d866, 0x3c86c2696a26af35, 0x3feeeb761742d808, 0xbc94f2487e1c03ec, 0x3feeea401b7140ef, 0x3c888f6ff06b979a, 0x3feee90c7a61d55b, 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea, 0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, 0xbc76b8867f91c9d6, 0x3feee4559212ef89, 0xbc991919b3ce1b15, 0x3feee32dc313a8e5, 0x3c94c9c0b5157fe6, 0x3feee20853c10f28, 0x3c79c3bba5562a2f, 0x3feee0e544ede173, 0xbc62455345b51c8e, 0x3feedfc4976d27fa, 0x3c859f48a72a4c6d, 0x3feedea64c123422, 0xbc93331de45477d0, 0x3feedd8a63b0a09b, 0xbc85a71612e21658, 0x3feedc70df1c5175, 0xbc95f84d39b39b16, 0x3feedb59bf29743f, 0xbc9312607a28698a, 0x3feeda4504ac801c, 0xbc72ba4dc7c4d562, 0x3feed932b07a35df, 0x3c86421f6f1d24d6, 0x3feed822c367a024, 0xbc844f25dc02691f, 0x3feed7153e4a136a, 0xbc58a78f4817895b, 
0x3feed60a21f72e2a, 0xbc888d328eb9b501, 0x3feed5016f44d8f5, 0xbc9348a6815fce65, 0x3feed3fb2709468a, 0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1, 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, 0xbc615f0a2b9cd452, 0x3feed0f6d5817663, 0x3c835c43984d9871, 0x3feecffa3f84b9d4, 0xbc8c2e465a919e1d, 0x3feecf0018321a1a, 0x3c4363ed60c2ac11, 0x3feece086061892d, 0xbc865dfd02bd08f1, 0x3feecd1318eb43ec, 0xbc632afc8d9473a0, 0x3feecc2042a7d232, 0xbc8e68cec89b1762, 0x3feecb2fde7006f4, 0x3c9666093b0664ef, 0x3feeca41ed1d0057, 0xbc48ae858eb682ca, 0x3feec9566f8827d0, 0xbc95fc5e44de020e, 0x3feec86d668b3237, 0x3c5dd71277c0915f, 0x3feec786d3001fe5, 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, 0x3c92001325ecd7fb, 0x3feec5c10fa920a1, 0xbc7ea0148327c42f, 0x3feec4e1e192aed2, 0x3c65ace6e2870332, 0x3feec4052c5916c4, 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, 0xbc9595c55690ffaf, 0x3feec2532feaada6, 0xbc7a843ad1a88022, 0x3feec17dea6db7d7, 0xbc8b401ba9fb5199, 0x3feec0ab213d5283, 0x3c7690cebb7aafb0, 0x3feebfdad5362a27, 0x3c6df82bf324cc57, 0x3feebf0d073537ca, 0x3c892ca3bf144e63, 0x3feebe41b817c114, 0x3c97cae38641c7bb, 0x3feebd78e8bb586b, 0x3c931dbdeb54e077, 0x3feebcb299fddd0d, 0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a, 0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, 0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0, 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, 0xbc80b582d74a55d9, 0x3feeb8f8b804f127, 0x3c73e34f67e67118, 0x3feeb8417f4531ee, 0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d, 0xbc87deccdc93a349, 0x3feeb6daa2cf6642, 0xbc592dca38593e20, 0x3feeb62b00da3b14, 0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, 0xbc85daca9994833e, 0x3feeb4d359dfd53d, 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, 0xbc980b4321bc6dae, 0x3feeb385df598d78, 0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, 0xbc8390afec5241c5, 0x3feeb24298571b06, 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, 0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf, 0xbc896be8ae89ef8f, 0x3feeb070dde910d2, 0xbc910aa91ae9b67f, 0x3feeafdac1351819, 0x3c93350518fdd78e, 0x3feeaf4736b527da, 0x3c957e1b67462375, 0x3feeaeb63f4d854c, 0xbc88e6ac90348602, 
0x3feeae27dbe2c4cf, 0x3c8124d5051552a7, 0x3feead9c0d59ca07, 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, 0xbc3ca103952ecf1f, 0x3feeac8c32824135, 0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, 0x3c773345c02a4fd6, 0x3feeab86b5f43d92, 0x3c9063e1e21c5409, 0x3feeab07dd485429, 0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e, 0xbc943a3540d1898a, 0x3feeaa11fba87a03, 0xbc924f2cb4f81746, 0x3feea99af482fc8f, 0x3c34c7855019c6ea, 0x3feea9268a5946b7, 0xbc943592a0a9846b, 0x3feea8b4be135acc, 0xbc951f58ddaa8090, 0x3feea84590998b93, 0xbc956bc85d444f4f, 0x3feea7d902d47c65, 0x3c9432e62b64c035, 0x3feea76f15ad2148, 0x3c914d1e4218319f, 0x3feea707ca0cbf0f, 0xbc82e1648e50a17c, 0x3feea6a320dceb71, 0x3c971c93709313f4, 0x3feea6411b078d26, 0xbc8ce44a6199769f, 0x3feea5e1b976dc09, 0x3c7f88303b60d222, 0x3feea584fd15612a, 0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, 0x3c70125ca18d4b5b, 0x3feea4d3778bc944, 0xbc8c33c53bef4da8, 0x3feea47eb03a5585, 0x3c9592ea73798b11, 0x3feea42c91c56acd, 0x3c917ecda8a72159, 0x3feea3dd1d1929fd, 0xbc9371d6d7d75739, 0x3feea390532205d8, 0xbc845378892be9ae, 0x3feea34634ccc320, 0xbc8ac05fd996f807, 0x3feea2fec30678b7, 0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, 0xbc91f5067d03653a, 0x3feea277e8dcc390, 0xbc93cedd78565858, 0x3feea23882552225, 0x3c917339c86ce3ad, 0x3feea1fbcc140be7, 0xbc85c33fdf910406, 0x3feea1c1c70833f6, 0xbc77e66065ba2500, 0x3feea18a7420a036, 0x3c5710aa807e1964, 0x3feea155d44ca973, 0x3c964c827ee6b49a, 0x3feea123e87bfb7a, 0x3c81079ab5789604, 0x3feea0f4b19e9538, 0xbc928311a3c73480, 0x3feea0c830a4c8d4, 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, 0x3c882c79e185e981, 0x3feea077541ee718, 0x3c727df161cd7778, 0x3feea052fa75173e, 0xbc8b48cea80b043b, 0x3feea0315a736c75, 0xbc6a12ad8734b982, 0x3feea012750bdabf, 0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09, 0x3c93f9924a05b767, 0x3fee9fdcddd47645, 0x3c954835dd4b7548, 0x3fee9fc62dea2f8a, 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, 0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8, 0xbc87557939a8b5ef, 0x3fee9f9298593ae5, 0xbc8f652fde52775c, 0x3fee9f86e7ba9fef, 0xbc80dc3d54e08851, 
0x3fee9f7df9519484, 0xbc7b0300defbcf98, 0x3fee9f77ce1303f6, 0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, 0xbc89dab646035dc0, 0x3fee9f73c4eaa988, 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, 0xbc91f0c230588dde, 0x3fee9f7ad3ef9011, 0xbc88e67a9006c909, 0x3fee9f8286ead08a, 0x3c9106450507a28c, 0x3fee9f8d02d50b8f, 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, 0xbc9129729a10f3a0, 0x3fee9faa5953c849, 0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, 0x3c781a70a5124f67, 0x3fee9fd2df29ce7c, 0xbc8619321e55e68a, 0x3fee9feb564267c9, 0x3c941626ea62646d, 0x3feea0069c1a861d, 0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, 0xbc940b9f54365b7c, 0x3feea04597eeba8f, 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, 0x3c873455e0e826c1, 0x3feea08fda749e5d, 0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, 0x3c94f006ad874e3e, 0x3feea0e56b7fcf03, 0xbc7b32dcb94da51d, 0x3feea11473eb0187, 0xbc8f6d693d0973bb, 0x3feea14652e958aa, 0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, 0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec, 0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 0xbc88b25e045d207b, 0x3feea22a4456e7a3, 0x3c87d51410fd15c2, 0x3feea26a62ff86f0, 0xbc69cb3314060ca7, 0x3feea2ad5e2850ac, 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, 0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9, 0xbc760a3629969871, 0x3feea3878491c491, 0x3c94aa7212bfa73c, 0x3feea3d5fbab091f, 0xbc88a1c52fb3cf42, 0x3feea427543e1a12, 0xbc81e688272a8a12, 0x3feea47b8f4abaa9, 0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, 0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a, 0xbc9369b6f13b3734, 0x3feea589994cce13, 0x3c8a1e274eed4476, 0x3feea5e968443d9a, 0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, 0x3c94a533a59324da, 0x3feea6b1bdadb46d, 0xbc805e843a19ff1e, 0x3feea71a4623c7ad, 0x3c7a56d2760d087d, 0x3feea785b91e07f1, 0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, 0x3c91682c1c6e8b05, 0x3feea86562ab00ec, 0xbc94d450d872576e, 0x3feea8d99b4492ed, 0x3c89ea99cf7a9591, 0x3feea950c27004c2, 0x3c7c88549b958471, 0x3feea9cad931a436, 0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957, 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, 0x3c909b176e05a9cd, 0x3feeab4ac52be8f7, 0x3c931143962f7877, 
0x3feeabd0a478580f, 0x3c711607f1952c95, 0x3feeac597875c644, 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 0x3c869608f0f86431, 0x3feead74029db01e, 0x3c93e9e96f112479, 0x3feeae05bad61778, 0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598, 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, 0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6, 0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, 0x3c81c1701c359530, 0x3feeb10afc931857, 0x3c7bf68359f35f44, 0x3feeb1ae99157736, 0xbc8edb1bf6809287, 0x3feeb2553499284b, 0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, 0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c, 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, 0xbc93fc025e1db9ce, 0x3feeb50dad829e70, 0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, 0xbc8d737c7d71382e, 0x3feeb67bff148396, 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, 0x3c6ae88c43905293, 0x3feeb7f669e2802b, 0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, 0xbc93d1f7661fe51b, 0x3feeb97cf65253d1, 0xbc6c23f97c90b959, 0x3feeba44cbc8520f, 0x3c651b68797ffc1c, 0x3feebb0faccf9243, 0xbc51669428996971, 0x3feebbdd9a7670b3, 0x3c54579c5ceed70b, 0x3feebcae95cba768, 0xbc92434322f4f9aa, 0x3feebd829fde4e50, 0x3c87298413381667, 0x3feebe59b9bddb5b, 0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, 0xbc905000be64e965, 0x3feec01121235681, 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, 0xbc89fb12e3454b73, 0x3feec1d4d47f2598, 0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, 0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3, 0x3c71affc2b91ce27, 0x3feec49182a3f090, 0x3c90622b15810eea, 0x3feec581414380f2, 0xbc8a1e58414c07d3, 0x3feec674194bb8d5, 0x3be9a5ecc875d327, 0x3feec76a0bcfc15e, 0x3c6dd235e10a73bb, 0x3feec86319e32323, 0x3c88ea486a3350ef, 0x3feec95f4499c647, 0xbc79740b58a20091, 0x3feeca5e8d07f29e, 0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb, 0xbc87c50422622263, 0x3feecc667b5de565, 0x3c89c31f7e38028b, 0x3feecd6f23701b15, 0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, 0xbc5fac13f4e005a3, 0x3feecf89dacfe68c, 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, 0x3c7d8aced7162e89, 0x3feed1b1231475f7, 0xbc903d5cbe27874b, 0x3feed2c980460ad8, 0xbc848f50cea7269f, 0x3feed3e504f696b1, 0xbc91bbd1d3bcbb15, 
0x3feed503b23e255d, 0x3c821eb9a08a0542, 0x3feed625893523d4, 0x3c5986178980fce0, 0x3feed74a8af46052, 0xbc6133a953131cfd, 0x3feed872b8950a73, 0x3c90cc319cee31d2, 0x3feed99e1330b358, 0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca, 0xbc89472975b1f2a5, 0x3feedbfe53c12e59, 0xbc90260cf07cb311, 0x3feedd333beb0b7e, 0x3c8469846e735ab3, 0x3feede6b5579fdbf, 0x3c1bca400a7b939d, 0x3feedfa6a1897fd2, 0x3c7d8157a34b7e7f, 0x3feee0e521356eba, 0x3c9140bc34dfc19f, 0x3feee226d59a09ee, 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, 0xbc8c9b1da461ab87, 0x3feee4b3e100301e, 0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, 0x3c8c115f23ebea8e, 0x3feee74dcca5a413, 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, 0xbc6dcab99f23f84e, 0x3feee9f4a17a4735, 0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, 0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4, 0xbc907b8f4ad1d9fa, 0x3feeee07298db666, 0x3c915b1397075f04, 0x3feeef692a8fa8cd, 0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, 0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a, 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, 0xbc86a510f31e13e6, 0x3feef511c43bbd62, 0xbc7274aedac8ff80, 0x3feef68415b749b1, 0xbc92887ea88e7340, 0x3feef7f9ade433c6, 0xbc90a40e3da6f640, 0x3feef9728de5593a, 0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87, 0x3c85c620ce76df06, 0x3feefc6e29f1c52a, 0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6, 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, 0xbc8d1bf10460dba0, 0x3fef01004b3a7804, 0xbc8fda52e1b51e41, 0x3fef028cf22749e4, 0x3c8e5d80813dddfc, 0x3fef041ce8e77680, 0xbc91eee26b588a35, 0x3fef05b030a1064a, 0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7, 0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, 0x3c7a77557fd62db3, 0x3fef0a7df9285775, 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, 0xbc651ba6128db749, 0x3fef0dc27e2cb5e5, 0xbc302899507554e5, 0x3fef0f69c3f3a207, 0xbc7c0ffefdc5e251, 0x3fef111462c95b60, 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, 0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30, 0xbc80dda2d4c0010c, 0x3fef16286141b33d, 0x3c923759b8aca76d, 0x3fef17e06ff301f4, 0x3c736eae30af0cb3, 0x3fef199bdd85529c, 0xbc895498a73dac7d, 0x3fef1b5aab23e61e, 0xbc8a007daadf8d68, 
0x3fef1d1cd9fa652c, 0x3c851de924583108, 0x3fef1ee26b34e065, 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, 0xbc8c5fe4051ba06c, 0x3fef2277b9881650, 0x3c836909391181d3, 0x3fef244778fafb22, 0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad, 0x3c84e08fd10959ac, 0x3fef27f12e57d14b, 0xbc7af5c67c4e8235, 0x3fef29cb269e601f, 0xbc811cd7dbdf9547, 0x3fef2ba88988c933, 0xbc8304ef0045d575, 0x3fef2d89584661a1, 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, 0x3c8725f94f910375, 0x3fef31553dfa8313, 0xbc7ac28b7bef6621, 0x3fef33405751c4db, 0x3c7b53e99f9191e8, 0x3fef352ee13da7cb, 0x3c676b2c6c921968, 0x3fef3720dcef9069, 0xbc810a79e6d7e2b8, 0x3fef39164b994d23, 0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, 0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f, 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, 0x3c549eeef9ec910c, 0x3fef410e9be12cb9, 0xbc8cc734592af7fc, 0x3fef43155b5bab74, 0xbc8335827ffb9dce, 0x3fef451f95018d17, 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, 0x3c645563980ef762, 0x3fef493e7ba2c38c, 0x3c87752a44f587e8, 0x3fef4b532b08c968, 0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c, 0xbc900dae3875a949, 0x3fef4f87080d89f2, 0xbc8aab80ceab2b4a, 0x3fef51a638197a3c, 0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, 0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f, 0x3c74a385a63d07a7, 0x3fef5818dcfba487, 0x3c83c119f18464c5, 0x3fef5a461eec14be, 0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, 0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b, 0xbc82919e2040220f, 0x3fef60e316c98398, 0xbc72550d76be719a, 0x3fef631e7e2d479d, 0x3c8c254d16117a68, 0x3fef655d71ff6075, 0xbc82090274667d12, 0x3fef679ff37adb4a, 0x3c8e5a50d5c192ac, 0x3fef69e603db3285, 0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd, 0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, 0x3c890de9296f4cd1, 0x3fef70cd9ab294e4, 0x3c843a59ac016b4b, 0x3fef7321f301b460, 0x3c832ff9978b34bc, 0x3fef7579e065807d, 0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, 0xbc7303b63dda1980, 0x3fef7a347f63c159, 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, 0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1, 0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, 0x3c768d9144ae12fc, 0x3fef83d4f11f8220, 0xbc892ab93b470dc9, 
0x3fef864614f5a129, 0x3c853687f542403b, 0x3fef88bad7dcee90, 0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, 0xbc736ed2de40b407, 0x3fef8daf3fe592e8, 0x3c74b604603a88d3, 0x3fef902ee78b3ff6, 0xbc614ef56c770f3b, 0x3fef92b2334ac7ee, 0xbc776caa4c2ff1cf, 0x3fef953924676d76, 0x3c8df7d1353d8e88, 0x3fef97c3bc24e350, 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, 0xbc850bed64091b8a, 0x3fef9ce3e4933c7e, 0xbc81d5fc525d9940, 0x3fef9f7977cdb740, 0x3c89d852381c317f, 0x3fefa212b6bc3181, 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, 0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5, 0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, 0xbc5a1f25ce94cae7, 0x3fefac9c80faa594, 0xbc8dae98e223747d, 0x3fefaf482d8e67f1, 0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2, 0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 0x3c737e8ae802b851, 0x3fefb7616ca06dd6, 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 0x3c875119560e34af, 0x3fefbcda28a52e59, 0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, 0xbc7431c3840929c6, 0x3fefc261cbdf5be7, 0x3c842b94c3a9eb32, 0x3fefc52b376bba97, 0xbc8cb472d2e86b99, 0x3fefc7f860a70c22, 0xbc69fa74878ba7c7, 0x3fefcac948dd7274, 0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac, 0x3c8a64a931d185ee, 0x3fefd0765b6e4540, 0x3c8eef18336b62e3, 0x3fefd35288633625, 0x3c901f3a75ee0efe, 0x3fefd632798844f8, 0x3c80d23f87b50a2a, 0x3fefd916302bd526, 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, 0x3c8302dee657c8e6, 0x3fefdee8f32a4b45, 0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, 0xbc7b0caa080df170, 0x3fefe4cadbdac61d, 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, 0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54, 0xbc699c7db2effc76, 0x3fefedba3692d514, 0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad, 0x3c5305c14160cc89, 0x3feff3c22b8f71f1, 0x3c8e70b094fa075a, 0x3feff6cbe15f6314, 0x3c64b458677f9840, 0x3feff9d96b2a23d9, 0xbc72ec9a3e5d680a, 0x3feffceaca4391b6, #endif }, }; diff --git a/contrib/arm-optimized-routines/math/expf.c b/contrib/arm-optimized-routines/pl/math/expf.c similarity index 70% copy from contrib/arm-optimized-routines/math/expf.c copy to contrib/arm-optimized-routines/pl/math/expf.c index 
9b2f0c3d8c56..c325e45d5cc6 100644 --- a/contrib/arm-optimized-routines/math/expf.c +++ b/contrib/arm-optimized-routines/pl/math/expf.c @@ -1,91 +1,76 @@ /* * Single-precision e^x function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* -EXP2F_TABLE_BITS = 5 -EXP2F_POLY_ORDER = 3 +EXPF_TABLE_BITS = 5 +EXPF_POLY_ORDER = 3 ULP error: 0.502 (nearest rounding.) Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) Wrong count: 170635 (all nearest rounding wrong results with fma.) Non-nearest ULP error: 1 (rounded ULP error) */ -#define N (1 << EXP2F_TABLE_BITS) -#define InvLn2N __exp2f_data.invln2_scaled -#define T __exp2f_data.tab -#define C __exp2f_data.poly_scaled +#define N (1 << EXPF_TABLE_BITS) +#define InvLn2N __expf_data.invln2_scaled +#define T __expf_data.tab +#define C __expf_data.poly_scaled static inline uint32_t top12 (float x) { return asuint (x) >> 20; } float -expf (float x) +optr_aor_exp_f32 (float x) { uint32_t abstop; uint64_t ki, t; /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t kd, xd, z, r, r2, y, s; xd = (double_t) x; abstop = top12 (x) & 0x7ff; if (unlikely (abstop >= top12 (88.0f))) { /* |x| >= 88 or x is nan. */ if (asuint (x) == asuint (-INFINITY)) return 0.0f; if (abstop >= top12 (INFINITY)) return x + x; if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ return __math_oflowf (0); if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ return __math_uflowf (0); -#if WANT_ERRNO_UFLOW - if (x < -0x1.9d1d9ep6f) /* x < log(0x1p-149) ~= -103.28 */ - return __math_may_uflowf (0); -#endif } /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. 
*/ z = InvLn2N * xd; /* Round and convert z to int, the result is in [-150*N, 128*N] and ideally nearest int is used, otherwise the magnitude of r can be bigger which gives larger approximation error. */ -#if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); -#else -# define SHIFT __exp2f_data.shift - kd = eval_as_double (z + SHIFT); - ki = asuint64 (kd); - kd -= SHIFT; -#endif r = z - kd; /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ t = T[ki % N]; - t += ki << (52 - EXP2F_TABLE_BITS); + t += ki << (52 - EXPF_TABLE_BITS); s = asdouble (t); z = C[0] * r + C[1]; r2 = r * r; y = C[2] * r + 1; y = z * r2 + y; y = y * s; return eval_as_float (y); } -#if USE_GLIBC_ABI -strong_alias (expf, __expf_finite) -hidden_alias (expf, __ieee754_expf) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/expf_data.c b/contrib/arm-optimized-routines/pl/math/expf_data.c new file mode 100644 index 000000000000..474ad57a29a0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expf_data.c @@ -0,0 +1,31 @@ +/* + * Coeffs and table entries for single-precision exp. Copied from + * math/exp2f_data.c, with EXP2F_TABLE_BITS == 32. + * + * Copyright (c) 2017-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXPF_TABLE_BITS) + +const struct expf_data __expf_data = { + /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) + used for computing 2^(k/N) for an int |k| < 150 N as + double(tab[k%N] + (k << 52-BITS)) */ + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, + .invln2_scaled = 0x1.71547652b82fep+0 * N, + .poly_scaled = { + 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c new file mode 100644 index 000000000000..a3faff70cb62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c @@ -0,0 +1,86 @@ +/* + * Double-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define InvLn2 0x1.71547652b82fep0 +#define Ln2hi 0x1.62e42fefa39efp-1 +#define Ln2lo 0x1.abc9e3b39803fp-56 +#define Shift 0x1.8p52 +#define TinyBound \ + 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define BigBound 0x1.63108c75a1937p+9 /* Above which expm1(x) overflows. 
*/ +#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to 1. */ +#define AbsMask 0x7fffffffffffffff + +#define C(i) __expm1_poly[i] + +/* Approximation for exp(x) - 1 using polynomial on a reduced interval. + The maximum error observed error is 2.17 ULP: + expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2 + want 0x1.a9af566038788p-2. */ +double +expm1 (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ax = ix & AbsMask; + + /* Tiny, +Infinity. */ + if (ax <= TinyBound || ix == 0x7ff0000000000000) + return x; + + /* +/-NaN. */ + if (ax > 0x7ff0000000000000) + return __math_invalid (x); + + /* Result is too large to be represented as a double. */ + if (x >= 0x1.63108c75a1937p+9) + return __math_oflow (0); + + /* Result rounds to -1 in double precision. */ + if (x <= NegBound) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + double f2 = f * f; + double f4 = f2 * f2; + double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + double t = ldexp (0.5, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). 
*/ + return 2 * fma (p, t, t - 0.5); +} + +PL_SIG (S, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (expm1, 1.68) +PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000) +PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000) +PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) +PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) +PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/expm1_data.c b/contrib/arm-optimized-routines/pl/math/expm1_data.c new file mode 100644 index 000000000000..ff7426b90135 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1_data.c @@ -0,0 +1,21 @@ +/* + * Coefficients for double-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1.sollya for details. */ +const double __expm1_poly[] = {0x1p-1, + 0x1.5555555555559p-3, + 0x1.555555555554bp-5, + 0x1.111111110f663p-7, + 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, + 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, + 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, + 0x1.1f143d060a28ap-29}; diff --git a/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c new file mode 100644 index 000000000000..70b14e48519d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c @@ -0,0 +1,80 @@ +/* + * Single-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define InfLimit \ + (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */ +#define NegLimit \ + (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */ + +#define C(i) __expm1f_poly[i] + +/* Approximation for exp(x) - 1 using polynomial on a reduced interval. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. */ +float +expm1f (float x) +{ + uint32_t ix = asuint (x); + uint32_t ax = ix & AbsMask; + + /* Tiny: |x| < 0x1p-23. expm1(x) is closely approximated by x. + Inf: x == +Inf => expm1(x) = x. */ + if (ax <= 0x34000000 || (ix == 0x7f800000)) + return x; + + /* +/-NaN. */ + if (ax > 0x7f800000) + return __math_invalidf (x); + + if (x >= InfLimit) + return __math_oflowf (0); + + if (x <= NegLimit || ix == 0xff800000) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + float p = fmaf (f * f, HORNER_4 (f, C), f); + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + float t = ldexpf (0.5f, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). 
*/ + return 2 * fmaf (p, t, t - 0.5f); +} + +PL_SIG (S, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (expm1f, 1.02) +PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) +PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) diff --git a/contrib/arm-optimized-routines/pl/math/expm1f_data.c b/contrib/arm-optimized-routines/pl/math/expm1f_data.c new file mode 100644 index 000000000000..9d02dc448ebb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1f_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1f.sollya for details. */ +const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, + 0x1.12287cp-7, 0x1.6b55a2p-10}; diff --git a/contrib/arm-optimized-routines/pl/math/horner.h b/contrib/arm-optimized-routines/pl/math/horner.h new file mode 100644 index 000000000000..f92ab6752110 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/horner_wrap.h b/contrib/arm-optimized-routines/pl/math/horner_wrap.h new file mode 100644 index 000000000000..6478968db913 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/horner_wrap.h @@ -0,0 +1,34 @@ +/* + * Helper macros for Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i)) +#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) +#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) +#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) +#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i)) +#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i)) +#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i)) +#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i)) +#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i)) +#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i)) +#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i)) +#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i)) + +#define HORNER_1(x, c) HORNER_1_ (x, c, 0) +#define HORNER_2(x, c) HORNER_2_ (x, c, 0) +#define HORNER_3(x, c) HORNER_3_ (x, c, 0) +#define HORNER_4(x, c) HORNER_4_ (x, c, 0) +#define HORNER_5(x, c) HORNER_5_ (x, c, 0) +#define HORNER_6(x, c) HORNER_6_ (x, c, 0) +#define HORNER_7(x, c) HORNER_7_ (x, c, 0) +#define HORNER_8(x, c) HORNER_8_ (x, c, 0) +#define HORNER_9(x, c) HORNER_9_ (x, c, 0) +#define HORNER_10(x, c) HORNER_10_(x, c, 0) +#define HORNER_11(x, c) HORNER_11_(x, c, 0) +#define HORNER_12(x, c) HORNER_12_(x, c, 0) +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/hornerf.h b/contrib/arm-optimized-routines/pl/math/hornerf.h new file mode 100644 index 000000000000..0703817b0fbb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/hornerf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/include/mathlib.h b/contrib/arm-optimized-routines/pl/math/include/mathlib.h new file mode 100644 index 000000000000..af5f9f9c6afb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/include/mathlib.h @@ -0,0 +1,244 @@ +// clang-format off +/* + * Public API. + * + * Copyright (c) 2015-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _MATHLIB_H +#define _MATHLIB_H + +float acoshf (float); +float asinhf (float); +float atan2f (float, float); +float atanf (float); +float atanhf (float); +float cbrtf (float); +float coshf (float); +float erfcf (float); +float erff (float); +float expm1f (float); +float log10f (float); +float log1pf (float); +float sinhf (float); +float tanf (float); +float tanhf (float); + +double acosh (double); +double asinh (double); +double atan (double); +double atan2 (double, double); +double atanh (double); +double cbrt (double); +double cosh (double); +double erfc (double); +double expm1 (double); +double log10 (double); +double log1p (double); +double sinh (double); +double tanh (double); + +float __s_acoshf (float); +float __s_asinhf (float); +float __s_atanf (float); +float __s_atan2f (float, float); +float __s_atanhf (float); +float __s_cbrtf (float); +float __s_coshf (float); +float __s_erfcf (float); +float __s_erff (float); +float __s_expm1f (float); +float __s_log10f (float); +float __s_log1pf (float); +float __s_log2f (float); +float __s_sinhf (float); +float __s_tanf (float); +float __s_tanhf (float); + +double __s_acosh (double); +double __s_asinh (double); +double __s_atan (double); +double __s_atan2 (double, double); +double __s_atanh (double); +double __s_cbrt (double); +double __s_cosh (double); +double __s_erf (double); +double __s_erfc (double); 
+double __s_expm1 (double); +double __s_log10 (double); +double __s_log1p (double); +double __s_log2 (double); +double __s_sinh (double); +double __s_tan (double); +double __s_tanh (double); + +#if __aarch64__ +#if __GNUC__ >= 5 +typedef __Float32x4_t __f32x4_t; +typedef __Float64x2_t __f64x2_t; +#elif __clang_major__*100+__clang_minor__ >= 305 +typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; +typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; +#else +#error Unsupported compiler +#endif + +/* Vector functions following the base PCS. */ +__f32x4_t __v_acoshf (__f32x4_t); +__f64x2_t __v_acosh (__f64x2_t); +__f32x4_t __v_asinhf (__f32x4_t); +__f64x2_t __v_asinh (__f64x2_t); +__f32x4_t __v_atanf (__f32x4_t); +__f64x2_t __v_atan (__f64x2_t); +__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); +__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); +__f32x4_t __v_atanhf (__f32x4_t); +__f64x2_t __v_atanh (__f64x2_t); +__f32x4_t __v_cbrtf (__f32x4_t); +__f64x2_t __v_cbrt (__f64x2_t); +__f32x4_t __v_coshf (__f32x4_t); +__f64x2_t __v_cosh (__f64x2_t); +__f32x4_t __v_erff (__f32x4_t); +__f64x2_t __v_erf (__f64x2_t); +__f32x4_t __v_erfcf (__f32x4_t); +__f64x2_t __v_erfc (__f64x2_t); +__f32x4_t __v_expm1f (__f32x4_t); +__f64x2_t __v_expm1 (__f64x2_t); +__f32x4_t __v_log10f (__f32x4_t); +__f64x2_t __v_log10 (__f64x2_t); +__f32x4_t __v_log1pf (__f32x4_t); +__f64x2_t __v_log1p (__f64x2_t); +__f32x4_t __v_log2f (__f32x4_t); +__f64x2_t __v_log2 (__f64x2_t); +__f32x4_t __v_sinhf (__f32x4_t); +__f64x2_t __v_sinh (__f64x2_t); +__f32x4_t __v_tanf (__f32x4_t); +__f64x2_t __v_tan (__f64x2_t); +__f32x4_t __v_tanhf (__f32x4_t); +__f64x2_t __v_tanh (__f64x2_t); + +#if __GNUC__ >= 9 || __clang_major__ >= 8 +#define __vpcs __attribute__((__aarch64_vector_pcs__)) + +/* Vector functions following the vector PCS. 
*/ +__vpcs __f32x4_t __vn_acoshf (__f32x4_t); +__vpcs __f64x2_t __vn_acosh (__f64x2_t); +__vpcs __f32x4_t __vn_asinhf (__f32x4_t); +__vpcs __f64x2_t __vn_asinh (__f64x2_t); +__vpcs __f32x4_t __vn_atanf (__f32x4_t); +__vpcs __f64x2_t __vn_atan (__f64x2_t); +__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t __vn_atanhf (__f32x4_t); +__vpcs __f64x2_t __vn_atanh (__f64x2_t); +__vpcs __f32x4_t __vn_cbrtf (__f32x4_t); +__vpcs __f64x2_t __vn_cbrt (__f64x2_t); +__vpcs __f32x4_t __vn_coshf (__f32x4_t); +__vpcs __f64x2_t __vn_cosh (__f64x2_t); +__vpcs __f32x4_t __vn_erff (__f32x4_t); +__vpcs __f64x2_t __vn_erf (__f64x2_t); +__vpcs __f32x4_t __vn_erfcf (__f32x4_t); +__vpcs __f64x2_t __vn_erfc (__f64x2_t); +__vpcs __f32x4_t __vn_expm1f (__f32x4_t); +__vpcs __f64x2_t __vn_expm1 (__f64x2_t); +__vpcs __f32x4_t __vn_log10f (__f32x4_t); +__vpcs __f64x2_t __vn_log10 (__f64x2_t); +__vpcs __f32x4_t __vn_log1pf (__f32x4_t); +__vpcs __f64x2_t __vn_log1p (__f64x2_t); +__vpcs __f32x4_t __vn_log2f (__f32x4_t); +__vpcs __f64x2_t __vn_log2 (__f64x2_t); +__vpcs __f32x4_t __vn_sinhf (__f32x4_t); +__vpcs __f64x2_t __vn_sinh (__f64x2_t); +__vpcs __f32x4_t __vn_tanf (__f32x4_t); +__vpcs __f64x2_t __vn_tan (__f64x2_t); +__vpcs __f32x4_t __vn_tanhf (__f32x4_t); +__vpcs __f64x2_t __vn_tanh (__f64x2_t); + +/* Vector functions following the vector PCS using ABI names. 
*/ +__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); + +#endif + +#if WANT_SVE_MATH +#include +svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t); +svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); +svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); +svfloat32_t __sv_erff_x (svfloat32_t, svbool_t); +svfloat64_t __sv_erf_x (svfloat64_t, svbool_t); +svfloat64_t __sv_erfc_x 
(svfloat64_t, svbool_t); +svfloat32_t __sv_expf_x (svfloat32_t, svbool_t); +svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log2_x (svfloat64_t, svbool_t); +svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); +svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); +svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); +svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); +/* SVE ABI names. */ +svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); +svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); +svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); +#endif + +#endif + +#endif +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/include/pl_test.h b/contrib/arm-optimized-routines/pl/math/include/pl_test.h new 
file mode 100644 index 000000000000..6a81360ba287 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/include/pl_test.h @@ -0,0 +1,26 @@ +/* + * PL macros to aid testing. This version of this file is used for building the + * routine, not the tests. Separate definitions are found in test/pl_test.h + * which emit test parameters. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +/* Emit max ULP threshold - silenced for building the routine. */ +#define PL_TEST_ULP(f, l) + +/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of + strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is + also added to the test suite. */ +#define PL_ALIAS(a, b) strong_alias (a, b) + +/* Emit routine name if e == 1 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally upon certain + build flags - defer expansion by one pass to allow those flags to be expanded + properly. */ +#define PL_TEST_EXPECT_FENV(f, e) +#define PL_TEST_EXPECT_FENV_ALWAYS(f) + +#define PL_TEST_INTERVAL(f, lo, hi, n) +#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) diff --git a/contrib/arm-optimized-routines/math/log.c b/contrib/arm-optimized-routines/pl/math/log.c similarity index 80% copy from contrib/arm-optimized-routines/math/log.c copy to contrib/arm-optimized-routines/pl/math/log.c index d3b7bc60747c..40b0441d981d 100644 --- a/contrib/arm-optimized-routines/math/log.c +++ b/contrib/arm-optimized-routines/pl/math/log.c @@ -1,162 +1,161 @@ /* * Double-precision log(x) function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include "math_config.h" #define T __log_data.tab #define T2 __log_data.tab2 #define B __log_data.poly1 #define A __log_data.poly #define Ln2hi __log_data.ln2hi #define Ln2lo __log_data.ln2lo #define N (1 << LOG_TABLE_BITS) #define OFF 0x3fe6000000000000 /* Top 16 bits of a double. */ static inline uint32_t top16 (double x) { return asuint64 (x) >> 48; } double -log (double x) +optr_aor_log_f64 (double x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; uint64_t ix, iz, tmp; uint32_t top; int k, i; ix = asuint64 (x); top = top16 (x); #if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11 -# define LO asuint64 (1.0 - 0x1p-5) -# define HI asuint64 (1.0 + 0x1.1p-5) +#define LO asuint64 (1.0 - 0x1p-5) +#define HI asuint64 (1.0 + 0x1.1p-5) #elif LOG_POLY1_ORDER == 12 -# define LO asuint64 (1.0 - 0x1p-4) -# define HI asuint64 (1.0 + 0x1.09p-4) +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) #endif if (unlikely (ix - LO < HI - LO)) { /* Handle close to 1.0 inputs separately. */ /* Fix sign of zero with downward rounding when x==1. */ if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) return 0; r = x - 1.0; r2 = r * r; r3 = r * r2; #if LOG_POLY1_ORDER == 10 /* Worst-case error is around 0.516 ULP. */ - y = r3 * (B[1] + r * B[2] + r2 * B[3] - + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); w = B[0] * r2; /* B[0] == -0.5. */ hi = r + w; y += r - hi + w; y += hi; #elif LOG_POLY1_ORDER == 11 /* Worst-case error is around 0.516 ULP. 
*/ - y = r3 * (B[1] + r * B[2] - + r2 * (B[3] + r * B[4] + r2 * B[5] - + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9]))); + y = r3 + * (B[1] + r * B[2] + + r2 + * (B[3] + r * B[4] + r2 * B[5] + + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9]))); w = B[0] * r2; /* B[0] == -0.5. */ hi = r + w; y += r - hi + w; y += hi; #elif LOG_POLY1_ORDER == 12 - y = r3 * (B[1] + r * B[2] + r2 * B[3] - + r3 * (B[4] + r * B[5] + r2 * B[6] - + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); -# if N <= 64 + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); +#if N <= 64 /* Worst-case error is around 0.532 ULP. */ w = B[0] * r2; /* B[0] == -0.5. */ hi = r + w; y += r - hi + w; y += hi; -# else +#else /* Worst-case error is around 0.507 ULP. */ w = r * 0x1p27; double_t rhi = r + w - w; double_t rlo = r - rhi; w = rhi * rhi * B[0]; /* B[0] == -0.5. */ hi = r + w; lo = r - hi + w; lo += B[0] * rlo * (rhi + r); y += lo; y += hi; -# endif +#endif #endif return eval_as_double (y); } if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) { /* x < 0x1p-1022 or inf or nan. */ if (ix * 2 == 0) return __math_divzero (1); if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ return x; if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) return __math_invalid (x); /* x is subnormal, normalize it. */ ix = asuint64 (x * 0x1p52); ix -= 52ULL << 52; } /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (52 - LOG_TABLE_BITS)) % N; k = (int64_t) tmp >> 52; /* arithmetic shift */ iz = ix - (tmp & 0xfffULL << 52); invc = T[i].invc; logc = T[i].logc; z = asdouble (iz); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ /* r ~= z/c - 1, |r| < 1/(2*N). */ #if HAVE_FAST_FMA /* rounding error: 0x1p-55/N. */ r = fma (z, invc, -1.0); #else /* rounding error: 0x1p-55/N + 0x1p-66. 
*/ r = (z - T2[i].chi - T2[i].clo) * invc; #endif kd = (double_t) k; /* hi + lo = r + log(c) + k*Ln2. */ w = kd * Ln2hi + logc; hi = w + r; lo = w - hi + r + kd * Ln2lo; /* log(x) = lo + (log1p(r) - r) + hi. */ r2 = r * r; /* rounding error: 0x1p-54/N^2. */ /* Worst case error if |y| > 0x1p-5: 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) Worst case error if |y| > 0x1p-4: 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). */ #if LOG_POLY_ORDER == 6 y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; #elif LOG_POLY_ORDER == 7 y = lo - + r2 * (A[0] + r * A[1] + r2 * (A[2] + r * A[3]) - + r2 * r2 * (A[4] + r * A[5])) + + r2 + * (A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + + r2 * r2 * (A[4] + r * A[5])) + hi; #endif return eval_as_double (y); } -#if USE_GLIBC_ABI -strong_alias (log, __log_finite) -hidden_alias (log, __ieee754_log) -# if LDBL_MANT_DIG == 53 -long double logl (long double x) { return log (x); } -# endif -#endif diff --git a/contrib/arm-optimized-routines/pl/math/log10_2u.c b/contrib/arm-optimized-routines/pl/math/log10_2u.c new file mode 100644 index 000000000000..74828ea9ef3c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log10_2u.c @@ -0,0 +1,150 @@ +/* + * Double-precision log10(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Polynomial coefficients and lookup tables. */ +#define T __log10_data.tab +#define T2 __log10_data.tab2 +#define B __log10_data.poly1 +#define A __log10_data.poly +#define Ln2hi __log10_data.ln2hi +#define Ln2lo __log10_data.ln2lo +#define InvLn10 __log10_data.invln10 +#define N (1 << LOG10_TABLE_BITS) +#define OFF 0x3fe6000000000000 +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) + +/* Top 16 bits of a double. 
*/ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +/* Fast and low accuracy implementation of log10. + The implementation is similar to that of math/log, except that: + - Polynomials are computed for log10(1+r) with r on same intervals as log. + - Lookup parameters are scaled (at runtime) to switch from base e to base 10. + Many errors above 1.59 ulp are observed across the whole range of doubles. + The greatest observed error is 1.61 ulp, at around 0.965: + log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6 + want -0x1.fee26884905a8p-6. */ +double +log10 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = y * InvLn10; + + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. 
*/ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift. */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* w = log(c) + k*Ln2hi. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + + /* Scale by 1/ln(10). Polynomial already contains scaling. 
*/ + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; + y = y * InvLn10; + + return eval_as_double (y); +} + +// clang-format off +#if USE_GLIBC_ABI +strong_alias (log10, __log10_finite) +hidden_alias (log10, __ieee754_log10) +#if LDBL_MANT_DIG == 53 +long double +log10l (long double x) +{ + return log10 (x); +} +#endif +#endif +// clang-format on + +PL_SIG (S, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (log10, 1.11) +PL_TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000) +PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000) +PL_TEST_INTERVAL (log10, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/math/log_data.c b/contrib/arm-optimized-routines/pl/math/log10_data.c similarity index 64% copy from contrib/arm-optimized-routines/math/log_data.c copy to contrib/arm-optimized-routines/pl/math/log10_data.c index 96a098d42c16..9976f19cd6df 100644 --- a/contrib/arm-optimized-routines/math/log_data.c +++ b/contrib/arm-optimized-routines/pl/math/log10_data.c @@ -1,511 +1,337 @@ /* - * Data for log. + * Data for log10. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#define N (1 << LOG_TABLE_BITS) +#define N (1 << LOG10_TABLE_BITS) -const struct log_data __log_data = { +const struct log10_data __log10_data = { .ln2hi = 0x1.62e42fefa3800p-1, .ln2lo = 0x1.ef35793c76730p-45, +.invln10 = 0x1.bcb7b1526e50ep-2, .poly1 = { -#if LOG_POLY1_ORDER == 10 -// relative error: 0x1.32eccc6p-62 -// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) --0x1p-1, -0x1.55555555554e5p-2, --0x1.0000000000af2p-2, -0x1.9999999bbe436p-3, --0x1.55555537f9cdep-3, -0x1.24922fc8127cfp-3, --0x1.0000b7d6bb612p-3, -0x1.c806ee1ddbcafp-4, --0x1.972335a9c2d6ep-4, -#elif LOG_POLY1_ORDER == 11 -// relative error: 0x1.52c8b708p-68 -// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) --0x1p-1, -0x1.5555555555555p-2, --0x1.ffffffffffea9p-3, -0x1.999999999c4d4p-3, --0x1.55555557f5541p-3, -0x1.249248fbe33e4p-3, --0x1.ffffc9a3c825bp-4, -0x1.c71e1f204435dp-4, --0x1.9a7f26377d06ep-4, -0x1.71c30cf8f7364p-4, -#elif LOG_POLY1_ORDER == 12 +#if LOG10_POLY1_ORDER == 12 // relative error: 0x1.c04d76cp-63 // in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) -0x1p-1, 0x1.5555555555577p-2, -0x1.ffffffffffdcbp-3, 0x1.999999995dd0cp-3, -0x1.55555556745a7p-3, 0x1.24924a344de3p-3, -0x1.fffffa4423d65p-4, 0x1.c7184282ad6cap-4, -0x1.999eb43b068ffp-4, 0x1.78182f7afd085p-4, -0x1.5521375d145cdp-4, #endif }, .poly = { -#if N == 64 && LOG_POLY_ORDER == 7 -// relative error: 0x1.906eb8ap-58 -// abs error: 0x1.d2cad5a8p-67 -// in -0x1.fp-8 0x1.fp-8 --0x1.0000000000027p-1, -0x1.555555555556ap-2, --0x1.fffffff0440bap-3, -0x1.99999991906c3p-3, --0x1.555c8d7e8201ep-3, -0x1.24978c59151fap-3, -#elif N == 128 && LOG_POLY_ORDER == 6 +#if N == 128 && LOG10_POLY_ORDER == 6 // relative error: 0x1.926199e8p-56 // abs error: 0x1.882ff33p-65 // in -0x1.fp-9 0x1.fp-9 -0x1.0000000000001p-1, 0x1.555555551305bp-2, -0x1.fffffffeb459p-3, 0x1.999b324f10111p-3, -0x1.55575e506c89fp-3, 
-#elif N == 128 && LOG_POLY_ORDER == 7 -// relative error: 0x1.649fc4bp-64 -// abs error: 0x1.c3b5769p-74 -// in -0x1.fp-9 0x1.fp-9 --0x1.0000000000001p-1, -0x1.5555555555556p-2, --0x1.fffffffea1a8p-3, -0x1.99999998e9139p-3, --0x1.555776801b968p-3, -0x1.2493c29331a5cp-3, #endif }, /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + log(z/c) log(z/c) = poly(z/c - 1) where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls into the ith one, then table entries are computed as tab[i].invc = 1/c tab[i].logc = (double)log(c) tab2[i].chi = (double)c tab2[i].clo = (double)(c - (double)c) where c is near the center of the subinterval and is chosen by trying +-2^29 floating point invc candidates around 1/center and selecting one for which 1) the rounding error in 0x1.8p9 + logc is 0, 2) the rounding error in z - chi - clo is < 0x1p-66 and 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, 2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a single rounding error when there is no fast fma for z*invc - 1, 3) ensures that logc + poly(z/c - 1) has small error, however near x == 1 when |log(x)| < 0x1p-4, this is not enough so that is special cased. 
*/ .tab = { -#if N == 64 -{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2}, -{0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2}, -{0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2}, -{0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2}, -{0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2}, -{0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2}, -{0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2}, -{0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2}, -{0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2}, -{0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2}, -{0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2}, -{0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2}, -{0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3}, -{0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3}, -{0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3}, -{0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3}, -{0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3}, -{0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3}, -{0x1.33ae463091760p+0, -0x1.7898db878d000p-3}, -{0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3}, -{0x1.2e025c9203c89p+0, -0x1.527e620845000p-3}, -{0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3}, -{0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3}, -{0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3}, -{0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3}, -{0x1.20b4703174157p+0, -0x1.ec738fee40000p-4}, -{0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4}, -{0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4}, -{0x1.194538e960658p+0, -0x1.8197efba9a000p-4}, -{0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4}, -{0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4}, -{0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4}, -{0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5}, -{0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5}, -{0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5}, -{0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5}, -{0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6}, -{0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6}, -{0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7}, -{0x1.01010152cf066p+0, -0x1.0080a711c0000p-8}, -{0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8}, 
-{0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6}, -{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5}, -{0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5}, -{0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4}, -{0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4}, -{0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4}, -{0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4}, -{0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4}, -{0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3}, -{0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3}, -{0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3}, -{0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3}, -{0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3}, -{0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3}, -{0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3}, -{0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3}, -{0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3}, -{0x1.8d3018b58699ap-1, 0x1.040259974e000p-2}, -{0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2}, -{0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2}, -{0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2}, -{0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2}, -{0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2}, -#elif N == 128 +#if N == 128 {0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, {0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, {0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, {0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, {0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, {0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, {0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, {0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, {0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, {0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, {0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, {0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, {0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, {0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, {0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, {0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, {0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, {0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, {0x1.50f22dbb2bddfp+0, 
-0x1.1956d2b48f800p-2}, {0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, {0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, {0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, {0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, {0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, {0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, {0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, {0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, {0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, {0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, {0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, {0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, {0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, {0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, {0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, {0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, {0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, {0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, {0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, {0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, {0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, {0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, {0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, {0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, {0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, {0x1.293726014b530p+0, -0x1.31b996b490000p-3}, {0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, {0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, {0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, {0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, {0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, {0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, {0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, {0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, {0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, {0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, {0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, {0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, {0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, {0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, {0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, {0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, 
{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, {0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, {0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, {0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, {0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, {0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, {0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, {0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, {0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, {0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, {0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, {0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, {0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, {0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, {0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, {0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, {0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, {0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, {0x1.008040614b195p+0, -0x1.0040979240000p-9}, {0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, {0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, {0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, {0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, {0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, {0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, {0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, {0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, {0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, {0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, {0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, {0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, {0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, {0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, {0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, {0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, {0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, {0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, {0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, {0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, {0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, {0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, {0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, {0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, 
{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, {0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, {0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, {0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, {0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, {0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, {0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, {0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, {0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, {0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, {0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, {0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, {0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, {0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, {0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, {0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, {0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, {0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, {0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, {0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, {0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, {0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, {0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, {0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, #endif }, #if !HAVE_FAST_FMA .tab2 = { -# if N == 64 -{0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56}, -{0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55}, -{0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55}, -{0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56}, -{0x1.720000b37216ep-1, 0x1.802bc8d437043p-55}, -{0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57}, -{0x1.7a0000628daep-1, -0x1.e00434b49313dp-56}, -{0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56}, -{0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57}, -{0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56}, -{0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57}, -{0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55}, -{0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55}, -{0x1.960000135d8eap-1, -0x1.f832268dc3095p-55}, -{0x1.99ffff9435acp-1, 0x1.7031d8b835edcp-56}, -{0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58}, -{0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55}, 
-{0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55}, -{0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57}, -{0x1.adffff29ce03dp-1, -0x1.fff0717ec71c2p-56}, -{0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57}, -{0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57}, -{0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56}, -{0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55}, -{0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58}, -{0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55}, -{0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58}, -{0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59}, -{0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59}, -{0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58}, -{0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55}, -{0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55}, -{0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55}, -{0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60}, -{0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55}, -{0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55}, -{0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56}, -{0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57}, -{0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58}, -{0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58}, -{0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54}, -{0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55}, -{0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54}, -{0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54}, -{0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55}, -{0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57}, -{0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54}, -{0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55}, -{0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54}, -{0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55}, -{0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56}, -{0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54}, -{0x1.320000324c55bp+0, 0x1.f81983997354fp-54}, -{0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54}, -{0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54}, -{0x1.3e0000751c61bp+0, 0x1.80583ed1c566ep-56}, -{0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54}, -{0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55}, 
-{0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55}, -{0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56}, -{0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54}, -{0x1.560000e342455p+0, 0x1.3fb7fac8ac151p-55}, -{0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55}, -{0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54}, -# elif N == 128 +#if N == 128 {0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, {0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, {0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, {0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, {0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, {0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, {0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, {0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, {0x1.710000e86978p-1, 0x1.bff6671097952p-56}, {0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, {0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, {0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, {0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, {0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, {0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, {0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, {0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, {0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, {0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, {0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, {0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, {0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, {0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, {0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, {0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, {0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, {0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, {0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, {0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, {0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, {0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, {0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, {0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, {0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, {0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, {0x1.a70001df52e48p-1, 
-0x1.f41763dd8abdbp-55}, {0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, {0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, {0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, {0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, {0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, {0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, {0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, {0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, {0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, {0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, {0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, {0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, {0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, {0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, {0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, {0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, {0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, {0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, {0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, {0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, {0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, {0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, {0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, {0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, {0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, {0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, {0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, {0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, {0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, {0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, {0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, {0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, {0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, {0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, {0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, {0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, {0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, {0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, {0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, {0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, {0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, {0x1.fafffd2e0e37ep-1, 
-0x1.f01199a7ca331p-57}, {0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, {0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, {0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, {0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, {0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, {0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, {0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, {0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, {0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, {0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, {0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, {0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, {0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, {0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, {0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, {0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, {0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, {0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, {0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, {0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, {0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, {0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, {0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, {0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, {0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, {0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, {0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, {0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, {0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, {0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, {0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, {0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, {0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, {0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, {0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, {0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, {0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, {0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, {0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, {0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, {0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, {0x1.4f0000ff03ee2p+0, 
0x1.77f890b85531cp-54}, {0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, {0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, {0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, {0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, {0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, {0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, {0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, {0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, #endif }, #endif /* !HAVE_FAST_FMA */ }; diff --git a/contrib/arm-optimized-routines/math/logf.c b/contrib/arm-optimized-routines/pl/math/log10f.c similarity index 54% copy from contrib/arm-optimized-routines/math/logf.c copy to contrib/arm-optimized-routines/pl/math/log10f.c index cfbaee12df10..5c80008e4e57 100644 --- a/contrib/arm-optimized-routines/math/logf.c +++ b/contrib/arm-optimized-routines/pl/math/log10f.c @@ -1,79 +1,97 @@ /* - * Single-precision log function. + * Single-precision log10 function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include + #include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" -/* -LOGF_TABLE_BITS = 4 -LOGF_POLY_ORDER = 4 +/* Data associated to logf: -ULP error: 0.818 (nearest rounding.) -Relative error: 1.957 * 2^-26 (before rounding.) -*/ + LOGF_TABLE_BITS = 4 + LOGF_POLY_ORDER = 4 + + ULP error: 0.818 (nearest rounding.) + Relative error: 1.957 * 2^-26 (before rounding.). */ #define T __logf_data.tab #define A __logf_data.poly #define Ln2 __logf_data.ln2 +#define InvLn10 __logf_data.invln10 #define N (1 << LOGF_TABLE_BITS) #define OFF 0x3f330000 +/* This naive implementation of log10f mimics that of log + then simply scales the result by 1/log(10) to switch from base e to + base 10. Hence, most computations are carried out in double precision. + Scaling before rounding to single precision is both faster and more accurate. 
+ + ULP error: 0.797 ulp (nearest rounding.). */ float -logf (float x) +log10f (float x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t z, r, r2, y, y0, invc, logc; uint32_t ix, iz, tmp; int k, i; ix = asuint (x); #if WANT_ROUNDING /* Fix sign of zero with downward rounding when x==1. */ if (unlikely (ix == 0x3f800000)) return 0; #endif if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { /* x < 0x1p-126 or inf or nan. */ if (ix * 2 == 0) return __math_divzerof (1); if (ix == 0x7f800000) /* log(inf) == inf. */ return x; if ((ix & 0x80000000) || ix * 2 >= 0xff000000) return __math_invalidf (x); /* x is subnormal, normalize it. */ ix = asuint (x * 0x1p23f); ix -= 23 << 23; } /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; - k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + k = (int32_t) tmp >> 23; /* arithmetic shift. */ + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ r = z * invc - 1; y0 = logc + (double_t) k * Ln2; /* Pipelined polynomial evaluation to approximate log1p(r). */ r2 = r * r; y = A[1] * r + A[2]; y = A[0] * r2 + y; y = y * r2 + (y0 + r); + + /* Multiply by 1/log(10). 
*/ + y = y * InvLn10; + return eval_as_float (y); } -#if USE_GLIBC_ABI -strong_alias (logf, __logf_finite) -hidden_alias (logf, __ieee754_logf) -#endif + +PL_SIG (S, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (log10f, 0.30) +PL_TEST_INTERVAL (log10f, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000) +PL_TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000) +PL_TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000) +PL_TEST_INTERVAL (log10f, 0, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/log1p_2u.c b/contrib/arm-optimized-routines/pl/math/log1p_2u.c new file mode 100644 index 000000000000..23c8ed4a1914 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1p_2u.c @@ -0,0 +1,136 @@ +/* + * Double-precision log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define OneMHfRt2 0x3fd2bec333018866 +#define Rt2MOne 0x3fda827999fcef32 +#define AbsMask 0x7fffffffffffffff +#define ExpM63 0x3c00 +#define C(i) __log1p_data.coeffs[i] + +static inline double +eval_poly (double f) +{ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); +} + +/* log1p approximation using polynomial on reduced interval. Largest + observed errors are near the lower boundary of the region where k + is 0. + Maximum measured error: 1.75ULP. + log1p(-0x1.2e1aea97b3e5cp-2) got -0x1.65fb8659a2f9p-2 + want -0x1.65fb8659a2f92p-2. 
*/ +double +log1p (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint32_t ia16 = ia >> 48; + + /* Handle special cases first. */ + if (unlikely (ia16 >= 0x7ff0 || ix >= 0xbff0000000000000 + || ix == 0x8000000000000000)) + { + if (ix == 0x8000000000000000 || ix == 0x7ff0000000000000) + { + /* x == -0 => log1p(x) = -0. + x == Inf => log1p(x) = Inf. */ + return x; + } + if (ix == 0xbff0000000000000) + { + /* x == -1 => log1p(x) = -Inf. */ + return __math_divzero (-1); + ; + } + if (ia16 >= 0x7ff0) + { + /* x == +/-NaN => log1p(x) = NaN. */ + return __math_invalid (asdouble (ia)); + } + /* x < -1 => log1p(x) = NaN. + x == -Inf => log1p(x) = NaN. */ + return __math_invalid (x); + } + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + uint64_t sign = ix & ~AbsMask; + if (ia <= OneMHfRt2 || (!sign && ia <= Rt2MOne)) + { + if (unlikely (ia16 <= ExpM63)) + { + /* If exponent of x <= -63 then shortcut the polynomial and avoid + underflow by just returning x, which is exactly rounded in this + region. */ + return x; + } + /* If x is in [sqrt(2)/2 - 1, sqrt(2) - 1] then we can shortcut all the + logic below, as k = 0 and f = x and therefore representable exactly. + All we need is to return the polynomial. */ + return fma (x, eval_poly (x) * x, x); + } + + /* Obtain correctly scaled k by manipulation in the exponent. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + + /* Correction term c/m. */ + double cm = (x - (m - 1)) / m; + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... */ + double p = fma (f, eval_poly (f) * f, f); + + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} + +PL_SIG (S, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (log1p, 1.26) +PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000) +PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000) +PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000) +PL_TEST_INTERVAL (log1p, -1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/log1p_data.c b/contrib/arm-optimized-routines/pl/math/log1p_data.c new file mode 100644 index 000000000000..6168a0c9a214 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1p_data.c @@ -0,0 +1,19 @@ +/* + * Data used in double-precision log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients generated using Remez algorithm, see + log1p.sollya for details. 
*/ +const struct log1p_data __log1p_data = { + .coeffs = {-0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6}}; diff --git a/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c new file mode 100644 index 000000000000..fcfd05a6fcb7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c @@ -0,0 +1,165 @@ +/* + * Single-precision log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2 (0x1.62e43p-1f) +#define SignMask (0x80000000) + +/* Biased exponent of the largest float m for which m^8 underflows. */ +#define M8UFLOW_BOUND_BEXP 112 +/* Biased exponent of the largest float for which we just return x. */ +#define TINY_BOUND_BEXP 103 + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m, uint32_t e) +{ +#ifdef LOG1PF_2U5 + + /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using + slightly modified Estrin scheme (no x^0 term, and x term is just x). 
*/ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + if (unlikely (e < M8UFLOW_BOUND_BEXP)) + return p_06; + + float m8 = m4 * m4; + return fmaf (m8, p_79, p_06); + +#elif defined(LOG1PF_1U3) + + /* 1.3 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Horner + scheme. Our polynomial approximation for log1p has the form + x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... + Hence approximation has the form m + m^2 * P(m) + where P(x) = C1 + C2 * x + C3 * x^2 + ... . */ + return fmaf (m, m * HORNER_8 (m, C), m); + +#else +#error No log1pf approximation exists with the requested precision. Options are 13 or 25. +#endif +} + +static inline uint32_t +biased_exponent (uint32_t ix) +{ + return (ix & 0x7f800000) >> 23; +} + +/* log1pf approximation using polynomial on reduced interval. Worst-case error + when using Estrin is roughly 2.02 ULP: + log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ +float +log1pf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & ~SignMask; + uint32_t ia12 = ia >> 20; + uint32_t e = biased_exponent (ix); + + /* Handle special cases first. */ + if (unlikely (ia12 >= 0x7f8 || ix >= 0xbf800000 || ix == 0x80000000 + || e <= TINY_BOUND_BEXP)) + { + if (ix == 0xff800000) + { + /* x == -Inf => log1pf(x) = NaN. */ + return NAN; + } + if ((ix == 0x7f800000 || e <= TINY_BOUND_BEXP) && ia12 <= 0x7f8) + { + /* |x| < TinyBound => log1p(x) = x. + x == Inf => log1pf(x) = Inf. */ + return x; + } + if (ix == 0xbf800000) + { + /* x == -1.0 => log1pf(x) = -Inf. */ + return __math_divzerof (-1); + } + if (ia12 >= 0x7f8) + { + /* x == +/-NaN => log1pf(x) = NaN. */ + return __math_invalidf (asfloat (ia)); + } + /* x < -1.0 => log1pf(x) = NaN. 
*/ + return __math_invalidf (x); + } + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + + if (ix <= 0x3f000000 || ia <= 0x3e800000) + { + /* If x is in [-0.25, 0.5] then we can shortcut all the logic + below, as k = 0 and m = x. All we need is to return the + polynomial. */ + return eval_poly (x, e); + } + + float m = x + 1.0f; + + /* k is used scale the input. 0x3f400000 is chosen as we are trying to + reduce x to the range [-0.25, 0.5]. Inside this range, k is 0. + Outside this range, if k is reinterpreted as (NOT CONVERTED TO) float: + let k = sign * 2^p where sign = -1 if x < 0 + 1 otherwise + and p is a negative integer whose magnitude increases with the + magnitude of x. */ + int k = (asuint (m) - 0x3f400000) & 0xff800000; + + /* By using integer arithmetic, we obtain the necessary scaling by + subtracting the unbiased exponent of k from the exponent of x. */ + float m_scale = asfloat (asuint (x) - k); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number (s in [2**-126,2**26]), and scale m down accordingly. */ + float s = asfloat (asuint (4.0f) - k); + m_scale = m_scale + fmaf (0.25f, s, -1.0f); + + float p = eval_poly (m_scale, biased_exponent (asuint (m_scale))); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + float scale_back = (float) k * 0x1.0p-23f; + + /* Apply the scaling back. 
*/ + return fmaf (scale_back, Ln2, p); +} + +PL_SIG (S, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (log1pf, 1.52) +PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000) +PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000) +PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000) +PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/log1pf_data.c b/contrib/arm-optimized-routines/pl/math/log1pf_data.c new file mode 100644 index 000000000000..8c92d5738fe8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1pf_data.c @@ -0,0 +1,14 @@ +/* + * Data used in single-precision log1p(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +/* Polynomial coefficients generated using floating-point minimax + algorithm, see tools/log1pf.sollya for details. */ +const struct log1pf_data __log1pf_data + = {.coeffs = {-0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, + -0x1.6f0d5ep-5f}}; diff --git a/contrib/arm-optimized-routines/math/log_data.c b/contrib/arm-optimized-routines/pl/math/log_data.c similarity index 99% copy from contrib/arm-optimized-routines/math/log_data.c copy to contrib/arm-optimized-routines/pl/math/log_data.c index 96a098d42c16..34715e5036a3 100644 --- a/contrib/arm-optimized-routines/math/log_data.c +++ b/contrib/arm-optimized-routines/pl/math/log_data.c @@ -1,511 +1,511 @@ /* * Data for log. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #define N (1 << LOG_TABLE_BITS) const struct log_data __log_data = { .ln2hi = 0x1.62e42fefa3800p-1, .ln2lo = 0x1.ef35793c76730p-45, .poly1 = { #if LOG_POLY1_ORDER == 10 // relative error: 0x1.32eccc6p-62 // in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) -0x1p-1, 0x1.55555555554e5p-2, -0x1.0000000000af2p-2, 0x1.9999999bbe436p-3, -0x1.55555537f9cdep-3, 0x1.24922fc8127cfp-3, -0x1.0000b7d6bb612p-3, 0x1.c806ee1ddbcafp-4, -0x1.972335a9c2d6ep-4, #elif LOG_POLY1_ORDER == 11 // relative error: 0x1.52c8b708p-68 // in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) -0x1p-1, 0x1.5555555555555p-2, -0x1.ffffffffffea9p-3, 0x1.999999999c4d4p-3, -0x1.55555557f5541p-3, 0x1.249248fbe33e4p-3, -0x1.ffffc9a3c825bp-4, 0x1.c71e1f204435dp-4, -0x1.9a7f26377d06ep-4, 0x1.71c30cf8f7364p-4, #elif LOG_POLY1_ORDER == 12 // relative error: 0x1.c04d76cp-63 // in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) -0x1p-1, 0x1.5555555555577p-2, -0x1.ffffffffffdcbp-3, 0x1.999999995dd0cp-3, -0x1.55555556745a7p-3, 0x1.24924a344de3p-3, -0x1.fffffa4423d65p-4, 0x1.c7184282ad6cap-4, -0x1.999eb43b068ffp-4, 0x1.78182f7afd085p-4, -0x1.5521375d145cdp-4, #endif }, .poly = { #if N == 64 && LOG_POLY_ORDER == 7 // relative error: 0x1.906eb8ap-58 // abs error: 0x1.d2cad5a8p-67 // in -0x1.fp-8 0x1.fp-8 -0x1.0000000000027p-1, 0x1.555555555556ap-2, -0x1.fffffff0440bap-3, 0x1.99999991906c3p-3, -0x1.555c8d7e8201ep-3, 0x1.24978c59151fap-3, #elif N == 128 && LOG_POLY_ORDER == 6 // relative error: 0x1.926199e8p-56 // abs error: 0x1.882ff33p-65 // in -0x1.fp-9 0x1.fp-9 -0x1.0000000000001p-1, 0x1.555555551305bp-2, -0x1.fffffffeb459p-3, 0x1.999b324f10111p-3, -0x1.55575e506c89fp-3, #elif N == 128 && LOG_POLY_ORDER == 7 // relative error: 0x1.649fc4bp-64 // abs error: 0x1.c3b5769p-74 // in -0x1.fp-9 0x1.fp-9 -0x1.0000000000001p-1, 0x1.5555555555556p-2, -0x1.fffffffea1a8p-3, 0x1.99999998e9139p-3, 
-0x1.555776801b968p-3, 0x1.2493c29331a5cp-3, #endif }, /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + log(z/c) log(z/c) = poly(z/c - 1) where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls into the ith one, then table entries are computed as tab[i].invc = 1/c tab[i].logc = (double)log(c) tab2[i].chi = (double)c tab2[i].clo = (double)(c - (double)c) where c is near the center of the subinterval and is chosen by trying +-2^29 floating point invc candidates around 1/center and selecting one for which 1) the rounding error in 0x1.8p9 + logc is 0, 2) the rounding error in z - chi - clo is < 0x1p-66 and 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, 2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a single rounding error when there is no fast fma for z*invc - 1, 3) ensures that logc + poly(z/c - 1) has small error, however near x == 1 when |log(x)| < 0x1p-4, this is not enough so that is special cased. 
*/ .tab = { #if N == 64 {0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2}, {0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2}, {0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2}, {0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2}, {0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2}, {0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2}, {0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2}, {0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2}, {0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2}, {0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2}, {0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2}, {0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2}, {0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3}, {0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3}, {0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3}, {0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3}, {0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3}, {0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3}, {0x1.33ae463091760p+0, -0x1.7898db878d000p-3}, {0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3}, {0x1.2e025c9203c89p+0, -0x1.527e620845000p-3}, {0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3}, {0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3}, {0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3}, {0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3}, {0x1.20b4703174157p+0, -0x1.ec738fee40000p-4}, {0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4}, {0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4}, {0x1.194538e960658p+0, -0x1.8197efba9a000p-4}, {0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4}, {0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4}, {0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4}, {0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5}, {0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5}, {0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5}, {0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5}, {0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6}, {0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6}, {0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7}, {0x1.01010152cf066p+0, -0x1.0080a711c0000p-8}, {0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8}, {0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6}, 
{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5}, {0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5}, {0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4}, {0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4}, {0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4}, {0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4}, {0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4}, {0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3}, {0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3}, {0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3}, {0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3}, {0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3}, {0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3}, {0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3}, {0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3}, {0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3}, {0x1.8d3018b58699ap-1, 0x1.040259974e000p-2}, {0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2}, {0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2}, {0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2}, {0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2}, {0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2}, #elif N == 128 {0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, {0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, {0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, {0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, {0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, {0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, {0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, {0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, {0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, {0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, {0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, {0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, {0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, {0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, {0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, {0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, {0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, {0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, {0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, {0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, {0x1.4d843cfde2840p+0, 
-0x1.0edd094ef9800p-2}, {0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, {0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, {0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, {0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, {0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, {0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, {0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, {0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, {0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, {0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, {0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, {0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, {0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, {0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, {0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, {0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, {0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, {0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, {0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, {0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, {0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, {0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, {0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, {0x1.293726014b530p+0, -0x1.31b996b490000p-3}, {0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, {0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, {0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, {0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, {0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, {0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, {0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, {0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, {0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, {0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, {0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, {0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, {0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, {0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, {0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, {0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, {0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, {0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, 
{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, {0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, {0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, {0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, {0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, {0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, {0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, {0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, {0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, {0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, {0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, {0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, {0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, {0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, {0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, {0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, {0x1.008040614b195p+0, -0x1.0040979240000p-9}, {0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, {0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, {0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, {0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, {0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, {0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, {0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, {0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, {0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, {0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, {0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, {0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, {0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, {0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, {0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, {0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, {0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, {0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, {0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, {0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, {0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, {0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, {0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, {0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, {0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, {0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, 
{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, {0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, {0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, {0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, {0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, {0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, {0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, {0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, {0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, {0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, {0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, {0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, {0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, {0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, {0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, {0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, {0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, {0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, {0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, {0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, {0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, {0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, #endif }, #if !HAVE_FAST_FMA .tab2 = { -# if N == 64 +#if N == 64 {0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56}, {0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55}, {0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55}, {0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56}, {0x1.720000b37216ep-1, 0x1.802bc8d437043p-55}, {0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57}, {0x1.7a0000628daep-1, -0x1.e00434b49313dp-56}, {0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56}, {0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57}, {0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56}, {0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57}, {0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55}, {0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55}, {0x1.960000135d8eap-1, -0x1.f832268dc3095p-55}, {0x1.99ffff9435acp-1, 0x1.7031d8b835edcp-56}, {0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58}, {0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55}, {0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55}, {0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57}, {0x1.adffff29ce03dp-1, 
-0x1.fff0717ec71c2p-56}, {0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57}, {0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57}, {0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56}, {0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55}, {0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58}, {0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55}, {0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58}, {0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59}, {0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59}, {0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58}, {0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55}, {0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55}, {0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55}, {0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60}, {0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55}, {0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55}, {0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56}, {0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57}, {0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58}, {0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58}, {0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54}, {0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55}, {0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54}, {0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54}, {0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55}, {0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57}, {0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54}, {0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55}, {0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54}, {0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55}, {0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56}, {0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54}, {0x1.320000324c55bp+0, 0x1.f81983997354fp-54}, {0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54}, {0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54}, {0x1.3e0000751c61bp+0, 0x1.80583ed1c566ep-56}, {0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54}, {0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55}, {0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55}, {0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56}, {0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54}, {0x1.560000e342455p+0, 
0x1.3fb7fac8ac151p-55}, {0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55}, {0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54}, -# elif N == 128 +#elif N == 128 {0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, {0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, {0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, {0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, {0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, {0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, {0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, {0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, {0x1.710000e86978p-1, 0x1.bff6671097952p-56}, {0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, {0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, {0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, {0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, {0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, {0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, {0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, {0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, {0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, {0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, {0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, {0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, {0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, {0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, {0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, {0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, {0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, {0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, {0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, {0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, {0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, {0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, {0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, {0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, {0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, {0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, {0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, {0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, {0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, {0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, 
{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, {0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, {0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, {0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, {0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, {0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, {0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, {0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, {0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, {0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, {0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, {0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, {0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, {0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, {0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, {0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, {0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, {0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, {0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, {0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, {0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, {0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, {0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, {0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, {0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, {0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, {0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, {0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, {0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, {0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, {0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, {0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, {0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, {0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, {0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, {0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, {0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, {0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, {0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, {0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, {0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, {0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, 
{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, {0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, {0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, {0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, {0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, {0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, {0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, {0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, {0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, {0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, {0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, {0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, {0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, {0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, {0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, {0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, {0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, {0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, {0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, {0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, {0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, {0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, {0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, {0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, {0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, {0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, {0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, {0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, {0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, {0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, {0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, {0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, {0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, {0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, {0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, {0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, {0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, {0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, {0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, {0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, {0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, {0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, 
{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, {0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, {0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, {0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, {0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, #endif }, #endif /* !HAVE_FAST_FMA */ }; diff --git a/contrib/arm-optimized-routines/math/logf.c b/contrib/arm-optimized-routines/pl/math/logf.c similarity index 90% copy from contrib/arm-optimized-routines/math/logf.c copy to contrib/arm-optimized-routines/pl/math/logf.c index cfbaee12df10..17a74ed6d28f 100644 --- a/contrib/arm-optimized-routines/math/logf.c +++ b/contrib/arm-optimized-routines/pl/math/logf.c @@ -1,79 +1,75 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" /* LOGF_TABLE_BITS = 4 LOGF_POLY_ORDER = 4 ULP error: 0.818 (nearest rounding.) Relative error: 1.957 * 2^-26 (before rounding.) */ #define T __logf_data.tab #define A __logf_data.poly #define Ln2 __logf_data.ln2 #define N (1 << LOGF_TABLE_BITS) #define OFF 0x3f330000 float -logf (float x) +optr_aor_log_f32 (float x) { /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ double_t z, r, r2, y, y0, invc, logc; uint32_t ix, iz, tmp; int k, i; ix = asuint (x); #if WANT_ROUNDING /* Fix sign of zero with downward rounding when x==1. */ if (unlikely (ix == 0x3f800000)) return 0; #endif if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { /* x < 0x1p-126 or inf or nan. */ if (ix * 2 == 0) return __math_divzerof (1); if (ix == 0x7f800000) /* log(inf) == inf. */ return x; if ((ix & 0x80000000) || ix * 2 >= 0xff000000) return __math_invalidf (x); /* x is subnormal, normalize it. */ ix = asuint (x * 0x1p23f); ix -= 23 << 23; } /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. The range is split into N subintervals. 
The ith subinterval contains z and c is near its center. */ tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ iz = ix - (tmp & 0x1ff << 23); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ r = z * invc - 1; y0 = logc + (double_t) k * Ln2; /* Pipelined polynomial evaluation to approximate log1p(r). */ r2 = r * r; y = A[1] * r + A[2]; y = A[0] * r2 + y; y = y * r2 + (y0 + r); return eval_as_float (y); } -#if USE_GLIBC_ABI -strong_alias (logf, __logf_finite) -hidden_alias (logf, __ieee754_logf) -#endif diff --git a/contrib/arm-optimized-routines/pl/math/logf_data.c b/contrib/arm-optimized-routines/pl/math/logf_data.c new file mode 100644 index 000000000000..97d9eb8d0097 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/logf_data.c @@ -0,0 +1,36 @@ +/* + * Data definition for logf and log10f. + * + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct logf_data __logf_data = { + .tab = + { + {0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2}, + {0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2}, + {0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2}, + {0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3}, + {0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3}, + {0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3}, + {0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4}, + {0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4}, + {0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5}, + {0x1p+0, 0x0p+0}, + {0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5}, + {0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4}, + {0x1.b2036576afce6p-1, 0x1.526e57720db08p-3}, + {0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3}, + {0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2}, + {0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2}, + }, + .ln2 = 0x1.62e42fefa39efp-1, + .invln10 = 0x1.bcb7b1526e50ep-2, + .poly = { + -0x1.00ea348b88334p-2, + 
0x1.5575b0be00b6ap-2, + -0x1.ffffef20a4123p-2, + }}; diff --git a/contrib/arm-optimized-routines/math/math_config.h b/contrib/arm-optimized-routines/pl/math/math_config.h similarity index 65% copy from contrib/arm-optimized-routines/math/math_config.h copy to contrib/arm-optimized-routines/pl/math/math_config.h index e85104337048..dccb3ce4c775 100644 --- a/contrib/arm-optimized-routines/math/math_config.h +++ b/contrib/arm-optimized-routines/pl/math/math_config.h @@ -1,462 +1,572 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H #define _MATH_CONFIG_H #include #include #ifndef WANT_ROUNDING /* If defined to 1, return correct results for special cases in non-nearest rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). This may be set to 0 if there is no fenv support or if math functions only get called in round to nearest mode. */ # define WANT_ROUNDING 1 #endif #ifndef WANT_ERRNO /* If defined to 1, set errno in math functions according to ISO C. Many math libraries do not set errno, so this is 0 by default. It may need to be set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ # define WANT_ERRNO 0 #endif -#ifndef WANT_ERRNO_UFLOW -/* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */ -# define WANT_ERRNO_UFLOW (WANT_ROUNDING && WANT_ERRNO) +#ifndef WANT_SIMD_EXCEPT +/* If defined to 1, trigger fp exceptions in vector routines, consistently with + behaviour expected from the corresponding scalar routine. */ +#define WANT_SIMD_EXCEPT 0 #endif /* Compiler can inline round as a single instruction. */ #ifndef HAVE_FAST_ROUND # if __aarch64__ # define HAVE_FAST_ROUND 1 # else # define HAVE_FAST_ROUND 0 # endif #endif /* Compiler can inline lround, but not (long)round(x). 
*/ #ifndef HAVE_FAST_LROUND # if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ # define HAVE_FAST_LROUND 1 # else # define HAVE_FAST_LROUND 0 # endif #endif /* Compiler can inline fma as a single instruction. */ #ifndef HAVE_FAST_FMA # if defined FP_FAST_FMA || __aarch64__ # define HAVE_FAST_FMA 1 # else # define HAVE_FAST_FMA 0 # endif #endif /* Provide *_finite symbols and some of the glibc hidden symbols so libmathlib can be used with binaries compiled against glibc to interpose math functions with both static and dynamic linking. */ #ifndef USE_GLIBC_ABI # if __GNUC__ # define USE_GLIBC_ABI 1 # else # define USE_GLIBC_ABI 0 # endif #endif /* Optionally used extensions. */ #ifdef __GNUC__ # define HIDDEN __attribute__ ((__visibility__ ("hidden"))) # define NOINLINE __attribute__ ((noinline)) # define UNUSED __attribute__ ((unused)) # define likely(x) __builtin_expect (!!(x), 1) # define unlikely(x) __builtin_expect (x, 0) # if __GNUC__ >= 9 # define attribute_copy(f) __attribute__ ((copy (f))) # else # define attribute_copy(f) # endif # define strong_alias(f, a) \ extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); # define hidden_alias(f, a) \ extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ attribute_copy (f); #else # define HIDDEN # define NOINLINE # define UNUSED # define likely(x) (x) # define unlikely(x) (x) #endif #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ # define TOINT_INTRINSICS 1 /* Round x to nearest int in all rounding modes, ties have to be rounded consistently with converttoint so the results match. If the result would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ static inline double_t roundtoint (double_t x) { return round (x); } /* Convert x to nearest int in all rounding modes, ties have to be rounded consistently with roundtoint. 
If the result is not representible in an int32_t then the semantics is unspecified. */ static inline int32_t converttoint (double_t x) { # if HAVE_FAST_LROUND return lround (x); # else return (long) round (x); # endif } #endif static inline uint32_t asuint (float f) { union { float f; uint32_t i; } u = {f}; return u.i; } static inline float asfloat (uint32_t i) { union { uint32_t i; float f; } u = {i}; return u.f; } static inline uint64_t asuint64 (double f) { union { double f; uint64_t i; } u = {f}; return u.i; } static inline double asdouble (uint64_t i) { union { uint64_t i; double f; } u = {i}; return u.f; } #ifndef IEEE_754_2008_SNAN # define IEEE_754_2008_SNAN 1 #endif static inline int issignalingf_inline (float x) { uint32_t ix = asuint (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7fc00000) == 0x7fc00000; return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; } static inline int issignaling_inline (double x) { uint64_t ix = asuint64 (x); if (!IEEE_754_2008_SNAN) return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } #if __aarch64__ && __GNUC__ /* Prevent the optimization of a floating-point expression. */ static inline float opt_barrier_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } static inline double opt_barrier_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); return x; } /* Force the evaluation of a floating-point expression for its side-effect. 
*/ static inline void force_eval_float (float x) { __asm__ __volatile__ ("" : "+w" (x)); } static inline void force_eval_double (double x) { __asm__ __volatile__ ("" : "+w" (x)); } #else static inline float opt_barrier_float (float x) { volatile float y = x; return y; } static inline double opt_barrier_double (double x) { volatile double y = x; return y; } static inline void force_eval_float (float x) { volatile float y UNUSED = x; } static inline void force_eval_double (double x) { volatile double y UNUSED = x; } #endif /* Evaluate an expression as the specified type, normally a type cast should be enough, but compilers implement non-standard excess-precision handling, so when FLT_EVAL_METHOD != 0 then these functions may need to be customized. */ static inline float eval_as_float (float x) { return x; } static inline double eval_as_double (double x) { return x; } /* Error handling tail calls for special cases, with a sign argument. The sign of the return value is set if the argument is non-zero. */ /* The result overflows. */ HIDDEN float __math_oflowf (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN float __math_uflowf (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN float __math_may_uflowf (uint32_t); /* Division by zero. */ HIDDEN float __math_divzerof (uint32_t); /* The result overflows. */ HIDDEN double __math_oflow (uint32_t); /* The result underflows to 0 in nearest rounding mode. */ HIDDEN double __math_uflow (uint32_t); /* The result underflows to 0 in some directed rounding mode only. */ HIDDEN double __math_may_uflow (uint32_t); /* Division by zero. */ HIDDEN double __math_divzero (uint32_t); /* Error handling using input checking. */ /* Invalid input unless it is a quiet NaN. */ HIDDEN float __math_invalidf (float); /* Invalid input unless it is a quiet NaN. */ HIDDEN double __math_invalid (double); /* Error handling using output checking, only for errno setting. 
*/ /* Check if the result overflowed to infinity. */ HIDDEN double __math_check_oflow (double); /* Check if the result underflowed to 0. */ HIDDEN double __math_check_uflow (double); /* Check if the result overflowed to infinity. */ static inline double check_oflow (double x) { return WANT_ERRNO ? __math_check_oflow (x) : x; } /* Check if the result underflowed to 0. */ static inline double check_uflow (double x) { return WANT_ERRNO ? __math_check_uflow (x) : x; } /* Check if the result overflowed to infinity. */ HIDDEN float __math_check_oflowf (float); /* Check if the result underflowed to 0. */ HIDDEN float __math_check_uflowf (float); /* Check if the result overflowed to infinity. */ static inline float check_oflowf (float x) { return WANT_ERRNO ? __math_check_oflowf (x) : x; } /* Check if the result underflowed to 0. */ static inline float check_uflowf (float x) { return WANT_ERRNO ? __math_check_uflowf (x) : x; } -/* Shared between expf, exp2f and powf. */ -#define EXP2F_TABLE_BITS 5 -#define EXP2F_POLY_ORDER 3 -extern const struct exp2f_data +extern const struct erff_data { - uint64_t tab[1 << EXP2F_TABLE_BITS]; - double shift_scaled; - double poly[EXP2F_POLY_ORDER]; - double shift; - double invln2_scaled; - double poly_scaled[EXP2F_POLY_ORDER]; -} __exp2f_data HIDDEN; + float erff_poly_A[6]; + float erff_poly_B[7]; +} __erff_data HIDDEN; +/* Data for logf and log10f. */ #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 extern const struct logf_data { struct { double invc, logc; } tab[1 << LOGF_TABLE_BITS]; double ln2; + double invln10; double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ } __logf_data HIDDEN; -#define LOG2F_TABLE_BITS 4 -#define LOG2F_POLY_ORDER 4 -extern const struct log2f_data +/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). 
*/ +#define LOG10_TABLE_BITS 7 +#define LOG10_POLY_ORDER 6 +#define LOG10_POLY1_ORDER 12 +extern const struct log10_data { - struct - { - double invc, logc; - } tab[1 << LOG2F_TABLE_BITS]; - double poly[LOG2F_POLY_ORDER]; -} __log2f_data HIDDEN; - -#define POWF_LOG2_TABLE_BITS 4 -#define POWF_LOG2_POLY_ORDER 5 -#if TOINT_INTRINSICS -# define POWF_SCALE_BITS EXP2F_TABLE_BITS -#else -# define POWF_SCALE_BITS 0 + double ln2hi; + double ln2lo; + double invln10; + double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */ + double poly1[LOG10_POLY1_ORDER - 1]; + struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS]; +#if !HAVE_FAST_FMA + struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS]; #endif -#define POWF_SCALE ((double) (1 << POWF_SCALE_BITS)) -extern const struct powf_log2_data -{ - struct - { - double invc, logc; - } tab[1 << POWF_LOG2_TABLE_BITS]; - double poly[POWF_LOG2_POLY_ORDER]; -} __powf_log2_data HIDDEN; - +} __log10_data HIDDEN; #define EXP_TABLE_BITS 7 #define EXP_POLY_ORDER 5 /* Use polynomial that is optimized for a wider input range. This may be needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */ #define EXP_POLY_WIDE 0 /* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be needed for good precision in non-nearest rounding and !EXP_POLY_WIDE. */ #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 extern const struct exp_data { double invln2N; double shift; double negln2hiN; double negln2loN; double poly[4]; /* Last four coefficients. 
*/ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; +#define ERFC_NUM_INTERVALS 20 +#define ERFC_POLY_ORDER 12 +extern const struct erfc_data +{ + double interval_bounds[ERFC_NUM_INTERVALS + 1]; + double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1]; +} __erfc_data HIDDEN; +extern const struct v_erfc_data +{ + double interval_bounds[ERFC_NUM_INTERVALS + 1]; + double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1]; +} __v_erfc_data HIDDEN; + +#define ERFCF_POLY_NCOEFFS 16 +extern const struct erfcf_poly_data +{ + double poly[4][ERFCF_POLY_NCOEFFS]; +} __erfcf_poly_data HIDDEN; + +#define V_EXP_TAIL_TABLE_BITS 8 +extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; + +#define V_ERF_NINTS 49 +#define V_ERF_NCOEFFS 10 +extern const struct v_erf_data +{ + double shifts[V_ERF_NINTS]; + double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS]; +} __v_erf_data HIDDEN; + +#define V_ERFF_NCOEFFS 7 +extern const struct v_erff_data +{ + float coeffs[V_ERFF_NCOEFFS][2]; +} __v_erff_data HIDDEN; + +#define ATAN_POLY_NCOEFFS 20 +extern const struct atan_poly_data +{ + double poly[ATAN_POLY_NCOEFFS]; +} __atan_poly_data HIDDEN; + +#define ATANF_POLY_NCOEFFS 8 +extern const struct atanf_poly_data +{ + float poly[ATANF_POLY_NCOEFFS]; +} __atanf_poly_data HIDDEN; + +#define ASINHF_NCOEFFS 8 +extern const struct asinhf_data +{ + float coeffs[ASINHF_NCOEFFS]; +} __asinhf_data HIDDEN; + #define LOG_TABLE_BITS 7 #define LOG_POLY_ORDER 6 #define LOG_POLY1_ORDER 12 extern const struct log_data { double ln2hi; double ln2lo; double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. 
*/ double poly1[LOG_POLY1_ORDER - 1]; - struct {double invc, logc;} tab[1 << LOG_TABLE_BITS]; + struct + { + double invc, logc; + } tab[1 << LOG_TABLE_BITS]; #if !HAVE_FAST_FMA - struct {double chi, clo;} tab2[1 << LOG_TABLE_BITS]; + struct + { + double chi, clo; + } tab2[1 << LOG_TABLE_BITS]; #endif } __log_data HIDDEN; -#define LOG2_TABLE_BITS 6 -#define LOG2_POLY_ORDER 7 -#define LOG2_POLY1_ORDER 11 -extern const struct log2_data +#define ASINH_NCOEFFS 18 +extern const struct asinh_data { - double invln2hi; - double invln2lo; - double poly[LOG2_POLY_ORDER - 1]; - double poly1[LOG2_POLY1_ORDER - 1]; - struct {double invc, logc;} tab[1 << LOG2_TABLE_BITS]; -#if !HAVE_FAST_FMA - struct {double chi, clo;} tab2[1 << LOG2_TABLE_BITS]; + double poly[ASINH_NCOEFFS]; +} __asinh_data HIDDEN; + +#define LOG1P_NCOEFFS 19 +extern const struct log1p_data +{ + double coeffs[LOG1P_NCOEFFS]; +} __log1p_data HIDDEN; + +#define LOG1PF_2U5 +#define V_LOG1PF_2U5 +#define LOG1PF_NCOEFFS 9 +extern const struct log1pf_data +{ + float coeffs[LOG1PF_NCOEFFS]; +} __log1pf_data HIDDEN; + +#define TANF_P_POLY_NCOEFFS 6 +/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. 
*/ +#define TANF_Q_POLY_NCOEFFS 4 +extern const struct tanf_poly_data +{ + float poly_tan[TANF_P_POLY_NCOEFFS]; + float poly_cotan[TANF_Q_POLY_NCOEFFS]; +} __tanf_poly_data HIDDEN; + +#define V_LOG2F_POLY_NCOEFFS 9 +extern const struct v_log2f_data +{ + float poly[V_LOG2F_POLY_NCOEFFS]; +} __v_log2f_data HIDDEN; + +#define V_LOG2_TABLE_BITS 7 +#define V_LOG2_POLY_ORDER 6 +extern const struct v_log2_data +{ + double poly[V_LOG2_POLY_ORDER - 1]; + struct + { + double invc, log2c; + } tab[1 << V_LOG2_TABLE_BITS]; +} __v_log2_data HIDDEN; + +#define V_SINF_NCOEFFS 4 +extern const struct sv_sinf_data +{ + float coeffs[V_SINF_NCOEFFS]; +} __sv_sinf_data HIDDEN; + +#define V_LOG10_TABLE_BITS 7 +#define V_LOG10_POLY_ORDER 6 +extern const struct v_log10_data +{ + struct + { + double invc, log10c; + } tab[1 << V_LOG10_TABLE_BITS]; + double poly[V_LOG10_POLY_ORDER - 1]; + double invln10, log10_2; +} __v_log10_data HIDDEN; + +#define V_LOG10F_POLY_ORDER 9 +extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; + +#define SV_LOGF_POLY_ORDER 8 +extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN; + +#define SV_LOG_POLY_ORDER 6 +#define SV_LOG_TABLE_BITS 7 +extern const struct sv_log_data +{ + double invc[1 << SV_LOG_TABLE_BITS]; + double logc[1 << SV_LOG_TABLE_BITS]; + double poly[SV_LOG_POLY_ORDER - 1]; +} __sv_log_data HIDDEN; + +#ifndef SV_EXPF_USE_FEXPA +#define SV_EXPF_USE_FEXPA 0 #endif -} __log2_data HIDDEN; +#define SV_EXPF_POLY_ORDER 6 +extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; -#define POW_LOG_TABLE_BITS 7 -#define POW_LOG_POLY_ORDER 8 -extern const struct pow_log_data +#define EXPM1F_POLY_ORDER 5 +extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; + +#define EXPF_TABLE_BITS 5 +#define EXPF_POLY_ORDER 3 +extern const struct expf_data { - double ln2hi; - double ln2lo; - double poly[POW_LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ - /* Note: the pad field is unused, but allows slightly faster indexing. 
*/ - struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS]; -} __pow_log_data HIDDEN; + uint64_t tab[1 << EXPF_TABLE_BITS]; + double invln2_scaled; + double poly_scaled[EXPF_POLY_ORDER]; +} __expf_data HIDDEN; -extern const struct erff_data +#define EXPM1_POLY_ORDER 11 +extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; + +extern const struct cbrtf_data { - float erff_poly_A[6]; - float erff_poly_B[7]; -} __erff_data HIDDEN; + float poly[4]; + float table[5]; +} __cbrtf_data HIDDEN; -#define ERF_POLY_A_ORDER 19 -#define ERF_POLY_A_NCOEFFS 10 -#define ERFC_POLY_C_NCOEFFS 16 -#define ERFC_POLY_D_NCOEFFS 18 -#define ERFC_POLY_E_NCOEFFS 14 -#define ERFC_POLY_F_NCOEFFS 17 -extern const struct erf_data -{ - double erf_poly_A[ERF_POLY_A_NCOEFFS]; - double erf_ratio_N_A[5]; - double erf_ratio_D_A[5]; - double erf_ratio_N_B[7]; - double erf_ratio_D_B[6]; - double erfc_poly_C[ERFC_POLY_C_NCOEFFS]; - double erfc_poly_D[ERFC_POLY_D_NCOEFFS]; - double erfc_poly_E[ERFC_POLY_E_NCOEFFS]; - double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; -} __erf_data HIDDEN; +extern const struct cbrt_data +{ + double poly[4]; + double table[5]; +} __cbrt_data HIDDEN; +extern const struct v_tan_data +{ + double neg_half_pi_hi, neg_half_pi_lo; + double poly[9]; +} __v_tan_data HIDDEN; #endif diff --git a/contrib/arm-optimized-routines/math/math_err.c b/contrib/arm-optimized-routines/pl/math/math_err.c similarity index 93% copy from contrib/arm-optimized-routines/math/math_err.c copy to contrib/arm-optimized-routines/pl/math/math_err.c index 1bf9538a1ab1..d246a89982de 100644 --- a/contrib/arm-optimized-routines/math/math_err.c +++ b/contrib/arm-optimized-routines/pl/math/math_err.c @@ -1,80 +1,78 @@ /* * Double-precision math error handling. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #if WANT_ERRNO #include /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static double with_errno (double y, int e) { errno = e; return y; } #else #define with_errno(x, e) (x) #endif /* NOINLINE reduces code size. */ NOINLINE static double xflow (uint32_t sign, double y) { y = eval_as_double (opt_barrier_double (sign ? -y : y) * y); return with_errno (y, ERANGE); } HIDDEN double __math_uflow (uint32_t sign) { return xflow (sign, 0x1p-767); } -#if WANT_ERRNO_UFLOW /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range. */ HIDDEN double __math_may_uflow (uint32_t sign) { return xflow (sign, 0x1.8p-538); } -#endif HIDDEN double __math_oflow (uint32_t sign) { return xflow (sign, 0x1p769); } HIDDEN double __math_divzero (uint32_t sign) { double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0; return with_errno (y, ERANGE); } HIDDEN double __math_invalid (double x) { double y = (x - x) / (x - x); return isnan (x) ? y : with_errno (y, EDOM); } /* Check result and set errno if necessary. */ HIDDEN double __math_check_uflow (double y) { return y == 0.0 ? with_errno (y, ERANGE) : y; } HIDDEN double __math_check_oflow (double y) { return isinf (y) ? with_errno (y, ERANGE) : y; } diff --git a/contrib/arm-optimized-routines/math/math_errf.c b/contrib/arm-optimized-routines/pl/math/math_errf.c similarity index 93% copy from contrib/arm-optimized-routines/math/math_errf.c copy to contrib/arm-optimized-routines/pl/math/math_errf.c index d5350b819ab1..96271ff18bc1 100644 --- a/contrib/arm-optimized-routines/math/math_errf.c +++ b/contrib/arm-optimized-routines/pl/math/math_errf.c @@ -1,80 +1,78 @@ /* * Single-precision math error handling. * - * Copyright (c) 2017-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #if WANT_ERRNO #include /* NOINLINE reduces code size and avoids making math functions non-leaf when the error handling is inlined. */ NOINLINE static float with_errnof (float y, int e) { errno = e; return y; } #else #define with_errnof(x, e) (x) #endif /* NOINLINE reduces code size. */ NOINLINE static float xflowf (uint32_t sign, float y) { y = eval_as_float (opt_barrier_float (sign ? -y : y) * y); return with_errnof (y, ERANGE); } HIDDEN float __math_uflowf (uint32_t sign) { return xflowf (sign, 0x1p-95f); } -#if WANT_ERRNO_UFLOW /* Underflows to zero in some non-nearest rounding mode, setting errno is valid even if the result is non-zero, but in the subnormal range. */ HIDDEN float __math_may_uflowf (uint32_t sign) { return xflowf (sign, 0x1.4p-75f); } -#endif HIDDEN float __math_oflowf (uint32_t sign) { return xflowf (sign, 0x1p97f); } HIDDEN float __math_divzerof (uint32_t sign) { float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f; return with_errnof (y, ERANGE); } HIDDEN float __math_invalidf (float x) { float y = (x - x) / (x - x); return isnan (x) ? y : with_errnof (y, EDOM); } /* Check result and set errno if necessary. */ HIDDEN float __math_check_uflowf (float y) { return y == 0.0f ? with_errnof (y, ERANGE) : y; } HIDDEN float __math_check_oflowf (float y) { return isinf (y) ? with_errnof (y, ERANGE) : y; } diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner.h new file mode 100644 index 000000000000..6ad98dccd6aa --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pairwise_horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h new file mode 100644 index 000000000000..e56f059514ad --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h @@ -0,0 +1,48 @@ +/* + * Helper macros for pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i)) +#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) + +#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0) +#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0) +#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0) +#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0) +#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0) +#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0) +#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0) +#define PAIRWISE_HORNER_15(x, x2, 
c) PW_HORNER_15_(x, x2, c, 0) +#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0) + +#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) + +#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0) +#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0) +#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0) +#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0) +#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0) +#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0) +#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0) +#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0) +#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0) +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h b/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h new file mode 100644 index 000000000000..784750cde0b6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pl_sig.h b/contrib/arm-optimized-routines/pl/math/pl_sig.h new file mode 100644 index 000000000000..686d24f0d9a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pl_sig.h @@ -0,0 +1,43 @@ +/* + * PL macros for emitting various ulp/bench entries based on function signature + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ +#define PL_DECL_SF1(fun) float fun##f (float); +#define PL_DECL_SF2(fun) float fun##f (float, float); +#define PL_DECL_SD1(fun) double fun (double); +#define PL_DECL_SD2(fun) double fun (double, double); + +#if V_SUPPORTED +#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t); +#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t); +#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t); +#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t); +#else +#define PL_DECL_VF1(fun) +#define PL_DECL_VF2(fun) +#define PL_DECL_VD1(fun) +#define PL_DECL_VD2(fun) +#endif + +#if SV_SUPPORTED +#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t); +#define PL_DECL_SVF2(fun) \ + sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t); +#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t); +#define PL_DECL_SVD2(fun) \ + sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t); +#else +#define PL_DECL_SVF1(fun) +#define PL_DECL_SVF2(fun) +#define PL_DECL_SVD1(fun) +#define PL_DECL_SVD2(fun) +#endif + +/* For building the routines, emit function prototype from PL_SIG. This + ensures that the correct signature has been chosen (wrong one will be a + compile error). 
PL_SIG is defined differently by various components of the + build system to emit entries in the wrappers and entries for mathbench and + ulp. */ +#define PL_SIG(v, t, a, f, ...) PL_DECL_##v##t##a (f) diff --git a/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c new file mode 100644 index 000000000000..f62cbd6b53f0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_acosh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c new file mode 100644 index 000000000000..374066622a0f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_acoshf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c new file mode 100644 index 000000000000..ab8fbd9c3d69 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c new file mode 100644 index 000000000000..13e1a5fd314a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinhf_2u7.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c new file mode 100644 index 000000000000..4603e5f72615 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c new file mode 100644 index 000000000000..894d843273ea --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2f_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c new file mode 100644 index 000000000000..4b61bc4d1460 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c new file mode 100644 index 000000000000..6b6571927195 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanf_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c new file mode 100644 index 000000000000..f6a5f75b1779 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c new file mode 100644 index 000000000000..e7e5c6197406 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanhf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c new file mode 100644 index 000000000000..435e74a546c6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrt_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c new file mode 100644 index 000000000000..5c793704b62a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrtf_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c new file mode 100644 index 000000000000..cdf352cf5793 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cosh_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c new file mode 100644 index 000000000000..8f7d5da6e6ef --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_coshf_2u4.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erf_2u.c b/contrib/arm-optimized-routines/pl/math/s_erf_2u.c new file mode 100644 index 000000000000..839535c3897f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erf_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erf_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c new file mode 100644 index 000000000000..bf9e3e62bd31 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfc_4u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c new file mode 100644 index 000000000000..024d22498ff5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfcf_1u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c new file mode 100644 index 000000000000..a5b9bf9afa72 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erff_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_exp_tail.c b/contrib/arm-optimized-routines/pl/math/s_exp_tail.c new file mode 100644 index 000000000000..20b1b41a9689 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_exp_tail.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_exp_tail.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expf.c b/contrib/arm-optimized-routines/pl/math/s_expf.c new file mode 100644 index 000000000000..557a2e3d36af --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_expf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c new file mode 100644 index 000000000000..da2d6e7ebf82 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expm1_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c new file mode 100644 index 000000000000..eea8089da989 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expm1f_1u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c new file mode 100644 index 000000000000..2480e5aa2cf1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log10_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c new file mode 100644 index 000000000000..173e0fdc3400 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log10f_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c new file mode 100644 index 000000000000..20b395a5a2d0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log1p_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c new file mode 100644 index 000000000000..013ec4c1d903 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log1pf_2u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2_3u.c b/contrib/arm-optimized-routines/pl/math/s_log2_3u.c new file mode 100644 index 000000000000..d46f3f998190 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log2_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c new file mode 100644 index 000000000000..e76c67dceb62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2f_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c new file mode 100644 index 000000000000..27e5e65db178 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_sinh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c new file mode 100644 index 000000000000..607f94298a79 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_sinhf_2u3.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c new file mode 100644 index 000000000000..adb807c5beb8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tan_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c new file mode 100644 index 000000000000..fa64c8aef697 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanf_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c new file mode 100644 index 000000000000..a4d7bce649f1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c new file mode 100644 index 000000000000..896fc62ebe9b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanhf_2u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/sinh_3u.c b/contrib/arm-optimized-routines/pl/math/sinh_3u.c new file mode 100644 index 000000000000..f534815c6674 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sinh_3u.c @@ -0,0 +1,66 @@ +/* + * Double-precision sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define OFlowBound \ + 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ + in NaN. */ + +double +__exp_dd (double, double); + +/* Approximation for double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + __v_sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. 
*/ +double +sinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + double ax = asdouble (iax); + uint64_t sign = ix & ~AbsMask; + double halfsign = asdouble (Half | sign); + + if (unlikely (iax >= OFlowBound)) + { + /* Special values and overflow. */ + if (unlikely (iax > 0x7ff0000000000000)) + return __math_invalidf (x); + /* expm1 overflows a little before sinh. We have to fill this + gap by using a different algorithm, in this case we use a + double-precision exp helper. For large x sinh(x) is dominated + by exp(x), however we cannot compute exp without overflow + either. We use the identity: exp(a) = (exp(a / 2)) ^ 2 + to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 + ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. */ + double e = __exp_dd (ax / 2, 0); + return (e * halfsign) * e; + } + + /* Use expm1f to retain acceptable precision for small numbers. + Let t = e^(|x|) - 1. */ + double t = expm1 (ax); + /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 + (t + t / (t + 1)) / -2 for x < 0. */ + return (t + t / (t + 1)) * halfsign; +} + +PL_SIG (S, D, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (sinh, 2.08) +PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100) +PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100) +PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) +PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000) +PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) +PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c new file mode 100644 index 000000000000..de944288a02b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c @@ -0,0 +1,76 @@ +/* + * Single-precision sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define Expm1OFlowLimit \ + 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ + overflows. */ +#define OFlowLimit \ + 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \ + overflow. */ + +float +optr_aor_exp_f32 (float); + +/* Approximation for single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. */ +float +sinhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + float ax = asfloat (iax); + uint32_t sign = ix & ~AbsMask; + float halfsign = asfloat (Half | sign); + + if (unlikely (iax >= Expm1OFlowLimit)) + { + /* Special values and overflow. */ + if (iax >= 0x7fc00001 || iax == 0x7f800000) + return x; + if (iax >= 0x7f800000) + return __math_invalidf (x); + if (iax >= OFlowLimit) + return __math_oflowf (sign); + + /* expm1f overflows a little before sinhf, (~88.7 vs ~89.4). We have to + fill this gap by using a different algorithm, in this case we use a + double-precision exp helper. For large x sinh(x) dominated by exp(x), + however we cannot compute exp without overflow either. We use the + identity: + exp(a) = (exp(a / 2)) ^ 2. + to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 + ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. + Greatest error in this region is 1.89 ULP: + sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */ + float e = optr_aor_exp_f32 (ax / 2); + return (e * halfsign) * e; + } + + /* Use expm1f to retain acceptable precision for small numbers. + Let t = e^(|x|) - 1. */ + float t = expm1f (ax); + /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 + (t + t / (t + 1)) / -2 for x < 0. 
*/ + return (t + t / (t + 1)) * halfsign; +} + +PL_SIG (S, F, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (sinhf, 1.76) +PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) +PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000) +PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) +PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100) +PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) +PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c new file mode 100644 index 000000000000..a4bea1dcba09 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c @@ -0,0 +1,93 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define SignMask sv_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +__attribute__ ((noinline)) static sv_f64_t +specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) +{ + return sv_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u64_t i, const svbool_t pg) +{ + return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of SVE atan2. Errors are greatest when y and + x are reasonably close together. The greatest observed error is 2.28 ULP: + sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. 
*/ +sv_f64_t +__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t iy = sv_as_u64_f64 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); + sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); + sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); + + sv_f64_t ax = svabs_f64_x (pg, x); + sv_f64_t ay = svabs_f64_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); + svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); + sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); + sv_f64_t z = svdiv_f64_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); + shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); + shift = svmul_f64_x (pg, shift, PiOver2); + + sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. */ + ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2) + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (SV, D, 2, atan2) +PL_TEST_ULP (__sv_atan2, 1.78) +PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c new file mode 100644 index 000000000000..f7674c441f2f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c @@ -0,0 +1,94 @@ +/* + * Single-precision vector atan2f(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atanf_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f32 (0x1.921fb6p+0f) +#define SignMask sv_u32 (0x80000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +static inline sv_f32_t +specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp) +{ + return sv_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u32_t i, const svbool_t pg) +{ + return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1), + sv_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) + with reduction to [0,1] using z=1/x and shift = pi/2. + Maximum observed error is 2.95 ULP: + __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. 
*/ +sv_f32_t +__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t iy = sv_as_u32_f32 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask); + sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask); + sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y); + + sv_f32_t ax = svabs_f32_x (pg, x); + sv_f32_t ay = svabs_f32_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0)); + svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay); + sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax); + sv_f32_t z = svdiv_f32_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); + shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift); + shift = svmul_f32_x (pg, shift, PiOver2); + + sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. */ + ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f) + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (SV, F, 2, atan2) +PL_TEST_ULP (__sv_atan2f, 2.45) +PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c new file mode 100644 index 000000000000..02ac331970c9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c @@ -0,0 +1,62 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define AbsMask (0x7fffffffffffffff) + +/* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed + error is 2.27 ulps: + __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +sv_f64_t +__sv_atan_x (sv_f64_t x, const svbool_t pg) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt_n_f64 (pg, x, 1.0); + /* Avoid dependency in abs(x) in division (and comparison). */ + sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x); + /* Use absolute value only when needed (odd powers of z). 
*/ + sv_f64_t az = svabs_f64_x (pg, z); + az = svneg_f64_m (az, red, az); + + sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + return y; +} + +PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan) + +PL_SIG (SV, D, 1, atan, -3.1, 3.1) +PL_TEST_ULP (__sv_atan, 1.78) +PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_common.h b/contrib/arm-optimized-routines/pl/math/sv_atan_common.h new file mode 100644 index 000000000000..bfe6998d2416 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan_common.h @@ -0,0 +1,61 @@ +/* + * Double-precision polynomial evaluation function for SVE atan(x) and + * atan2(y,x). + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "sv_math.h" + +#define P(i) sv_f64 (__atan_poly_data.poly[i]) + +/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline sv_f64_t +__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az, + sv_f64_t shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ + sv_f64_t z2 = svmul_f64_x (pg, z, z); + + /* Level 1. 
*/ + sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0)); + sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2)); + sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4)); + sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6)); + sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8)); + sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10)); + sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12)); + sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14)); + sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16)); + sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18)); + + /* Level 2. */ + sv_f64_t x2 = svmul_f64_x (pg, z2, z2); + sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0); + sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4); + sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8); + sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12); + sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16); + + /* Level 3. */ + sv_f64_t x4 = svmul_f64_x (pg, x2, x2); + sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0); + sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8); + + /* Level 4. */ + sv_f64_t x8 = svmul_f64_x (pg, x4, x4); + sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8); + y = sv_fma_f64_x (pg, y, x8, P_7_0); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + sv_f64_t z3 = svmul_f64_x (pg, z2, az); + y = sv_fma_f64_x (pg, y, z3, az); + + /* Apply shift as indicated by `red` predicate. */ + y = svadd_f64_m (red, y, shift); + + return y; +} diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c new file mode 100644 index 000000000000..8d38e42b2290 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c @@ -0,0 +1,59 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atanf_common.h" + +#define PiOver2 sv_f32 (0x1.921fb6p+0f) +#define AbsMask (0x7fffffff) + +/* Fast implementation of SVE atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=-1/x and shift = pi/2. + Largest observed error is 2.9 ULP, close to +/-1.0: + __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1 + want -0x1.967fp-1. */ +sv_f32_t +__sv_atanf_x (sv_f32_t x, const svbool_t pg) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt_n_f32 (pg, x, 1.0f); + /* Avoid dependency in abs(x) in division (and comparison). */ + sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x); + /* Use absolute value only when needed (odd powers of z). */ + sv_f32_t az = svabs_f32_x (pg, z); + az = svneg_f32_m (az, red, az); + + sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ + return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); +} + +PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf) + +PL_SIG (SV, F, 1, atan, -3.1, 3.1) +PL_TEST_ULP (__sv_atanf, 2.9) +PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h b/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h new file mode 100644 index 000000000000..dc45effec1cd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h @@ -0,0 +1,47 @@ +/* + * Single-precision polynomial evaluation function for SVE atan(x) and + * atan2(y,x). + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_ATANF_COMMON_H +#define PL_MATH_SV_ATANF_COMMON_H + +#include "math_config.h" +#include "sv_math.h" + +#define P(i) sv_f32 (__atanf_poly_data.poly[i]) + +/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations + The order 7 polynomial P approximates (f(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline sv_f32_t +__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az, + sv_f32_t shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=7. */ + + /* First compute square powers of z. */ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t z4 = svmul_f32_x (pg, z2, z2); + sv_f32_t z8 = svmul_f32_x (pg, z4, z4); + + /* Then assemble polynomial. */ + sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))), + (sv_fma_f32_x (pg, z2, P (5), P (4)))); + sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))), + (sv_fma_f32_x (pg, z2, P (1), P (0)))); + sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3); + + /* Finalize. y = shift + z + z^3 * P(z^2). 
*/ + sv_f32_t z3 = svmul_f32_x (pg, z2, az); + y = sv_fma_f32_x (pg, y, z3, az); + + /* Apply shift as indicated by 'red' predicate. */ + y = svadd_f32_m (red, y, shift); + + return y; +} + +#endif // PL_MATH_SV_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c new file mode 100644 index 000000000000..194034802452 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c @@ -0,0 +1,84 @@ +/* + * Double-precision SVE cos(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +/* Original shift used in Neon cos, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ +#define Shift (sv_f64 (0x1.8000000000001p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (cos, x, y, cmp); +} + +/* A fast SVE implementation of cos based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.108 ULPs. + __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 + want -0x1.fddd4c65c7f05p-3. */ +sv_f64_t +__sv_cos_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); + n = svsub_f64_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). 
*/ + r = sv_fma_f64_x (pg, NegPio2_1, n, r); + r = sv_fma_f64_x (pg, NegPio2_2, n, r); + r = sv_fma_f64_x (pg, NegPio2_3, n, r); + + /* cos(r) poly approx. */ + r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); + y = sv_f64 (0.0); + y = svtmad_f64 (y, r2, 7); + y = svtmad_f64 (y, r2, 6); + y = svtmad_f64 (y, r2, 5); + y = svtmad_f64 (y, r2, 4); + y = svtmad_f64 (y, r2, 3); + y = svtmad_f64 (y, r2, 2); + y = svtmad_f64 (y, r2, 1); + y = svtmad_f64 (y, r2, 0); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); + /* Apply factor. */ + y = svmul_f64_x (pg, f, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_cos_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos) + +PL_SIG (SV, D, 1, cos, -3.1, 3.1) +PL_TEST_ULP (__sv_cos, 1.61) +PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c new file mode 100644 index 000000000000..8f138bcba7af --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c @@ -0,0 +1,82 @@ +/* + * Single-precision SVE cos(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) +#define RangeVal (sv_f32 (0x1p20f)) +#define InvPio2 (sv_f32 (0x1.45f306p-1f)) +/* Original shift used in Neon cosf, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. 
*/ +#define Shift (sv_f32 (0x1.800002p+23f)) +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (cosf, x, y, cmp); +} + +/* A fast SVE implementation of cosf based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.06 ULPs. + __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6 + want 0x1.fffe76p-6. */ +sv_f32_t +__sv_cosf_x (sv_f32_t x, const svbool_t pg) +{ + sv_f32_t n, r, r2, y; + svbool_t cmp; + + r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); + cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift); + n = svsub_f32_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = sv_fma_f32_x (pg, NegPio2_1, n, r); + r = sv_fma_f32_x (pg, NegPio2_2, n, r); + r = sv_fma_f32_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q)); + + /* cos(r) poly approx. */ + r2 = svtsmul_f32 (r, sv_as_u32_f32 (q)); + y = sv_f32 (0.0f); + y = svtmad_f32 (y, r2, 4); + y = svtmad_f32 (y, r2, 3); + y = svtmad_f32 (y, r2, 2); + y = svtmad_f32 (y, r2, 1); + y = svtmad_f32 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f32_x (pg, f, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. 
*/ + if (unlikely (svptest_any (pg, cmp))) + return __sv_cosf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf) + +PL_SIG (SV, F, 1, cos, -3.1, 3.1) +PL_TEST_ULP (__sv_cosf, 1.57) +PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c b/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c new file mode 100644 index 000000000000..bec7f8a819d2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c @@ -0,0 +1,103 @@ +/* + * Double-precision SVE erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define Scale (8.0) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (erf, x, y, cmp); +} + +/* Optimized double precision SVE error function erf. + Maximum observed error is 2.62 ULP: + __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0 + want 0x1.fffffffffffffp-1. */ +sv_f64_t +__sv_erf_x (sv_f64_t x, const svbool_t pg) +{ + /* Use top 16 bits to test for special cases and small values. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); + + /* Handle both inf/nan as well as small values (|x|<2^-28). */ + svbool_t cmp + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); + + /* Get sign and absolute value. */ + sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* i = trunc(Scale*x). */ + sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); + /* Saturate index of intervals. 
*/ + svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); + sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); + + /* Load polynomial coefficients. */ + sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); + sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); + sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); + sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); + sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); + sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); + sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); + sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); + sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); + sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); + + /* Get shift and scale. */ + sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); + + /* Transform polynomial variable. + Set z = 0 in the boring domain to avoid overflow. */ + sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); + + /* Evaluate polynomial P(z) using level-2 Estrin. */ + sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); + sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); + sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); + sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); + sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); + + sv_f64_t z2 = svmul_f64_x (pg, z, z); + sv_f64_t z4 = svmul_f64_x (pg, z2, z2); + + sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); + sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); + + sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); + y = sv_fma_f64_x (pg, z4, y, q1); + + /* y = erf(x) if x > 0, -erf(-x) otherwise. 
*/ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_erf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) + +PL_SIG (SV, D, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erf, 2.13) +PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c new file mode 100644 index 000000000000..076b47129862 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c @@ -0,0 +1,146 @@ +/* + * Double-precision SVE erfc(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED +#include "sv_exp_tail.h" + +sv_f64_t __sv_exp_x (sv_f64_t, svbool_t); + +static NOINLINE sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +{ + return sv_call_f64 (erfc, x, y, special); +} + +static inline sv_u64_t +lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x) +{ + /* Interval index is calculated by (((abs(x) + 1)^4) >> 52) - 1023, bounded by + the number of polynomials. 
*/ + sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1); + xp1 = svmul_f64_x (pg, xp1, xp1); + xp1 = svmul_f64_x (pg, xp1, xp1); + sv_u64_t interval_idx + = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023); + return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS), + interval_idx, sv_u64 (ERFC_NUM_INTERVALS)); +} + +static inline sv_f64_t +sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx) +{ + sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1); + const double *base = &__v_erfc_data.poly[0][12]; + sv_f64_t r = sv_lookup_f64_x (pg, base, offset); + for (int i = 0; i < ERFC_POLY_ORDER; i++) + { + base--; + sv_f64_t c = sv_lookup_f64_x (pg, base, offset); + r = sv_fma_f64_x (pg, z, r, c); + } + return r; +} + +static inline sv_f64_t +sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x) +{ + /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding + errors in x^2, so we compute an estimate for the error and use a custom exp + helper which corrects for the calculated error estimate. */ + sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x); + + /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and + a_lo is the 'small' component. */ + const sv_f64_t scale = sv_f64 (0x1.0000002p27); + sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x, + svneg_f64_x (pg, abs_x))); + a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi); + sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi); + + sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi); + sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo); + + /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) - + (a_hi + a_lo) * (a_hi + a_lo). 
*/ + sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2); + e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2); + e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2); + e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2); + + return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2); +} + +/* Optimized double precision vector complementary error function erfc. + Maximum measured error is 3.64 ULP: + __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42 + want 0x1.ff3f4c8e200d9p-42. */ +sv_f64_t +__sv_erfc_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_f64_t abs_x = svabs_f64_x (pg, x); + sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52); + + /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes + to 2. As long as the polynomial is 0 in the boring zone, we can assemble + the result correctly. This is dealt with in two ways: + + The 'coarse approach' is that the approximation algorithm is + zero-predicated on in_bounds = |x| < 32, which saves the need to do + coefficient lookup etc for |x| >= 32. + + The coarse approach misses [-32, -6] and [28, 32], which are dealt with in + the polynomial and index calculation, such that the polynomial evaluates to + 0 in these regions. */ + /* in_bounds is true for lanes where |x| < 32. */ + svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404); + /* boring_zone = 2 for x < 0, 0 otherwise. */ + sv_f64_t boring_zone + = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62)); + /* Very small, nan and inf. */ + svbool_t special_cases + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432); + + /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2) + + Where P_i is a polynomial and x_i is an offset, both defined in + v_erfc_data.c. i is chosen based on which interval x falls in. 
*/ + sv_u64_t i = lookup_interval_idx (in_bounds, abs_x); + sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i); + sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i); + /* 'copy' sign of x to p, i.e. negate p if x is negative. */ + sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff); + p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign)); + + sv_f64_t e = sv_eval_gauss (in_bounds, abs_x); + + /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally + select boring_zone because P[V_ERFC_NINTS-1]=0. */ + sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone); + + if (unlikely (svptest_any (pg, special_cases))) + { + return specialcase (x, y, special_cases); + } + return y; +} + +PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc) + +PL_SIG (SV, D, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (__sv_erfc, 3.15) +PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c b/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c new file mode 100644 index 000000000000..c7a738c55f7b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c @@ -0,0 +1,104 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (erff, x, y, cmp); +} + +sv_f32_t __sv_expf_x (svbool_t, sv_f32_t); + +/* Optimized single precision vector erf. 
Worst-case error is 1.25 ULP: + __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1 + want 0x1.9f9c8ap-1. */ +sv_f32_t +__sv_erff_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff); + /* Handle both inf/nan as well as small values (|x|<2^-28). */ + svbool_t cmp + = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180); + + sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + /* |x| < 0.921875. */ + svbool_t red = svaclt_n_f32 (pg, x, 0.921875f); + /* |x| > 4.0. */ + svbool_t bor = svacgt_n_f32 (pg, x, 4.0f); + + /* Load polynomial coefficients. */ + sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1)); + sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2); + + const float *base = (float *) __v_erff_data.coeffs; + sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2); + sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6); + sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10); + + /* Do not need to store elem 0 of __v_erff_data as it is not used. */ + sv_f32_t p1 = svtbl (c_2_5, idx_lo); + sv_f32_t p2 = svtbl (c_2_5, idx_hi); + sv_f32_t p3 = svtbl (c_6_9, idx_lo); + sv_f32_t p4 = svtbl (c_6_9, idx_hi); + sv_f32_t p5 = svtbl (c_10_13, idx_lo); + sv_f32_t p6 = svtbl (c_10_13, idx_hi); + + sv_f32_t a = svabs_f32_x (pg, x); + /* Square with merging mul - z is x^2 for reduced, |x| otherwise. */ + sv_f32_t z = svmul_f32_m (red, a, a); + + /* Evaluate polynomial on |x| or x^2. */ + sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5); + r = sv_fma_f32_x (pg, z, r, p4); + r = sv_fma_f32_x (pg, z, r, p3); + r = sv_fma_f32_x (pg, z, r, p2); + r = sv_fma_f32_x (pg, z, r, p1); + /* Use merging svmad for last operation - apply first coefficient if not + reduced, otherwise r is propagated unchanged. This is because the reduced + polynomial has lower order than the non-reduced. 
*/ + r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]); + r = sv_fma_f32_x (pg, a, r, a); + + /* y = |x| + |x| * P(x^2) if |x| < 0.921875 + y = 1 - exp (-(|x| + |x| * P(|x|))) otherwise. */ + sv_f32_t y = __sv_expf_x (pg, svneg_f32_x (pg, r)); + y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0)); + + /* Boring domain (absolute value is required to get the sign of erf(-nan) + right). */ + y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y)); + + /* y = erf(x) if x>0, -erf(-x) otherwise. */ + y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_erff_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff) + +PL_SIG (SV, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erff, 0.76) +PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h b/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h new file mode 100644 index 000000000000..9b739da9d82a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h @@ -0,0 +1,79 @@ +/* + * Double-precision SVE e^(x+tail) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_EXP_TAIL_H +#define SV_EXP_TAIL_H + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "v_exp_tail.h" + +#define C1 sv_f64 (C1_scal) +#define C2 sv_f64 (C2_scal) +#define C3 sv_f64 (C3_scal) +#define MinusLn2hi (-Ln2hi_scal) +#define MinusLn2lo (-Ln2lo_scal) + +#define N (1 << V_EXP_TAIL_TABLE_BITS) +#define Tab __v_exp_tail_data +#define IndexMask (N - 1) +#define Shift sv_f64 (0x1.8p+52) +#define Thres 704.0 + +static inline sv_f64_t +sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n) +{ + sv_f64_t absn = svabs_f64_x (pg, n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000), + sv_u64 (0)); + sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000)); + sv_f64_t s2 = sv_as_f64_u64 ( + svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000), + b)); + + svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N); + sv_f64_t r1 = svmul_f64_x (pg, s1, s1); + sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1); + return svsel_f64 (cmp, r1, r0); +} + +static inline sv_f64_t +sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail) +{ + /* Calculate exp(x + xtail). */ + sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift); + sv_f64_t n = svsub_f64_x (pg, z, Shift); + + sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x); + r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r); + + sv_u64_t u = sv_as_u64_f64 (z); + sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + sv_u64_t i = svand_n_u64_x (pg, u, IndexMask); + + sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2); + y = sv_fma_f64_x (pg, y, r, C1); + y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0)); + y = sv_fma_f64_x (pg, y, r, xtail); + + /* s = 2^(n/N). 
*/ + u = sv_lookup_u64_x (pg, Tab, i); + sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e)); + + svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres); + if (unlikely (svptest_any (pg, cmp))) + { + return sv_exp_tail_special_case (pg, s, y, n); + } + return sv_fma_f64_x (pg, y, s, s); +} + +#endif +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c new file mode 100644 index 000000000000..87fbe45df5fd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c @@ -0,0 +1,156 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define C(i) __sv_expf_poly[i] + +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) + +#if SV_EXPF_USE_FEXPA + +#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127. */ +#define Thres \ + (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \ + and not handled correctly by FEXPA. */ + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + /* The special-case handler from the Neon routine does not handle subnormals + in a way that is compatible with FEXPA. For the FEXPA variant we just fall + back to scalar expf. */ + return sv_call_f32 (expf, x, y, special); +} + +#else + +#define Shift (0x1.8p23f) /* 1.5 * 2^23. */ +#define Thres (126.0f) + +/* Special-case handler adapted from Neon variant. Uses s, y and n to produce + the final result (normal cases included). It performs an update of all lanes! + Therefore: + - all previous computation need to be done on all lanes indicated by input + pg + - we cannot simply apply the special case to the special-case-activated + lanes. Besides it is likely that this would not increase performance (no + scatter/gather). 
*/ +static inline sv_f32_t +specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, + svbool_t p_cmp1, sv_f32_t scale) +{ + /* s=2^(n/N) may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x820...0, 0 otherwise. */ + svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */ + sv_u32_t b + = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n. */ + sv_f32_t s1 + = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */ + /* Offset s to avoid overflow in final result if n is below threshold. */ + sv_f32_t s2 = sv_as_f32_u32 ( + svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ + + /* |n| > 192 => 2^(n/N) overflows. */ + svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f); + + sv_f32_t r2 = svmul_f32_x (pg, s1, s1); + sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); + r1 = svmul_f32_x (pg, r1, s1); + sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale); + + /* Apply condition 1 then 2. + Returns r2 if cond2 is true, otherwise + if cond1 is true then return r1, otherwise return r0. */ + sv_f32_t r = svsel_f32 (p_cmp1, r1, r0); + + return svsel_f32 (p_cmp2, r2, r); +} + +#endif + +/* Optimised single-precision SVE exp function. By default this is an SVE port + of the Neon algorithm from math/. Alternatively, enable a modification of + that algorithm that looks up scale using SVE FEXPA instruction with + SV_EXPF_USE_FEXPA. + + Worst-case error of the default algorithm is 1.95 ulp: + __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 + want 0x1.6a023p-8. + + Worst-case error when using FEXPA is 1.04 ulp: + __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4 + want 0x1.ba74bap+4. */ +sv_f32_t +__sv_expf_x (sv_f32_t x, const svbool_t pg) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ + + /* n = round(x/(ln2/N)). */ + sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift)); + sv_f32_t n = svsub_n_f32_x (pg, z, Shift); + + /* r = x - n*ln2/N. */ + sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); + r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); + +/* scale = 2^(n/N). */ +#if SV_EXPF_USE_FEXPA + /* NaNs also need special handling with FEXPA. */ + svbool_t is_special_case + = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x)); + sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z)); +#else + sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); + svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres); + sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); +#endif + + /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1))); + sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3))); + q = sv_fma_f32_x (pg, p, r2, q); + p = svmul_n_f32_x (pg, r, C (4)); + sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p); + + if (unlikely (svptest_any (pg, is_special_case))) +#if SV_EXPF_USE_FEXPA + return special_case (x, sv_fma_f32_x (pg, poly, scale, scale), + is_special_case); +#else + return specialcase (pg, poly, n, e, is_special_case, scale); +#endif + + return sv_fma_f32_x (pg, poly, scale, scale); +} + +PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) + +PL_SIG (SV, F, 1, exp, -9.9, 9.9) +PL_TEST_ULP (__sv_expf, 1.46) +PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000) +PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000) +PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000) +PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000) +PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000) +PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000) +PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000) +PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000) +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_data.c 
b/contrib/arm-optimized-routines/pl/math/sv_expf_data.c new file mode 100644 index 000000000000..6875adf857b6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients copied from the polynomial in math/v_expf.c. */ +const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, + 0x1.fffdb6p-2f, 0x1.ffffecp-1f}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c new file mode 100644 index 000000000000..884e2011d2f8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c @@ -0,0 +1,89 @@ +/* + * Double-precision SVE log10(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define OFF 0x3fe6900900000000 +#define N (1 << V_LOG10_TABLE_BITS) + +#define A(i) __v_log10_data.poly[i] + +static inline sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +{ + return sv_call_f64 (log10, x, y, special); +} + +/* SVE log10 algorithm. Maximum measured error is 2.46 ulps. + __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +sv_f64_t +__sv_log10_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + + svbool_t is_special_case + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x07ff0 - 0x0010); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + sv_u64_t i + = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N); + sv_f64_t k + = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); + sv_f64_t z = sv_as_f64_u64 ( + svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + + /* log(x) = k*log(2) + log(c) + log(z/c). */ + + sv_u64_t idx = svmul_n_u64_x (pg, i, 2); + sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx); + sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx); + + /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): + r = z/c - 1 (we look up precomputed 1/c) + log(z/c) ~= P(r). */ + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + + /* hi = log(c) + k*log(2). */ + sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc); + sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); + sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); + y = sv_fma_n_f64_x (pg, A (4), r2, y); + y = sv_fma_f64_x (pg, y, r2, p); + y = sv_fma_f64_x (pg, y, r2, hi); + + if (unlikely (svptest_any (pg, is_special_case))) + { + return specialcase (x, y, is_special_case); + } + return y; +} + +PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10) + +PL_SIG (SV, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (__sv_log10, 1.97) +PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c new file mode 100644 index 
000000000000..e7b1e9801fa9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c @@ -0,0 +1,88 @@ +/* + * Single-precision SVE log10 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define SpecialCaseMin 0x00800000 +#define SpecialCaseMax 0x7f800000 +#define Offset 0x3f2aaaab /* 0.666667. */ +#define Mask 0x007fffff +#define Ln2 0x1.62e43p-1f /* 0x3f317218. */ +#define InvLn10 0x1.bcb7b2p-2f + +#define P(i) __v_log10f_poly[i] + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + return sv_call_f32 (log10f, x, y, special); +} + +/* Optimised implementation of SVE log10f using the same algorithm and + polynomial as v_log10f. Maximum error is 3.31ulps: + __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +sv_f32_t +__sv_log10f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + svbool_t special_cases + = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin), + SpecialCaseMax - SpecialCaseMin); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + ix = svsub_n_u32_x (pg, ix, Offset); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix), + 23)); /* signextend. */ + ix = svand_n_u32_x (pg, ix, Mask); + ix = svadd_n_u32_x (pg, ix, Offset); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f); + + /* y = log10(1+r) + n*log10(2) + log10(1+r) ~ r * InvLn(10) + P(r) + where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3) + + P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67))) + and Qij = Pi + r * Pj. 
*/ + sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56); + y = sv_fma_f32_x (pg, y, r2, q34); + y = sv_fma_f32_x (pg, y, r2, q12); + + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less + accurate. */ + sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10)); + + if (unlikely (svptest_any (pg, special_cases))) + { + return special_case (x, y, special_cases); + } + return y; +} + +PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f) + +PL_SIG (SV, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (__sv_log10f, 2.82) +PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c new file mode 100644 index 000000000000..a0815bb5646f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c @@ -0,0 +1,85 @@ +/* + * Double-precision SVE log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvLn2 sv_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF 0x3fe6900900000000 +#define P(i) sv_f64 (__v_log2_data.poly[i]) + +NOINLINE static sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp) +{ + return sv_call_f64 (log2, x, y, cmp); +} + +/* Double-precision SVE log2 routine. Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +sv_f64_t +__sv_log2_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + + svbool_t special + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + sv_u64_t i + = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N); + sv_f64_t k + = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); + sv_f64_t z = sv_as_f64_u64 ( + svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + + sv_u64_t idx = svmul_n_u64_x (pg, i, 2); + sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx); + sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c); + + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2)); + sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0)); + sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23); + y = sv_fma_f64_x (pg, y, r2, p_01); + y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w)); + + if (unlikely (svptest_any (pg, special))) + { + return specialcase (x, y, special); + } + return y; +} + +PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2) + +PL_SIG (SV, D, 1, log2, 0.01, 11.1) +PL_TEST_ULP (__sv_log2, 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2) +PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000) +PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000) + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c new file mode 100644 index 000000000000..fe2ab16b90b7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c @@ -0,0 +1,79 @@ +/* + * Single-precision vector/SVE log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define P(i) __v_log2f_data.poly[i] + +#define Ln2 (0x1.62e43p-1f) /* 0x3f317218. */ +#define Min (0x00800000) +#define Max (0x7f800000) +#define Mask (0x007fffff) +#define Off (0x3f2aaaab) /* 0.666667. */ + +static NOINLINE sv_f32_t +specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (log2f, x, y, cmp); +} + +/* Optimised implementation of SVE log2f, using the same algorithm + and polynomial as Neon log2f. 
Maximum error is 2.48 ULPs: + __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +sv_f32_t +__sv_log2f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t u = sv_as_u32_f32 (x); + svbool_t special + = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_n_u32_x (pg, u, Off); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), + 23)); /* Sign-extend. */ + u = svand_n_u32_x (pg, u, Mask); + u = svadd_n_u32_x (pg, u, Off); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + + /* y = log2(1+r) + n. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + + /* Evaluate polynomial using pairwise Horner scheme. */ + sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t y; + y = sv_fma_n_f32_x (pg, P (8), r2, p67); + y = sv_fma_f32_x (pg, y, r2, p45); + y = sv_fma_f32_x (pg, y, r2, p23); + y = sv_fma_f32_x (pg, y, r2, p01); + y = sv_fma_f32_x (pg, y, r, n); + + if (unlikely (svptest_any (pg, special))) + return specialcase (x, y, special); + return y; +} + +PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f) + +PL_SIG (SV, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (__sv_log2f, 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f) +PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000) +PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000) + +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c new file mode 100644 index 000000000000..7f06fd31ebf1 --- /dev/null +++ 
b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c @@ -0,0 +1,85 @@ +/* + * Double-precision SVE log(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define A(i) __sv_log_data.poly[i] +#define Ln2 (0x1.62e42fefa39efp-1) +#define N (1 << SV_LOG_TABLE_BITS) +#define OFF (0x3fe6900900000000) + +double +optr_aor_log_f64 (double); + +static NOINLINE sv_f64_t +__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (optr_aor_log_f64, x, y, cmp); +} + +/* SVE port of Neon log algorithm from math/. + Maximum measured error is 2.17 ulp: + __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +sv_f64_t +__sv_log_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), + sv_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power + of 2. */ + sv_u64_t i + = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)), + N - 1); + sv_s64_t k + = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift. */ + sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)); + sv_f64_t z = sv_as_f64_u64 (iz); + /* Lookup in 2 global lists (length N). */ + sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i); + sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
*/ + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + sv_f64_t kd = sv_to_f64_s64_x (pg, k); + /* hi = r + log(c) + k*Ln2. */ + sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r)); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); + sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); + y = sv_fma_n_f64_x (pg, A (4), r2, y); + y = sv_fma_f64_x (pg, y, r2, p); + y = sv_fma_f64_x (pg, y, r2, hi); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_log_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_log_x, _ZGVsMxv_log) + +PL_SIG (SV, D, 1, log, 0.01, 11.1) +PL_TEST_ULP (__sv_log, 1.68) +PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log, 100, inf, 50000) +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_data.c b/contrib/arm-optimized-routines/pl/math/sv_log_data.c new file mode 100644 index 000000000000..77f9989444f5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log_data.c @@ -0,0 +1,146 @@ +/* + * Coefficients for double-precision SVE log(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct sv_log_data __sv_log_data = { + /* All coefficients and table entries are copied from the Neon routine in + math/. See math/v_log_data.c for an explanation of the algorithm. 
*/ + + .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0, + 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0, + 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0, + 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0, + 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0, + 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0, + 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0, + 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0, + 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0, + 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0, + 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0, + 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0, + 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0, + 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0, + 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0, + 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0, + 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0, + 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0, + 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0, + 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0, + 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0, + 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0, + 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0, + 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0, + 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0, + 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0, + 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0, + 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0, + 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0, + 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0, + 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0, + 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0, + 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0, + 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0, + 0x1.07321489b13eap+0, 0x1.062491aee9904p+0, + 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0, + 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0, + 0x1.010037d38bcc2p+0, 1.0, + 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1, + 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1, + 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1, + 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1, + 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1, + 0x1.d77a71145a2dap-1, 
0x1.d41c51166623ep-1, + 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1, + 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1, + 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1, + 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1, + 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1, + 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1, + 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1, + 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1, + 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1, + 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1, + 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1, + 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1, + 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1, + 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1, + 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1, + 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1, + 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1, + 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1, + 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1, + 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1}, + + .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2, + -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2, + -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2, + -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2, + -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2, + -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2, + -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2, + -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2, + -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2, + -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3, + -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3, + -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3, + -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3, + -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3, + -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3, + -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3, + -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3, + -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3, + -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3, + -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3, + -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3, + -0x1.1aa0229f25ec2p-3, 
-0x1.117655ddebc3bp-3, + -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4, + -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4, + -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4, + -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4, + -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4, + -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4, + -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4, + -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4, + -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5, + -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5, + -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5, + -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5, + -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6, + -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6, + -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7, + -0x1.ff6fe1feb4e53p-9, 0.0, + 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7, + 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6, + 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5, + 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5, + 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4, + 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4, + 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4, + 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4, + 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3, + 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3, + 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3, + 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3, + 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3, + 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3, + 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3, + 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3, + 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3, + 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3, + 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2, + 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2, + 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2, + 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2, + 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2, + 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2, + 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2, + 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2}, + + .poly = 
{-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3}, +}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c new file mode 100644 index 000000000000..11f0b8aa12c5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c @@ -0,0 +1,77 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define P(i) __sv_logf_poly[i] + +#define Ln2 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min (0x00800000) +#define Max (0x7f800000) +#define Mask (0x007fffff) +#define Off (0x3f2aaaab) /* 0.666667 */ + +float +optr_aor_log_f32 (float); + +static NOINLINE sv_f32_t +__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (optr_aor_log_f32, x, y, cmp); +} + +/* Optimised implementation of SVE logf, using the same algorithm and polynomial + as the Neon routine in math/. Maximum error is 3.34 ULPs: + __sv_logf(0x1.557298p+0) got 0x1.26edecp-2 + want 0x1.26ede6p-2. */ +sv_f32_t +__sv_logf_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t u = sv_as_u32_f32 (x); + svbool_t cmp + = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_n_u32_x (pg, u, Off); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), + 23)); /* Sign-extend. */ + u = svand_n_u32_x (pg, u, Mask); + u = svadd_n_u32_x (pg, u, Off); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + + /* y = log(1+r) + n*ln2. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
*/ + sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2))); + sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4))); + sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6))); + p = sv_fma_n_f32_x (pg, P (0), r2, p); + q = sv_fma_f32_x (pg, p, r2, q); + y = sv_fma_f32_x (pg, q, r2, y); + p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, p); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_logf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf) + +PL_SIG (SV, F, 1, log, 0.01, 11.1) +PL_TEST_ULP (__sv_logf, 2.85) +PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000) +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_data.c b/contrib/arm-optimized-routines/pl/math/sv_logf_data.c new file mode 100644 index 000000000000..51dd7a7eeb37 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_logf_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision SVE log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +const float __sv_logf_poly[] = { + /* Copied from coeffs for the Neon routine in math/. */ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, + -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, +}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_math.h b/contrib/arm-optimized-routines/pl/math/sv_math.h new file mode 100644 index 000000000000..5ef0ad3bd5e0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_math.h @@ -0,0 +1,245 @@ +/* + * Wrapper functions for SVE ACLE. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_MATH_H +#define SV_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +#define WANT_VMATH 1 +#endif +#if WANT_VMATH + +#if WANT_SVE_MATH +#define SV_SUPPORTED 1 + +#include +#include + +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +typedef svfloat64_t sv_f64_t; +typedef svuint64_t sv_u64_t; +typedef svint64_t sv_s64_t; + +typedef svfloat32_t sv_f32_t; +typedef svuint32_t sv_u32_t; +typedef svint32_t sv_s32_t; + +/* Double precision. */ +static inline sv_s64_t +sv_s64 (s64_t x) +{ + return svdup_n_s64 (x); +} + +static inline sv_u64_t +sv_u64 (u64_t x) +{ + return svdup_n_u64 (x); +} + +static inline sv_f64_t +sv_f64 (f64_t x) +{ + return svdup_n_f64 (x); +} + +static inline sv_f64_t +sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z) +{ + return svmla_f64_x (pg, z, x, y); +} + +/* res = z + x * y with x scalar. 
*/ +static inline sv_f64_t +sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z) +{ + return svmla_n_f64_x (pg, z, y, x); +} + +static inline sv_s64_t +sv_as_s64_u64 (sv_u64_t x) +{ + return svreinterpret_s64_u64 (x); +} + +static inline sv_u64_t +sv_as_u64_f64 (sv_f64_t x) +{ + return svreinterpret_u64_f64 (x); +} + +static inline sv_f64_t +sv_as_f64_u64 (sv_u64_t x) +{ + return svreinterpret_f64_u64 (x); +} + +static inline sv_f64_t +sv_to_f64_s64_x (svbool_t pg, sv_s64_t s) +{ + return svcvt_f64_x (pg, s); +} + +static inline sv_f64_t +sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f64_t elem = svclastb_n_f64 (p, 0, x); + elem = (*f) (elem); + sv_f64_t y2 = svdup_n_f64 (elem); + y = svsel_f64 (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +static inline sv_f64_t +sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, + svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f64_t elem1 = svclastb_n_f64 (p, 0, x1); + f64_t elem2 = svclastb_n_f64 (p, 0, x2); + f64_t ret = (*f) (elem1, elem2); + sv_f64_t y2 = svdup_n_f64 (ret); + y = svsel_f64 (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +/* Load array of uint64_t into svuint64_t. */ +static inline sv_u64_t +sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx) +{ + return svld1_gather_u64index_u64 (pg, tab, idx); +} + +/* Load array of double into svfloat64_t. */ +static inline sv_f64_t +sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx) +{ + return svld1_gather_u64index_f64 (pg, tab, idx); +} + +static inline sv_u64_t +sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y) +{ + sv_u64_t q = svdiv_n_u64_x (pg, x, y); + return svmls_n_u64_x (pg, x, q, y); +} + +/* Single precision. 
*/ +static inline sv_s32_t +sv_s32 (s32_t x) +{ + return svdup_n_s32 (x); +} + +static inline sv_u32_t +sv_u32 (u32_t x) +{ + return svdup_n_u32 (x); +} + +static inline sv_f32_t +sv_f32 (f32_t x) +{ + return svdup_n_f32 (x); +} + +static inline sv_f32_t +sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z) +{ + return svmla_f32_x (pg, z, x, y); +} + +/* res = z + x * y with x scalar. */ +static inline sv_f32_t +sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z) +{ + return svmla_n_f32_x (pg, z, y, x); +} + +static inline sv_u32_t +sv_as_u32_f32 (sv_f32_t x) +{ + return svreinterpret_u32_f32 (x); +} + +static inline sv_f32_t +sv_as_f32_u32 (sv_u32_t x) +{ + return svreinterpret_f32_u32 (x); +} + +static inline sv_s32_t +sv_as_s32_u32 (sv_u32_t x) +{ + return svreinterpret_s32_u32 (x); +} + +static inline sv_f32_t +sv_to_f32_s32_x (svbool_t pg, sv_s32_t s) +{ + return svcvt_f32_x (pg, s); +} + +static inline sv_s32_t +sv_to_s32_f32_x (svbool_t pg, sv_f32_t x) +{ + return svcvt_s32_f32_x (pg, x); +} + +static inline sv_f32_t +sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f32_t elem = svclastb_n_f32 (p, 0, x); + elem = (*f) (elem); + sv_f32_t y2 = svdup_n_f32 (elem); + y = svsel_f32 (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +static inline sv_f32_t +sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y, + svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f32_t elem1 = svclastb_n_f32 (p, 0, x1); + f32_t elem2 = svclastb_n_f32 (p, 0, x2); + f32_t ret = (*f) (elem1, elem2); + sv_f32_t y2 = svdup_n_f32 (ret); + y = svsel_f32 (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +#endif +#endif +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_powi.c b/contrib/arm-optimized-routines/pl/math/sv_powi.c new file mode 100644 index 
000000000000..1bb0eb3d3498 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_powi.c @@ -0,0 +1,53 @@ +/* + * Double-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +/* Optimized double-precision vector powi (double base, long integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat64_t +__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat64_t acc = svdup_n_f64 (1.0); + svbool_t want_recip = svcmplt_n_s64 (p, ns, 0); + svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint64_t max_n = svmaxv_u64 (p, ns_abs); + + svfloat64_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull); + acc = svmul_f64_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_n_u64_x (p, ns_abs, 1); + c = svmul_f64_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. 
*/ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_n_f64_m (want_recip, acc, 1.0); + + return acc; +} + +strong_alias (__sv_powi_x, _ZGVsMxvv_powk) + +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_powif.c b/contrib/arm-optimized-routines/pl/math/sv_powif.c new file mode 100644 index 000000000000..d0567e393927 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_powif.c @@ -0,0 +1,54 @@ +/* + * Single-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +/* Optimized single-precision vector powi (float base, integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat32_t +__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat32_t acc = svdup_n_f32 (1.f); + svbool_t want_recip = svcmplt_n_s32 (p, ns, 0); + svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint32_t max_n = svmaxv_u32 (p, ns_abs); + + svfloat32_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1); + acc = svmul_f32_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_n_u32_x (p, ns_abs, 1); + c = svmul_f32_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. */ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_n_f32_m (want_recip, acc, 1.0f); + + return acc; +} + +/* Note no trailing f for ZGV... 
name - 64-bit integer version is powk. */ +strong_alias (__sv_powif_x, _ZGVsMxvv_powi) + +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c b/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c new file mode 100644 index 000000000000..3fee08061918 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c @@ -0,0 +1,89 @@ +/* + * Double-precision SVE sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) +#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +#define Shift (sv_f64 (0x1.8p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (sin, x, y, cmp); +} + +/* A fast SVE implementation of sin based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum observed error in 2.52 ULP: + __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 + want 0x1.10ace8f3e7868p-40. */ +sv_f64_t +__sv_sin_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + sv_u64_t sign; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); + n = svsub_f64_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). 
*/ + r = sv_fma_f64_x (pg, NegPio2_1, n, r); + r = sv_fma_f64_x (pg, NegPio2_2, n, r); + r = sv_fma_f64_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); + + /* sin(r) poly approx. */ + r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); + y = sv_f64 (0.0); + y = svtmad_f64 (y, r2, 7); + y = svtmad_f64 (y, r2, 6); + y = svtmad_f64 (y, r2, 5); + y = svtmad_f64 (y, r2, 4); + y = svtmad_f64 (y, r2, 3); + y = svtmad_f64 (y, r2, 2); + y = svtmad_f64 (y, r2, 1); + y = svtmad_f64 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f64_x (pg, f, y); + + /* sign = y^sign. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sin_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin) + +PL_SIG (SV, D, 1, sin, -3.1, 3.1) +PL_TEST_ULP (__sv_sin, 2.03) +PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c new file mode 100644 index 000000000000..9184ccd3cf0c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c @@ -0,0 +1,84 @@ +/* + * Single-precision SVE sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define A3 (sv_f32 (__sv_sinf_data.coeffs[3])) +#define A5 (sv_f32 (__sv_sinf_data.coeffs[2])) +#define A7 (sv_f32 (__sv_sinf_data.coeffs[1])) +#define A9 (sv_f32 (__sv_sinf_data.coeffs[0])) + +#define NegPi1 (sv_f32 (-0x1.921fb6p+1f)) +#define NegPi2 (sv_f32 (0x1.777a5cp-24f)) +#define NegPi3 (sv_f32 (0x1.ee59dap-49f)) +#define RangeVal (sv_f32 (0x1p20f)) +#define InvPi (sv_f32 (0x1.45f306p-2f)) +#define Shift (sv_f32 (0x1.8p+23f)) +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (sinf, x, y, cmp); +} + +/* A fast SVE implementation of sinf. + Maximum error: 1.89 ULPs. + This maximum error is achieved at multiple values in [-2^18, 2^18] + but one example is: + __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */ +sv_f32_t +__sv_sinf_x (sv_f32_t x, const svbool_t pg) +{ + sv_f32_t n, r, r2, y; + sv_u32_t sign, odd; + svbool_t cmp; + + r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); + sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask); + cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/pi). */ + n = sv_fma_f32_x (pg, InvPi, r, Shift); + odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31); + n = svsub_f32_x (pg, n, Shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = sv_fma_f32_x (pg, NegPi1, n, r); + r = sv_fma_f32_x (pg, NegPi2, n, r); + r = sv_fma_f32_x (pg, NegPi3, n, r); + + /* sin(r) approx using a degree 9 polynomial from the Taylor series + expansion. Note that only the odd terms of this are non-zero. 
*/ + r2 = svmul_f32_x (pg, r, r); + y = sv_fma_f32_x (pg, A9, r2, A7); + y = sv_fma_f32_x (pg, y, r2, A5); + y = sv_fma_f32_x (pg, y, r2, A3); + y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r); + + /* sign = y^sign^odd. */ + y = sv_as_f32_u32 ( + sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd))); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sinf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf) + +PL_SIG (SV, F, 1, sin, -3.1, 3.1) +PL_TEST_ULP (__sv_sinf, 1.40) +PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c new file mode 100644 index 000000000000..1e1ab5e48df1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c @@ -0,0 +1,19 @@ +/* + * Data used in single-precision sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating sin(x) in single + precision. These are the non-zero coefficients from the + degree 9 Taylor series expansion of sin. */ + +const struct sv_sinf_data __sv_sinf_data = {.coeffs = { + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, + }}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c new file mode 100644 index 000000000000..cca43bd886fd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c @@ -0,0 +1,112 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +/* Constants. */ +#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) +#define InvPio2 (sv_f32 (0x1.45f306p-1f)) +#define RangeVal (sv_f32 (0x1p15f)) +#define Shift (sv_f32 (0x1.8p+23f)) + +#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i]) + +/* Use full Estrin's scheme to evaluate polynomial. */ +static inline sv_f32_t +eval_poly (svbool_t pg, sv_f32_t z) +{ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t z4 = svmul_f32_x (pg, z2, z2); + sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0)); + sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2)); + sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4)); + sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10); + sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10); + return y; +} + +static NOINLINE sv_f32_t +__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (tanf, x, y, cmp); +} + +/* Fast implementation of SVE tanf. + Maximum error is 3.45 ULP: + __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +sv_f32_t +__sv_tanf_x (sv_f32_t x, const svbool_t pg) +{ + /* Determine whether input is too large to perform fast regression. */ + svbool_t cmp = svacge_f32 (pg, x, RangeVal); + svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0)); + + /* n = rint(x/(pi/2)). */ + sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift); + sv_f32_t n = svsub_f32_x (pg, q, Shift); + /* n is already a signed integer, simply convert it. */ + sv_s32_t in = sv_to_s32_f32_x (pg, n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1)); + svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0)); + + /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). 
*/ + sv_f32_t r; + r = sv_fma_f32_x (pg, NegPio2_1, n, x); + r = sv_fma_f32_x (pg, NegPio2_2, n, r); + r = sv_fma_f32_x (pg, NegPio2_3, n, r); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. */ + + /* Perform additional reduction if required. */ + sv_f32_t z = svneg_f32_m (r, pred_alt, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t p = eval_poly (pg, z2); + sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); + + /* Transform result back, if necessary. */ + sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); + y = svsel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. */ + y = svsel_f32 (pred_minuszero, x, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. 
*/ + if (unlikely (svptest_any (pg, cmp))) + return __sv_tanf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) + +PL_SIG (SV, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (__sv_tanf, 2.96) +PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) +PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/tanf_3u3.c b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c new file mode 100644 index 000000000000..ec006dc04c4c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c @@ -0,0 +1,202 @@ +/* + * Single-precision scalar tan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "pairwise_hornerf.h" + +/* Useful constants. */ +#define NegPio2_1 (-0x1.921fb6p+0f) +#define NegPio2_2 (0x1.777a5cp-25f) +#define NegPio2_3 (0x1.ee59dap-50f) +/* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */ +#define RangeVal (0x1p17f) +#define InvPio2 ((0x1.45f306p-1f)) +#define Shift (0x1.8p+23f) +#define AbsMask (0x7fffffff) +#define Pio4 (0x1.921fb6p-1) +/* 2PI * 2^-64. */ +#define Pio2p63 (0x1.921FB54442D18p-62) + +#define P(i) __tanf_poly_data.poly_tan[i] +#define Q(i) __tanf_poly_data.poly_cotan[i] + +static inline float +eval_P (float z) +{ + return PAIRWISE_HORNER_5 (z, z * z, P); +} + +static inline float +eval_Q (float z) +{ + return PAIRWISE_HORNER_3 (z, z * z, Q); +} + +/* Reduction of the input argument x using Cody-Waite approach, such that x = r + + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. 
*/ +static inline float +reduce (float x, int32_t *in) +{ + /* n = rint(x/(pi/2)). */ + float r = x; + float q = fmaf (InvPio2, r, Shift); + float n = q - Shift; + /* There is no rounding here, n is representable by a signed integer. */ + *in = (int32_t) n; + /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */ + r = fmaf (NegPio2_1, n, r); + r = fmaf (NegPio2_2, n, r); + r = fmaf (NegPio2_3, n, r); + return r; +} + +/* Table with 4/PI to 192 bit precision. To avoid unaligned accesses + only 8 new bits are added per entry, making the table 4 times larger. */ +static const uint32_t __inv_pio4[24] + = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44, + 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, + 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62, + 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041}; + +/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. + XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). + Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. + Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit + multiply computes the exact 2.62-bit fixed-point modulo. Since the result + can have at most 29 leading zeros after the binary point, the double + precision result is accurate to 33 bits. */ +static inline double +reduce_large (uint32_t xi, int *np) +{ + const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; + int shift = (xi >> 23) & 7; + uint64_t n, res0, res1, res2; + + xi = (xi & 0xffffff) | 0x800000; + xi <<= shift; + + res0 = xi * arr[0]; + res1 = (uint64_t) xi * arr[4]; + res2 = (uint64_t) xi * arr[8]; + res0 = (res2 >> 32) | (res0 << 32); + res0 += res1; + + n = (res0 + (1ULL << 61)) >> 62; + res0 -= n << 62; + double x = (int64_t) res0; + *np = n; + return x * Pio2p63; +} + +/* Top 12 bits of the float representation with the sign bit cleared. 
*/ +static inline uint32_t +top12 (float x) +{ + return (asuint (x) >> 20); +} + +/* Fast single-precision tan implementation. + Maximum ULP error: 3.293ulps. + tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ +float +tanf (float x) +{ + /* Get top words. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + + /* Dispatch between no reduction (small numbers), fast reduction and + slow large numbers reduction. The reduction step determines r float + (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ + int32_t n; + float r; + if (ia12 < top12 (Pio4)) + { + /* Optimize small values. */ + if (unlikely (ia12 < top12 (0x1p-12f))) + { + if (unlikely (ia12 < top12 (0x1p-126f))) + /* Force underflow for tiny x. */ + force_eval_float (x * x); + return x; + } + + /* tan (x) ~= x + x^3 * P(x^2). */ + float x2 = x * x; + float y = eval_P (x2); + return fmaf (x2, x * y, x); + } + /* Similar to other trigonometric routines, fast inaccurate reduction is + performed for values of x from pi/4 up to RangeVal. In order to keep errors + below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for + other trigonometric routines. Above this value more advanced but slower + reduction techniques need to be implemented to reach a similar accuracy. + */ + else if (ia12 < top12 (RangeVal)) + { + /* Fast inaccurate reduction. */ + r = reduce (x, &n); + } + else if (ia12 < 0x7f8) + { + /* Slow accurate reduction. */ + uint32_t sign = ix & ~AbsMask; + double dar = reduce_large (ia, &n); + float ar = (float) dar; + r = asfloat (asuint (ar) ^ sign); + } + else + { + /* tan(Inf or NaN) is NaN. */ + return __math_invalidf (x); + } + + /* If x lives in an interval where |tan(x)| + - is finite then use an approximation of tangent in the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). 
+ - grows to infinity then use an approximation of cotangent in the form + cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. + Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), + we only need to change the sign of r to obtain tan(x) from cotan(r). + This 2-interval approach requires 2 different sets of coefficients P and + Q, where Q is a lower order polynomial than P. */ + + /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ + uint32_t alt = (uint32_t) n & 1; + + /* Perform additional reduction if required. */ + float z = alt ? -r : r; + + /* Prepare backward transformation. */ + float z2 = r * r; + float offset = alt ? 1.0f / z : z; + float scale = alt ? z : z * z2; + + /* Evaluate polynomial approximation of tan or cotan. */ + float p = alt ? eval_Q (z2) : eval_P (z2); + + /* A unified way of assembling the result on both interval types. */ + return fmaf (scale, p, offset); +} + +PL_SIG (S, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (tanf, 2.80) +PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000) +PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000) +PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000) +PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000) +PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000) +PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000) +PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000) +PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/tanf_data.c b/contrib/arm-optimized-routines/pl/math/tanf_data.c new file mode 100644 index 000000000000..a6b9d512eed2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanf_data.c @@ -0,0 +1,45 @@ +/* + * Data used in single-precision tan(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct tanf_poly_data __tanf_poly_data = { +.poly_tan = { +/* Coefficients generated using: + poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a*a;b*b]); + optimize relative error + final prec : 23 bits + deg : 5 + a : 0x1p-126 ^ 2 + b : ((pi) / 0x1p2) ^ 2 + dirty rel error: 0x1.f7c2e4p-25 + dirty abs error: 0x1.f7c2ecp-25. */ +0x1.55555p-2, +0x1.11166p-3, +0x1.b88a78p-5, +0x1.7b5756p-6, +0x1.4ef4cep-8, +0x1.0e1e74p-7 +}, +.poly_cotan = { +/* Coefficients generated using: + fpminimax(f(x) = (0x1p0 / tan(sqrt(x)) - 0x1p0 / sqrt(x)) / sqrt(x), deg, [|dtype ...|], [a;b]) + optimize a single polynomial + optimize absolute error + final prec : 23 bits + working prec : 128 bits + deg : 3 + a : 0x1p-126 + b : (pi) / 0x1p2 + dirty rel error : 0x1.81298cp-25 + dirty abs error : 0x1.a8acf4p-25. */ +-0x1.55555p-2, /* -0.33333325. */ +-0x1.6c23e4p-6, /* -2.2225354e-2. */ +-0x1.12dbap-9, /* -2.0969994e-3. */ +-0x1.05a1c2p-12, /* -2.495116e-4. */ +} +}; diff --git a/contrib/arm-optimized-routines/pl/math/tanh_3u.c b/contrib/arm-optimized-routines/pl/math/tanh_3u.c new file mode 100644 index 000000000000..46d9fb3fd7e1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanh_3u.c @@ -0,0 +1,82 @@ +/* + * Double-precision tanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define InvLn2 0x1.71547652b82fep0 +#define Ln2hi 0x1.62e42fefa39efp-1 +#define Ln2lo 0x1.abc9e3b39803fp-56 +#define Shift 0x1.8p52 +#define C(i) __expm1_poly[i] + +#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ +#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). 
*/ +#define One 0x3ff0000000000000 + +static inline double +expm1_inline (double x) +{ + /* Helper routine for calculating exp(x) - 1. Copied from expm1_2u5.c, with + several simplifications: + - No special-case handling for tiny or special values. + - Simpler combination of p and t in final stage of the algorithm. + - Use shift-and-add instead of ldexp to calculate t. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. */ + double f2 = f * f; + double f4 = f2 * f2; + double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* t = 2 ^ i. */ + double t = asdouble ((uint64_t) (i + 1023) << 52); + /* expm1(x) = p * t + (t - 1). */ + return fma (p, t, t - 1); +} + +/* Approximation for double-precision tanh(x), using a simplified version of + expm1. The greatest observed error is 2.75 ULP: + tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 + want -0x1.ba31ba4691ab4p-3. */ +double +tanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (ia > BoringBound)) + { + if (ia > 0x7ff0000000000000) + return __math_invalid (x); + return asdouble (One | sign); + } + + if (unlikely (ia < TinyBound)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ + double q = expm1_inline (2 * x); + return q / (q + 2); +} + +PL_SIG (S, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (tanh, 2.26) +PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000) +PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000) +PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000) +PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000) +PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000) +PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c new file mode 100644 index 000000000000..76e54a438e57 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c @@ -0,0 +1,91 @@ +/* + * Single-precision tanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff +#define One 0x3f800000 + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) + +#define C(i) __expm1f_poly[i] + +static inline float +expm1f_inline (float x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from expm1f_1u6.c, with several simplifications: + - No special-case handling for tiny or special values, instead return early + from the main routine. + - No special handling for large values: + - No early return for infinity. + - Simpler combination of p and t in final stage of algorithm. + - |i| < 27, so can calculate t by simpler shift-and-add, instead of + ldexpf (same as vector algorithm). */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. 
*/ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main expm1f routine uses Horner. */ + float f2 = f * f; + float p_01 = fmaf (f, C (1), C (0)); + float p_23 = fmaf (f, C (3), C (2)); + float p = fmaf (f2, p_23, p_01); + p = fmaf (f2 * f2, C (4), p); + p = fmaf (f2, p, f); + + /* t = 2^i. */ + float t = asfloat ((uint32_t) (i + 127) << 23); + /* expm1(x) ~= p * t + (t - 1). */ + return fmaf (p, t, t - 1); +} + +/* Approximation for single-precision tanh(x), using a simplified version of + expm1f. The maximum error is 2.58 ULP: + tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +float +tanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax > BoringBound)) + { + if (iax > 0x7f800000) + return __math_invalidf (x); + return asfloat (One | sign); + } + + if (unlikely (iax < 0x34000000)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float q = expm1f_inline (2 * x); + return q / (q + 2); +} + +PL_SIG (S, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (tanhf, 2.09) +PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000) +PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) +PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h new file mode 100644 index 000000000000..e0f6ac70912c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h @@ -0,0 +1,86 @@ +// clang-format off +/* + * Function entries for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _ZSF1(fun, a, b) F(fun##f, a, b) +#define _ZSD1(f, a, b) D(f, a, b) + +#ifdef __vpcs + +#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b) +#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b) + +#elif __aarch64__ + +#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) +#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) + +#elif WANT_VMATH + +#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) +#define _ZVD1(f, a, b) D(__s_##f, a, b) + +#else + +#define _ZVF1(f, a, b) +#define _ZVD1(f, a, b) + +#endif + +#if WANT_SVE_MATH + +#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b) +#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b) + +#else + +#define _ZSVF1(f, a, b) +#define _ZSVD1(f, a, b) + +#endif + +/* No auto-generated wrappers for binary functions - they have to be + manually defined in mathbench_wrappers.h. We have to define silent + macros for them anyway as they will be emitted by PL_SIG. */ +#define _ZSF2(...) +#define _ZSD2(...) +#define _ZVF2(...) +#define _ZVD2(...) +#define _ZSVF2(...) +#define _ZSVD2(...) + +#include "mathbench_funcs_gen.h" + +/* PL_SIG only emits entries for unary functions, since if a function + needs to be wrapped in mathbench there is no way for it to know the + name of the wrapper. Add entries for binary functions, or any other + exotic signatures that need wrapping, below. 
*/ + +{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, +{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, +{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, + +{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, +{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, +{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, +{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, +{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, +{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, +{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, +{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, + +#if WANT_SVE_MATH +{"__sv_atan2f_x", 'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, +{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, +{"__sv_atan2_x", 'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, +{"_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, +{"__sv_powif_x", 'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}}, +{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, +{"__sv_powi_x", 'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}}, +{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, +#endif + // clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h new file mode 100644 index 000000000000..eba960eb96ac --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h @@ -0,0 +1,133 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +static double +atan2_wrap (double x) +{ + return atan2 (5.0, x); +} + +static float +atan2f_wrap (float x) +{ + return atan2f (5.0f, x); +} + +static double +powi_wrap (double x) +{ + return __builtin_powi (x, (int) round (x)); +} + +#if WANT_VMATH +#if __aarch64__ + +static double +__s_atan2_wrap (double x) +{ + return __s_atan2 (5.0, x); +} + +static float +__s_atan2f_wrap (float x) +{ + return __s_atan2f (5.0f, x); +} + +static v_double +__v_atan2_wrap (v_double x) +{ + return __v_atan2 (v_double_dup (5.0), x); +} + +static v_float +__v_atan2f_wrap (v_float x) +{ + return __v_atan2f (v_float_dup (5.0f), x); +} + +#ifdef __vpcs + +__vpcs static v_double +__vn_atan2_wrap (v_double x) +{ + return __vn_atan2 (v_double_dup (5.0), x); +} + +__vpcs static v_float +__vn_atan2f_wrap (v_float x) +{ + return __vn_atan2f (v_float_dup (5.0f), x); +} + +__vpcs static v_double +_Z_atan2_wrap (v_double x) +{ + return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); +} + +__vpcs static v_float +_Z_atan2f_wrap (v_float x) +{ + return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); +} + +#endif // __vpcs +#endif // __arch64__ +#endif // WANT_VMATH + +#if WANT_SVE_MATH + +static sv_float +__sv_atan2f_wrap (sv_float x, sv_bool pg) +{ + return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg); +} + +static sv_float +_Z_sv_atan2f_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg); +} + +static sv_double +__sv_atan2_wrap (sv_double x, sv_bool pg) +{ + return __sv_atan2_x (x, svdup_n_f64 (5.0), pg); +} + +static sv_double +_Z_sv_atan2_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg); +} + +static sv_float +_Z_sv_powi_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); +} + +static sv_float +__sv_powif_wrap (sv_float x, sv_bool pg) +{ + return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg); +} + +static sv_double 
+_Z_sv_powk_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); +} + +static sv_double +__sv_powi_wrap (sv_double x, sv_bool pg) +{ + return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg); +} + +#endif // WANT_SVE_MATH diff --git a/contrib/arm-optimized-routines/pl/math/test/pl_test.h b/contrib/arm-optimized-routines/pl/math/test/pl_test.h new file mode 100644 index 000000000000..467d1cac0c36 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/pl_test.h @@ -0,0 +1,33 @@ +/* + * PL macros for emitting various details about routines for consumption by + * runulp.sh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV + on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */ +#if !(V_SUPPORTED || SV_SUPPORTED) +#define PL_TEST_ULP(f, l) \ + PL_TEST_EXPECT_FENV_ALWAYS (f) \ + PL_TEST_ULP f l +#else +#define PL_TEST_ULP(f, l) PL_TEST_ULP f l +#endif + +/* Emit aliases to allow test params to be mapped from aliases back to their + aliasees. */ +#define PL_ALIAS(a, b) PL_TEST_ALIAS a b + +/* Emit routine name if e == 1 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally upon certain + build flags - defer expansion by one pass to allow those flags to be expanded + properly. 
*/ +#define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) +#define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) +#define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f +#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1) + +#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n +#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c diff --git a/contrib/arm-optimized-routines/pl/math/test/runulp.sh b/contrib/arm-optimized-routines/pl/math/test/runulp.sh new file mode 100755 index 000000000000..4d02530d44b1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/runulp.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# ULP error check script. +# +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +#set -x +set -eu + +# cd to bin directory. +cd "${0%/*}" + +flags="${ULPFLAGS:--q}" +emu="$@" + +# Enable SVE testing +WANT_SVE_MATH=${WANT_SVE_MATH:-0} + +FAIL=0 +PASS=0 + +t() { + key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}') + L=$(cat $LIMITS | grep "^$key " | awk '{print $2}') + [[ $L =~ ^[0-9]+\.[0-9]+$ ]] + extra_flags="" + [[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5" + grep -q "^$key$" $FENV || extra_flags="$extra_flags -f" + $emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) +} + +check() { + $emu ./ulp -f -q "$@" #>/dev/null +} + +# Regression-test for correct NaN handling in atan2 +check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 +check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan +check atan2 nan nan x -nan -nan + +# vector functions +flags="${ULPFLAGS:--q}" +runs= +check __s_log10f 1 && runs=1 +runv= +check __v_log10f 1 && runv=1 +runvn= +check __vn_log10f 1 && runvn=1 +runsv= +if [ $WANT_SVE_MATH -eq 1 ]; then +check __sv_cosf 0 && runsv=1 +check __sv_cos 0 && runsv=1 +check __sv_sinf 0 && runsv=1 +check __sv_sin 0 && runsv=1 +# No guarantees about 
powi accuracy, so regression-test for exactness +# w.r.t. the custom reference impl in ulp_wrappers.h +check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 +fi + +while read F LO HI N C +do + t $F $LO $HI $N $C +done << EOF +$(cat $INTERVALS) +EOF + +[ 0 -eq $FAIL ] || { + echo "FAILED $FAIL PASSED $PASS" + exit 1 +} diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosh.tst new file mode 100644 index 000000000000..dd962bd391da --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosh.tst @@ -0,0 +1,19 @@ +; acosh.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=acosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=acosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=acosh op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=acosh op1=3fefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=00000000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=80000000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bfefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=7fe01ac0.7f03a83e result=40862e50.541778f1.8cc error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acoshf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acoshf.tst new file mode 100644 index 000000000000..606c615f9b74 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acoshf.tst @@ -0,0 +1,19 @@ +; acoshf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acoshf op1=7fc00001 result=7fc00001 errno=0 +func=acoshf op1=ffc00001 result=7fc00001 errno=0 +func=acoshf op1=7f800001 result=7fc00001 errno=0 status=i +func=acoshf op1=ff800001 result=7fc00001 errno=0 status=i +func=acoshf op1=7f800000 result=7f800000 errno=0 +func=acoshf op1=3f800000 result=00000000 errno=0 +func=acoshf op1=3f7fffff result=7fc00001 errno=EDOM status=i +func=acoshf op1=00000000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=80000000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf7fffff result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf800000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=acoshf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=7f767efe result=42b2c19d.83e error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinh.tst new file mode 100644 index 000000000000..1485dfeffecf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinh.tst @@ -0,0 +1,18 @@ +; asinh.tst +; +; Copyright (c) 2022-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=asinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=asinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=asinh op1=fff00000.00000000 result=fff00000.00000000 errno=0 +func=asinh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=asinh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. 
Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=asinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=asinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinhf.tst new file mode 100644 index 000000000000..eb76a5892a70 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinhf.tst @@ -0,0 +1,18 @@ +; asinhf.tst +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinhf op1=7fc00001 result=7fc00001 errno=0 +func=asinhf op1=ffc00001 result=7fc00001 errno=0 +func=asinhf op1=7f800001 result=7fc00001 errno=0 status=i +func=asinhf op1=ff800001 result=7fc00001 errno=0 status=i +func=asinhf op1=7f800000 result=7f800000 errno=0 +func=asinhf op1=ff800000 result=ff800000 errno=0 +func=asinhf op1=00000000 result=00000000 errno=0 +func=asinhf op1=80000000 result=80000000 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=asinhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=asinhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan.tst new file mode 100644 index 000000000000..4c670553d58f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan.tst @@ -0,0 +1,22 @@ +; atan.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan op1=7ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan op1=fff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan op1=00000000.00000000 result=00000000.00000000 errno=0 +func=atan op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atan op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=atan op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux + +func=atan op1=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan op1=bff00000.00000000 result=bfe921fb.54442d18.469 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2.tst new file mode 100644 index 000000000000..647b3764072c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2.tst @@ -0,0 +1,110 @@ +; atan2.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=fff80000.00000001 
result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=fff00000.00000000 
result=4002d97c.7f3321d2.34f errno=0 +func=atan2 op1=7ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=bff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff00000.00000000 op2=7ff00000.00000000 result=bfe921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=fff00000.00000000 result=c002d97c.7f3321d2.34f errno=0 +func=atan2 op1=fff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=3ff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=bff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=00000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=00000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=00000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=00000000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=00000000.00000000 result=00000000.00000000 errno=0 +func=atan2 
op1=00000000.00000000 op2=80000000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=3ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=bff00000.00000000 result=400921fb.54442d18.469 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2 op1=00000000.00000001 op2=3ff00000.00000000 result=00000000.00000001 errno=0 maybestatus=ux +func=atan2 op1=80000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=80000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=80000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=80000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=80000000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=80000000.00000000 op2=00000000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=80000000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=80000000.00000000 op2=3ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=bff00000.00000000 result=c00921fb.54442d18.469 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2 op1=80000000.00000001 op2=3ff00000.00000000 result=80000000.00000001 errno=0 maybestatus=ux +func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=3ff00000.00000000 op2=fff00000.00000001 
result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=3ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=3ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=3ff00000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=bff00000.00000000 result=4002d97c.7f3321d2.34f errno=0 +func=atan2 op1=bff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=bff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=bff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=bff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=bff00000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=bff00000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=3ff00000.00000000 result=bfe921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=bff00000.00000000 result=c002d97c.7f3321d2.34f errno=0 +func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2f.tst new file mode 100644 index 
000000000000..85c5c5d47e10 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2f.tst @@ -0,0 +1,121 @@ +; atan2f.tst +; +; Copyright (c) 1999-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=7fc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ffc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=7f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ff800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=00000000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=80000000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=3f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=bf800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7fc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ffc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ff800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=00000000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=80000000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=3f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=bf800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=ffc00001 result=7fc00001 errno=0 +func=atan2f 
op1=7fc00001 op2=7f800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=ff800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=00000000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=80000000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=3f800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=bf800000 result=7fc00001 errno=0 +func=atan2f op1=ffc00001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ffc00001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ffc00001 op2=7fc00001 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=7f800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=ff800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=00000000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=80000000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=3f800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=bf800000 result=ffc00001 errno=0 +func=atan2f op1=7f800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=7f800000 op2=ffc00001 result=7fc00001 errno=0 +func=atan2f op1=7f800000 op2=7f800000 result=3f490fda.a22 errno=0 +func=atan2f op1=7f800000 op2=ff800000 result=4016cbe3.f99 errno=0 +func=atan2f op1=7f800000 op2=00000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=80000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=3f800000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=bf800000 result=3fc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=ff800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=ff800000 op2=7f800000 
result=bf490fda.a22 errno=0 +func=atan2f op1=ff800000 op2=ff800000 result=c016cbe3.f99 errno=0 +func=atan2f op1=ff800000 op2=00000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=80000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=3f800000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=bf800000 result=bfc90fda.a22 errno=0 +func=atan2f op1=00000000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=00000000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=00000000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=00000000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=00000000 op2=7f800000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=ff800000 result=40490fda.a22 errno=0 +func=atan2f op1=00000000 op2=00000000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=80000000 result=40490fda.a22 errno=0 +func=atan2f op1=00000000 op2=3f800000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=bf800000 result=40490fda.a22 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2f op1=00000001 op2=3f800000 result=00000001 errno=0 maybestatus=ux + +func=atan2f op1=80000000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=80000000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=80000000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=80000000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=80000000 op2=7f800000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=ff800000 result=c0490fda.a22 errno=0 +func=atan2f op1=80000000 op2=00000000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=80000000 result=c0490fda.a22 errno=0 +func=atan2f op1=80000000 op2=3f800000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=bf800000 result=c0490fda.a22 errno=0 +; No 
exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2f op1=80000001 op2=3f800000 result=80000001 errno=0 maybestatus=ux + +func=atan2f op1=3f800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=3f800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=3f800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=3f800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=3f800000 op2=7f800000 result=00000000 errno=0 +func=atan2f op1=3f800000 op2=ff800000 result=40490fda.a22 errno=0 +func=atan2f op1=3f800000 op2=00000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=3f800000 op2=80000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=3f800000 op2=3f800000 result=3f490fda.a22 errno=0 +func=atan2f op1=3f800000 op2=bf800000 result=4016cbe3.f99 errno=0 +func=atan2f op1=bf800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=bf800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=bf800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=bf800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=bf800000 op2=7f800000 result=80000000 errno=0 +func=atan2f op1=bf800000 op2=ff800000 result=c0490fda.a22 errno=0 +func=atan2f op1=bf800000 op2=00000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=bf800000 op2=80000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=bf800000 op2=3f800000 result=bf490fda.a22 errno=0 +func=atan2f op1=bf800000 op2=bf800000 result=c016cbe3.f99 errno=0 +func=atan2f op1=8005f16d op2=002bb601 result=be0a60a5.d88 error=0 +func=atan2f op1=80818ec8 op2=80ba5db9 result=c0222eda.f42 error=0 + +func=atan2f op1=ff7fffff op2=ff7fffff result=c016cbe3.f99 errno=0 +func=atan2f op1=bfc00001 op2=7f7fffff result=80300000.700 errno=0 status=u +func=atan2f op1=80800001 op2=40000000 result=80400000.800 errno=0 status=u diff 
--git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanf.tst new file mode 100644 index 000000000000..0a0bfc24c605 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanf.tst @@ -0,0 +1,22 @@ +; atanf.tst +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanf op1=7fc00001 result=7fc00001 errno=0 +func=atanf op1=ffc00001 result=7fc00001 errno=0 +func=atanf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanf op1=7f800000 result=3fc90fda.a22 errno=0 +func=atanf op1=ff800000 result=bfc90fda.a22 errno=0 +func=atanf op1=00000000 result=00000000 errno=0 +func=atanf op1=80000000 result=80000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=atanf op1=3f800000 result=3f490fda.a22 errno=0 +func=atanf op1=bf800000 result=bf490fda.a22 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanh.tst new file mode 100644 index 000000000000..d96ff327fcd9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanh.tst @@ -0,0 +1,22 @@ +; atanh.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atanh op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=3ff00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z +func=atanh op1=bff00000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=atanh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=atanh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=atanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanhf.tst new file mode 100644 index 000000000000..21a68a661a11 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanhf.tst @@ -0,0 +1,23 @@ +; atanhf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanhf op1=7fc00001 result=7fc00001 errno=0 +func=atanhf op1=ffc00001 result=7fc00001 errno=0 +func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z +func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z +func=atanhf op1=00000000 result=00000000 errno=0 +func=atanhf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cbrtf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cbrtf.tst new file mode 100644 index 000000000000..0dd8d09f1d4f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cbrtf.tst @@ -0,0 +1,29 @@ +; cbrtf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cbrtf op1=7f800000 result=7f800000 errno=0 +func=cbrtf op1=ff800000 result=ff800000 errno=0 +func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i +func=cbrtf op1=7fc00001 result=7fc00001 errno=0 +func=cbrtf op1=00000000 result=00000000 errno=0 +func=cbrtf op1=00000001 result=26a14517.cc7 errno=0 +func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0 +func=cbrtf op1=00000003 result=26e89768.579 errno=0 +func=cbrtf op1=00000004 result=27000000.000 errno=0 +func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0 +func=cbrtf op1=00800000 result=2a800000.000 errno=0 +func=cbrtf op1=3f800000 result=3f800000.000 errno=0 +func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0 +func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0 +func=cbrtf op1=80000000 result=80000000 errno=0 +func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0 +func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0 +func=cbrtf op1=80000003 result=a6e89768.579 errno=0 +func=cbrtf op1=80000004 result=a7000000.000 errno=0 +func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0 +func=cbrtf op1=80800000 result=aa800000.000 errno=0 +func=cbrtf op1=bf800000 result=bf800000.000 errno=0 +func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0 +func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cosh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cosh.tst new file mode 100644 index 000000000000..c4efacb7272d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cosh.tst @@ -0,0 +1,15 @@ +; cosh.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=cosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=cosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=cosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=cosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=cosh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=cosh op1=fff00000.00000000 result=7ff00000.00000000 errno=0 +func=cosh op1=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=cosh op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=cosh op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/coshf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/coshf.tst new file mode 100644 index 000000000000..2b967e78f4b4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/coshf.tst @@ -0,0 +1,15 @@ +; coshf.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=coshf op1=7fc00001 result=7fc00001 errno=0 +func=coshf op1=ffc00001 result=7fc00001 errno=0 +func=coshf op1=7f800001 result=7fc00001 errno=0 status=i +func=coshf op1=ff800001 result=7fc00001 errno=0 status=i +func=coshf op1=7f800000 result=7f800000 errno=0 +func=coshf op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=coshf op1=ff800000 result=7f800000 errno=0 +func=coshf op1=ff7fffff result=7f800000 errno=ERANGE status=ox +func=coshf op1=00000000 result=3f800000 errno=0 +func=coshf op1=80000000 result=3f800000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfc.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfc.tst new file mode 100644 index 000000000000..c03fc591da47 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfc.tst @@ -0,0 +1,23 @@ +; erfc.tst - Directed test cases for erfc +; +; Copyright (c) 2022-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=erfc op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=erfc op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erfc op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erfc op1=7ff00000.00000000 result=00000000.00000000 errno=0 +func=erfc op1=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +; We deliberately turned off errno setting in erf, as standard simply +; state that errno `may` be set to ERANGE in case of underflow. +; As a result the following condition on errno cannot be satisfied. 
+; +; func=erfc op1=403b44af.48b01531 result=00000000.00000000 errno=ERANGE status=ux +; +func=erfc op1=c03b44af.48b01531 result=40000000.00000000 errno=0 +func=erfc op1=403bffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=erfc op1=c03bffff.ffffffff result=40000000.00000000 errno=0 +func=erfc op1=fff00000.00000000 result=40000000.00000000 errno=0 +func=erfc op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=erfc op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfcf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfcf.tst new file mode 100644 index 000000000000..719baccb2e45 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfcf.tst @@ -0,0 +1,14 @@ +; erfcf.tst - Directed test cases for erfcf +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erfcf op1=7fc00001 result=7fc00001 errno=0 +func=erfcf op1=ffc00001 result=7fc00001 errno=0 +func=erfcf op1=7f800001 result=7fc00001 errno=0 status=i +func=erfcf op1=ff800001 result=7fc00001 errno=0 status=i +func=erfcf op1=7f800000 result=00000000 errno=0 +func=erfcf op1=7f7fffff result=00000000 errno=ERANGE status=ux +func=erfcf op1=ff800000 result=40000000 errno=0 +func=erfcf op1=00000000 result=3f800000 errno=0 +func=erfcf op1=80000000 result=3f800000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erff.tst similarity index 85% copy from contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst copy to contrib/arm-optimized-routines/pl/math/test/testcases/directed/erff.tst index d05b7b1119c4..9b1d3d5114ae 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erff.tst @@ -1,17 +1,17 
@@ ; erff.tst ; -; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 func=erff op1=7f800001 result=7fc00001 errno=0 status=i func=erff op1=ff800001 result=7fc00001 errno=0 status=i func=erff op1=7f800000 result=3f800000 errno=0 func=erff op1=ff800000 result=bf800000 errno=0 func=erff op1=00000000 result=00000000 errno=ERANGE func=erff op1=80000000 result=80000000 errno=ERANGE func=erff op1=00000001 result=00000001 errno=0 status=ux func=erff op1=80000001 result=80000001 errno=0 status=ux func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1.tst new file mode 100644 index 000000000000..609d6f479721 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1.tst @@ -0,0 +1,21 @@ +; expm1.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=expm1 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=expm1 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=expm1 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=expm1 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=expm1 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=expm1 op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=expm1 op1=ffefffff.ffffffff result=bff00000.00000000 errno=0 +func=expm1 op1=00000000.00000000 result=00000000.00000000 errno=0 +func=expm1 op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=expm1 op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=expm1 op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1f.tst new file mode 100644 index 000000000000..44c38420a617 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1f.tst @@ -0,0 +1,57 @@ +; expm1f.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=expm1f op1=7fc00001 result=7fc00001 errno=0 +func=expm1f op1=ffc00001 result=7fc00001 errno=0 +func=expm1f op1=7f800001 result=7fc00001 errno=0 status=i +func=expm1f op1=ff800001 result=7fc00001 errno=0 status=i +func=expm1f op1=7f800000 result=7f800000 errno=0 +func=expm1f op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=expm1f op1=ff800000 result=bf800000 errno=0 +func=expm1f op1=ff7fffff result=bf800000 errno=0 +func=expm1f op1=00000000 result=00000000 errno=0 +func=expm1f op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. + +func=expm1f op1=00000001 result=00000001 errno=0 maybestatus=ux +func=expm1f op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=expm1f op1=42b145c0 result=7f6ac2dd.9b8 errno=0 + +; Check both sides of the over/underflow thresholds in the code. +func=expm1f op1=c2000000 result=bf7fffff.fff error=0 +func=expm1f op1=c2000001 result=bf7fffff.fff error=0 +func=expm1f op1=43000000 result=7f800000 error=overflow +func=expm1f op1=43000001 result=7f800000 error=overflow +func=expm1f op1=c2a80000 result=bf800000.000 error=0 +func=expm1f op1=c2a80001 result=bf800000.000 error=0 + +; Check values for which exp goes denormal. expm1f should not report +; spurious overflow. 
+func=expm1f op1=c2b00f34 result=bf800000.000 error=0 +func=expm1f op1=c2ce8ed0 result=bf800000.000 error=0 +func=expm1f op1=c2dc6bba result=bf800000.000 error=0 + +; Regression tests for significance loss when the two components of +; the result have opposite sign but similar magnitude +func=expm1f op1=be8516c1 result=be6a652b.0dc error=0 +func=expm1f op1=be851714 result=be6a65ab.0e5 error=0 +func=expm1f op1=be851cc7 result=be6a6e75.111 error=0 +func=expm1f op1=be851d1a result=be6a6ef5.102 error=0 +func=expm1f op1=be851d6d result=be6a6f75.0f2 error=0 +func=expm1f op1=be852065 result=be6a7409.0e4 error=0 +func=expm1f op1=be8520b8 result=be6a7489.0c7 error=0 +func=expm1f op1=be85210b result=be6a7509.0a8 error=0 +func=expm1f op1=be855401 result=be6ac39b.0d5 error=0 +func=expm1f op1=be933307 result=be7fdbf0.d8d error=0 +func=expm1f op1=be92ed6b result=be7f737a.d81 error=0 +func=expm1f op1=be933b90 result=be7fe8be.d76 error=0 +func=expm1f op1=3eb11364 result=3ed38deb.0c0 error=0 +func=expm1f op1=3f28e830 result=3f6f344b.0da error=0 +func=expm1f op1=3eb1578f result=3ed3ee47.13b error=0 +func=expm1f op1=3f50176a result=3fa08e36.fea error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10.tst new file mode 100644 index 000000000000..34831436234a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10.tst @@ -0,0 +1,16 @@ +; log10.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=log10 op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=log10 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log10 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log10 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log10 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10f.tst new file mode 100644 index 000000000000..d5744a66f092 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10f.tst @@ -0,0 +1,69 @@ +; log10f.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log10f op1=7fc00001 result=7fc00001 errno=0 +func=log10f op1=ffc00001 result=7fc00001 errno=0 +func=log10f op1=7f800001 result=7fc00001 errno=0 status=i +func=log10f op1=ff800001 result=7fc00001 errno=0 status=i +func=log10f op1=ff810000 result=7fc00001 errno=0 status=i +func=log10f op1=7f800000 result=7f800000 errno=0 +func=log10f op1=3f800000 result=00000000 errno=0 +func=log10f op1=ff800000 result=7fc00001 errno=EDOM status=i +func=log10f op1=00000000 result=ff800000 errno=ERANGE status=z +func=log10f op1=80000000 result=ff800000 errno=ERANGE status=z +func=log10f op1=80000001 result=7fc00001 errno=EDOM status=i + +; Directed tests for the special-case handling of log10 of things +; very near 1 +func=log10f op1=3f81a618 result=3bb62472.b92 error=0 +func=log10f op1=3f876783 result=3cc811f4.26c error=0 +func=log10f op1=3f816af8 result=3b9cc4c7.057 error=0 +func=log10f op1=3f7bed7d result=bbe432cb.e23 error=0 +func=log10f op1=3f803ece result=3a59ff3a.a84 error=0 +func=log10f op1=3f80089f result=38ef9728.aa6 error=0 +func=log10f op1=3f86ab72 result=3cb4b711.457 error=0 +func=log10f op1=3f780854 result=bc60f953.904 error=0 +func=log10f op1=3f7c6d76 result=bbc7fd01.01c error=0 +func=log10f op1=3f85dff6 result=3c9fa76f.81f error=0 +func=log10f op1=3f7b87f4 result=bbfa9edc.be4 error=0 +func=log10f op1=3f81c710 result=3bc4457b.745 error=0 +func=log10f op1=3f80946d result=3b00a140.c06 error=0 +func=log10f op1=3f7e87ea result=bb23cd70.828 error=0 +func=log10f op1=3f811437 result=3b6ee960.b40 error=0 +func=log10f op1=3f858dcf result=3c971d9b.2ea error=0 +func=log10f op1=3f7f61a3 result=ba89b814.4e0 error=0 +func=log10f op1=3f82d642 result=3c1bfb8d.517 error=0 +func=log10f op1=3f80f3bc result=3b52ebe8.c75 error=0 +func=log10f op1=3f85eff9 result=3ca150d9.7e8 error=0 +func=log10f op1=3f843eb8 result=3c68263f.771 error=0 +func=log10f op1=3f78e691 result=bc481cf4.50a error=0 +func=log10f op1=3f87c56f 
result=3cd1b268.5e6 error=0 +func=log10f op1=3f83b711 result=3c4b94c5.918 error=0 +func=log10f op1=3f823b2b result=3bf5eb02.e2a error=0 +func=log10f op1=3f7f2c4e result=bab82c80.519 error=0 +func=log10f op1=3f83fc92 result=3c5a3ba1.543 error=0 +func=log10f op1=3f793956 result=bc3ee04e.03c error=0 +func=log10f op1=3f839ba5 result=3c45caca.92a error=0 +func=log10f op1=3f862f30 result=3ca7de76.16f error=0 +func=log10f op1=3f832a20 result=3c2dc6e9.afd error=0 +func=log10f op1=3f810296 result=3b5fb92a.429 error=0 +func=log10f op1=3f7e58c9 result=bb38655a.0a4 error=0 +func=log10f op1=3f8362e7 result=3c39cc65.d15 error=0 +func=log10f op1=3f7fdb85 result=b97d9016.40b error=0 +func=log10f op1=3f84484e result=3c6a29f2.f74 error=0 +func=log10f op1=3f861862 result=3ca5819e.f2d error=0 +func=log10f op1=3f7c027b result=bbdf912d.440 error=0 +func=log10f op1=3f867803 result=3caf6744.34d error=0 +func=log10f op1=3f789a89 result=bc509bce.458 error=0 +func=log10f op1=3f8361d9 result=3c399347.379 error=0 +func=log10f op1=3f7d3ac3 result=bb9ad93a.93d error=0 +func=log10f op1=3f7ee241 result=baf8bd12.a62 error=0 +func=log10f op1=3f83a1fd result=3c4721bd.0a4 error=0 +func=log10f op1=3f840da3 result=3c5dd375.675 error=0 +func=log10f op1=3f79c2fe result=bc2f8a60.8c5 error=0 +func=log10f op1=3f854a93 result=3c901cc9.add error=0 +func=log10f op1=3f87a50a result=3cce6125.cd6 error=0 +func=log10f op1=3f818bf5 result=3baaee68.a55 error=0 +func=log10f op1=3f830a44 result=3c2705c4.d87 error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1p.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1p.tst new file mode 100644 index 000000000000..9ee8c62fc9c0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1p.tst @@ -0,0 +1,22 @@ +; log1p.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log1p op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log1p op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +; Cases 6, 9 , 10, 11, 12 fail with certain versions of GLIBC and not others. +; The main reason seems to be the handling of errno and exceptions. + +func=log1p op1=00000000.00000000 result=00000000.00000000 errno=0 +func=log1p op1=80000000.00000000 result=80000000.00000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=log1p op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=log1p op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1pf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1pf.tst new file mode 100644 index 000000000000..aaa01d67c2b3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1pf.tst @@ -0,0 +1,130 @@ +; log1pf.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log1pf op1=7fc00001 result=7fc00001 errno=0 +func=log1pf op1=ffc00001 result=7fc00001 errno=0 +func=log1pf op1=7f800001 result=7fc00001 errno=0 status=i +func=log1pf op1=ff800001 result=7fc00001 errno=0 status=i +func=log1pf op1=ff810000 result=7fc00001 errno=0 status=i +func=log1pf op1=7f800000 result=7f800000 errno=0 + +; Cases 6, 9 , 10, 11, 12 fail with certain versions of GLIBC and not others. 
+; The main reason seems to be the handling of errno and exceptions. + +func=log1pf op1=00000000 result=00000000 errno=0 +func=log1pf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=log1pf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=log1pf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=log1pf op1=3f1e91ee result=3ef6d127.fdb errno=0 +func=log1pf op1=3f201046 result=3ef8a881.fba errno=0 +func=log1pf op1=3f21b916 result=3efab23b.f9f errno=0 +func=log1pf op1=3f21bde6 result=3efab821.fee errno=0 +func=log1pf op1=3f22a5ee result=3efbd435.ff2 errno=0 +func=log1pf op1=3f231b56 result=3efc63b7.e26 errno=0 +func=log1pf op1=3f23ce96 result=3efd3e83.fc8 errno=0 +func=log1pf op1=3eee18c6 result=3ec38576.02e errno=0 +func=log1pf op1=3eee2f41 result=3ec394ce.057 errno=0 +func=log1pf op1=3eee770d result=3ec3c5cc.00c errno=0 +func=log1pf op1=3eee7fed result=3ec3cbda.065 errno=0 +func=log1pf op1=3eee8fb2 result=3ec3d69c.008 errno=0 +func=log1pf op1=3eeeb8eb result=3ec3f2ba.061 errno=0 +func=log1pf op1=3eeeccfd result=3ec4006a.01d errno=0 +func=log1pf op1=3eeef5f0 result=3ec41c56.020 errno=0 +func=log1pf op1=3eeeff12 result=3ec42290.00c errno=0 +func=log1pf op1=3eef05cf result=3ec42728.052 errno=0 +func=log1pf op1=3eef13d3 result=3ec430b6.00e errno=0 +func=log1pf op1=3eef2e70 result=3ec442da.04a errno=0 +func=log1pf op1=3eef3fbf result=3ec44ea6.055 errno=0 +func=log1pf op1=3eef3feb result=3ec44ec4.021 errno=0 +func=log1pf op1=3eef4399 result=3ec45146.011 errno=0 +func=log1pf op1=3eef452e result=3ec4525a.049 errno=0 +func=log1pf op1=3eef4ea9 result=3ec458d0.020 errno=0 +func=log1pf op1=3eef7365 result=3ec471d8.05e errno=0 +func=log1pf op1=3eefa38f result=3ec492a8.003 errno=0 +func=log1pf op1=3eefb1f1 result=3ec49c74.015 errno=0 +func=log1pf op1=3eefb334 result=3ec49d50.023 errno=0 +func=log1pf 
op1=3eefb3c1 result=3ec49db0.0bf errno=0 +func=log1pf op1=3eefb591 result=3ec49eec.15d errno=0 +func=log1pf op1=3eefd736 result=3ec4b5d6.02d errno=0 +func=log1pf op1=3eefd797 result=3ec4b618.114 errno=0 +func=log1pf op1=3eefee5d result=3ec4c59a.071 errno=0 +func=log1pf op1=3eeffff4 result=3ec4d194.0a7 errno=0 +func=log1pf op1=3ef00cd1 result=3ec4da56.025 errno=0 +func=log1pf op1=3ef0163a result=3ec4e0be.07a errno=0 +func=log1pf op1=3ef01e89 result=3ec4e666.007 errno=0 +func=log1pf op1=3ef02004 result=3ec4e768.00a errno=0 +func=log1pf op1=3ef02c40 result=3ec4efbc.017 errno=0 +func=log1pf op1=3ef05b50 result=3ec50fc4.031 errno=0 +func=log1pf op1=3ef05bb1 result=3ec51006.05f errno=0 +func=log1pf op1=3ef0651b result=3ec5166e.0d9 errno=0 +func=log1pf op1=3ef06609 result=3ec51710.02a errno=0 +func=log1pf op1=3ef0666a result=3ec51752.049 errno=0 +func=log1pf op1=3ef0791e result=3ec5240c.0a8 errno=0 +func=log1pf op1=3ef07d46 result=3ec526e0.00e errno=0 +func=log1pf op1=3ef091fd result=3ec534f8.03c errno=0 +func=log1pf op1=3ef09602 result=3ec537b4.128 errno=0 +func=log1pf op1=3ef09848 result=3ec53940.044 errno=0 +func=log1pf op1=3ef0a04f result=3ec53eb6.07d errno=0 +func=log1pf op1=3ef0ab6a result=3ec54644.062 errno=0 +func=log1pf op1=3ef0ae49 result=3ec54838.002 errno=0 +func=log1pf op1=3ef0c1b8 result=3ec55570.000 errno=0 +func=log1pf op1=3ef0ca06 result=3ec55b16.00d errno=0 +func=log1pf op1=3ef0cc29 result=3ec55c8a.095 errno=0 +func=log1pf op1=3ef0d228 result=3ec5609e.04f errno=0 +func=log1pf op1=3ef0d8c0 result=3ec5651a.05e errno=0 +func=log1pf op1=3ef0dc0c result=3ec56758.029 errno=0 +func=log1pf op1=3ef0e0e8 result=3ec56aa6.02e errno=0 +func=log1pf op1=3ef0e502 result=3ec56d70.102 errno=0 +func=log1pf op1=3ef0e754 result=3ec56f04.017 errno=0 +func=log1pf op1=3ef0efe9 result=3ec574da.01c errno=0 +func=log1pf op1=3ef0f309 result=3ec576fa.016 errno=0 +func=log1pf op1=3ef0f499 result=3ec5780a.005 errno=0 +func=log1pf op1=3ef0f6c2 result=3ec57982.083 errno=0 +func=log1pf 
op1=3ef0f852 result=3ec57a92.05d errno=0 +func=log1pf op1=3ef0f9e2 result=3ec57ba2.02e errno=0 +func=log1pf op1=3ef119ee result=3ec5916c.024 errno=0 +func=log1pf op1=3ef11edf result=3ec594c8.03d errno=0 +func=log1pf op1=3ef128c4 result=3ec59b82.001 errno=0 +func=log1pf op1=3ef12ac1 result=3ec59cdc.04b errno=0 +func=log1pf op1=3ef12fea result=3ec5a05e.045 errno=0 +func=log1pf op1=3ef131e7 result=3ec5a1b8.05a errno=0 +func=log1pf op1=3ef134e1 result=3ec5a3be.00e errno=0 +func=log1pf op1=3ef1397a result=3ec5a6de.127 errno=0 +func=log1pf op1=3ef13ade result=3ec5a7d0.0f6 errno=0 +func=log1pf op1=3ef13c0d result=3ec5a89e.054 errno=0 +func=log1pf op1=3ef13d71 result=3ec5a990.016 errno=0 +func=log1pf op1=3ef14074 result=3ec5ab9c.12c errno=0 +func=log1pf op1=3ef146a0 result=3ec5afce.035 errno=0 +func=log1pf op1=3ef14a39 result=3ec5b240.024 errno=0 +func=log1pf op1=3ef14d39 result=3ec5b44a.00c errno=0 +func=log1pf op1=3ef152a3 result=3ec5b7f8.04d errno=0 +func=log1pf op1=3ef170a1 result=3ec5cc5a.021 errno=0 +func=log1pf op1=3ef17855 result=3ec5d196.0dc errno=0 +func=log1pf op1=3ef17ece result=3ec5d5fc.010 errno=0 +func=log1pf op1=3ef1810c result=3ec5d782.08e errno=0 +func=log1pf op1=3ef18da9 result=3ec5e014.0ae errno=0 +func=log1pf op1=3ef19054 result=3ec5e1e4.1a2 errno=0 +func=log1pf op1=3ef190ea result=3ec5e24a.048 errno=0 +func=log1pf op1=3ef1a739 result=3ec5f172.0d8 errno=0 +func=log1pf op1=3ef1a83c result=3ec5f222.018 errno=0 +func=log1pf op1=3ef1bbcc result=3ec5ff6c.09d errno=0 +func=log1pf op1=3ef1bd3c result=3ec60066.03a errno=0 +func=log1pf op1=3ef1d6ee result=3ec611da.056 errno=0 +func=log1pf op1=3ef1de36 result=3ec616cc.01b errno=0 +func=log1pf op1=3ef1e623 result=3ec61c2e.008 errno=0 +func=log1pf op1=3ef1e9b1 result=3ec61e98.029 errno=0 +func=log1pf op1=3ef1ee19 result=3ec62196.0d8 errno=0 +func=log1pf op1=3ef1f13a result=3ec623b6.039 errno=0 +func=log1pf op1=3ef1f1a7 result=3ec62400.091 errno=0 +func=log1pf op1=3ef1f214 result=3ec6244a.0e8 errno=0 +func=log1pf 
op1=3ef206e1 result=3ec6326a.09b errno=0 +func=log1pf op1=3ef21245 result=3ec63a26.012 errno=0 +func=log1pf op1=3ef217fd result=3ec63e08.048 errno=0 +func=log1pf op1=3ef2186a result=3ec63e52.063 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2.tst similarity index 91% copy from contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst copy to contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2.tst index ff1286cbd53e..5d1eb9b877e8 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2.tst @@ -1,21 +1,21 @@ ; Directed test cases for log2 ; -; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; Copyright (c) 2018-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0 func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0 func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0 func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i func=log2 
op1=40000000.00000000 result=3ff00000.00000000 errno=0 func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2f.tst similarity index 91% copy from contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst copy to contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2f.tst index 5832c4f08f1e..4e08110878d6 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2f.tst @@ -1,27 +1,27 @@ ; log2f.tst - Directed test cases for log2f ; -; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; Copyright (c) 2017-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 func=log2f op1=7f800001 result=7fc00001 errno=0 status=i func=log2f op1=ff800001 result=7fc00001 errno=0 status=i func=log2f op1=ff810000 result=7fc00001 errno=0 status=i func=log2f op1=7f800000 result=7f800000 errno=0 func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i func=log2f op1=3f800000 result=00000000 errno=0 func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0 func=log2f op1=3f604189 result=be4394c8.395 error=0 func=log2f op1=3f278034 result=bf1caa73.88e error=0 func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0 func=log2f op1=3e61259a result=c00bdb95.650 error=0 func=log2f op1=3f8147ae result=3c6b3267.d6a error=0 func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0 func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0 func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0 func=log2f op1=40070838 
result=3f89e055.a0a error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinh.tst new file mode 100644 index 000000000000..d6a3da896693 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinh.tst @@ -0,0 +1,21 @@ +; sinh.tst +; +; Copyright (c) 1999-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=sinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=sinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=sinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=sinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=sinh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=sinh op1=fff00000.00000000 result=fff00000.00000000 errno=0 +func=sinh op1=ffefffff.ffffffff result=fff00000.00000000 errno=ERANGE status=ox +func=sinh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=sinh op1=80000000.00000000 result=80000000.00000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=sinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=sinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinhf.tst new file mode 100644 index 000000000000..5f7bd1b04137 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinhf.tst @@ -0,0 +1,21 @@ +; sinhf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=sinhf op1=7fc00001 result=7fc00001 errno=0 +func=sinhf op1=ffc00001 result=7fc00001 errno=0 +func=sinhf op1=7f800001 result=7fc00001 errno=0 status=i +func=sinhf op1=ff800001 result=7fc00001 errno=0 status=i +func=sinhf op1=7f800000 result=7f800000 errno=0 +func=sinhf op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=sinhf op1=ff800000 result=ff800000 errno=0 +func=sinhf op1=ff7fffff result=ff800000 errno=ERANGE status=ox +func=sinhf op1=00000000 result=00000000 errno=0 +func=sinhf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=sinhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=sinhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanf.tst new file mode 100644 index 000000000000..3161f70f4361 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanf.tst @@ -0,0 +1,25 @@ +; tanf.tst +; +; Copyright (c) 2022-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanf op1=7fc00001 result=7fc00001 errno=0 +func=tanf op1=ffc00001 result=7fc00001 errno=0 +func=tanf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=tanf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=tanf op1=00000000 result=00000000 errno=0 +func=tanf op1=80000000 result=80000000 errno=0 +; SDCOMP-26094: check tanf in the cases for which the range reducer +; returns values furthest beyond its nominal upper bound of pi/4. 
+func=tanf op1=46427f1b result=3f80396d.599 error=0 +func=tanf op1=4647e568 result=3f8039a6.c9f error=0 +func=tanf op1=46428bac result=3f803a03.148 error=0 +func=tanf op1=4647f1f9 result=3f803a3c.852 error=0 +func=tanf op1=4647fe8a result=3f803ad2.410 error=0 +func=tanf op1=45d8d7f1 result=bf800669.901 error=0 +func=tanf op1=45d371a4 result=bf800686.3cd error=0 +func=tanf op1=45ce0b57 result=bf8006a2.e9a error=0 +func=tanf op1=45d35882 result=bf80071b.bc4 error=0 +func=tanf op1=45cdf235 result=bf800738.693 error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanh.tst new file mode 100644 index 000000000000..78776e6f3924 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanh.tst @@ -0,0 +1,18 @@ +; tanh.tst +; +; Copyright (c) 1999-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=tanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=tanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=tanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=tanh op1=7ff00000.00000000 result=3ff00000.00000000 errno=0 +func=tanh op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=tanh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=tanh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. 
+func=tanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=tanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanhf.tst new file mode 100644 index 000000000000..603e3107e44f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanhf.tst @@ -0,0 +1,20 @@ +; tanhf.tst +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanhf op1=7fc00001 result=7fc00001 errno=0 +func=tanhf op1=ffc00001 result=7fc00001 errno=0 +func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanhf op1=7f800000 result=3f800000 errno=0 +func=tanhf op1=ff800000 result=bf800000 errno=0 +func=tanhf op1=00000000 result=00000000 errno=0 +func=tanhf op1=80000000 result=80000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +; func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +; func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux +func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/random/double.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/random/double.tst new file mode 100644 index 000000000000..d83283ef7864 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/random/double.tst @@ -0,0 +1,6 @@ +!! double.tst - Random test case specification for DP functions +!! +!! Copyright (c) 1999-2023, Arm Limited. +!! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +test log10 10000 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/random/float.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/random/float.tst new file mode 100644 index 000000000000..fa77efecfabb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/random/float.tst @@ -0,0 +1,8 @@ +!! float.tst - Random test case specification for SP functions +!! +!! Copyright (c) 2022-2023, Arm Limited. +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +test erff 10000 +test log10f 10000 +test tanf 10000 diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h new file mode 100644 index 000000000000..5e3133e1db4c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h @@ -0,0 +1,66 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifdef __vpcs + +#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f) +#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f) +#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f) +#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f) + +#elif __aarch64 + +#define _ZVF1(f) SF1 (f) VF1 (f) +#define _ZVD1(f) SD1 (f) VD1 (f) +#define _ZVF2(f) SF2 (f) VF2 (f) +#define _ZVD2(f) SD2 (f) VD2 (f) + +#elif WANT_VMATH + +#define _ZVF1(f) SF1 (f) +#define _ZVD1(f) SD1 (f) +#define _ZVF2(f) SF2 (f) +#define _ZVD2(f) SD2 (f) + +#else + +#define _ZVF1(f) +#define _ZVD1(f) +#define _ZVF2(f) +#define _ZVD2(f) + +#endif + +#if WANT_SVE_MATH + +#define _ZSVF1(f) SVF1 (f) ZSVF1 (f) +#define _ZSVF2(f) SVF2 (f) ZSVF2 (f) +#define _ZSVD1(f) SVD1 (f) ZSVD1 (f) +#define _ZSVD2(f) SVD2 (f) ZSVD2 (f) + +#else + +#define _ZSVF1(f) +#define _ZSVF2(f) +#define _ZSVD1(f) +#define _ZSVD2(f) + +#endif + +#define _ZSF1(f) F1 (f) +#define _ZSF2(f) F2 (f) +#define _ZSD1(f) D1 (f) +#define _ZSD2(f) D2 
(f) + +#include "ulp_funcs_gen.h" + +#if WANT_SVE_MATH +F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) +F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) +F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) +F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h new file mode 100644 index 000000000000..b682e939054a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h @@ -0,0 +1,148 @@ +// clang-format off +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include + +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { + mpfr_cos(y, x, r); + return mpfr_sin(y, x, r); +} +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { + mpfr_sin(y, x, r); + return mpfr_cos(y, x, r); +} +static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) { + mpfr_t y2; + mpfr_init(y2); + mpfr_trunc(y2, y); + return mpfr_pow(ret, x, y2, rnd); +} +#endif + +/* Our implementations of powi/powk are too imprecise to verify + against any established pow implementation. Instead we have the + following simple implementation, against which it is enough to + maintain bitwise reproducibility. Note the test framework expects + the reference impl to be of higher precision than the function + under test. For instance this means that the reference for + double-precision powi will be passed a long double, so to check + bitwise reproducibility we have to cast it back down to + double. This is fine since a round-trip to higher precision and + back down is correctly rounded. 
*/ +#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ + static DBL_T NAME (DBL_T in_val, DBL_T y) \ + { \ + INT_T n = (INT_T) round (y); \ + FLT_T acc = 1.0; \ + bool want_recip = n < 0; \ + n = n < 0 ? -n : n; \ + \ + for (FLT_T c = in_val; n; c *= c, n >>= 1) \ + { \ + if (n & 0x1) \ + { \ + acc *= c; \ + } \ + } \ + if (want_recip) \ + { \ + acc = 1.0 / acc; \ + } \ + return acc; \ + } + +DECL_POW_INT_REF(ref_powif, double, float, int) +DECL_POW_INT_REF(ref_powi, long double, double, int) + +#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } +#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } +#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; } +#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; } + +#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; } +#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; } +#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; } +#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; } + +#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; } +#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; } +#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } +#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } + +#ifdef __vpcs + +#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) 
VND1_WRAP(func) ZVD1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) + +#elif __aarch64__ + +#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) + +#elif WANT_VMATH + +#define ZVNF1_WRAP(func) VF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) + +#else + +#define ZVNF1_WRAP(func) +#define ZVNF2_WRAP(func) +#define ZVND1_WRAP(func) +#define ZVND2_WRAP(func) + +#endif + +#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } +#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); } +#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); } +#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); } + +#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); } +#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); } +#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } +#define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); } + +#if WANT_SVE_MATH + +#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func) +#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func) +#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func) +#define ZSVND2_WRAP(func) SVD2_WRAP(func) 
ZSVD2_WRAP(func) + +#else + +#define ZSVNF1_WRAP(func) +#define ZSVNF2_WRAP(func) +#define ZSVND1_WRAP(func) +#define ZSVND2_WRAP(func) + +#endif + +/* No wrappers for scalar routines, but PL_SIG will emit them. */ +#define ZSNF1_WRAP(func) +#define ZSNF2_WRAP(func) +#define ZSND1_WRAP(func) +#define ZSND2_WRAP(func) + +#include "ulp_wrappers_gen.h" + +#if WANT_SVE_MATH +static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } +static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } +static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +#endif +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/tools/asinh.sollya b/contrib/arm-optimized-routines/pl/math/tools/asinh.sollya new file mode 100644 index 000000000000..663ee92f3f34 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asinh.sollya @@ -0,0 +1,28 @@ +// polynomial for approximating asinh(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Polynomial is used in [2^-26, 1]. However it is least accurate close to 1, so +// we use 2^-6 as the lower bound for coeff generation, which yields sufficiently +// accurate results in [2^-26, 2^-6]. 
+a = 0x1p-6; +b = 1.0; + +f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2); + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = 0; +for i from 0 to deg do { + i; + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/asinhf.sollya b/contrib/arm-optimized-routines/pl/math/tools/asinhf.sollya new file mode 100644 index 000000000000..ab115b53b8dc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asinhf.sollya @@ -0,0 +1,29 @@ +// polynomial for approximating asinh(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; + +a = 0x1.0p-12; +b = 1.0; + +f = proc(y) { + return asinh(x); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = x; +for i from 2 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/atan.sollya b/contrib/arm-optimized-routines/pl/math/tools/atan.sollya new file mode 100644 index 000000000000..ad4f33b8516a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/atan.sollya @@ -0,0 +1,23 @@ +// polynomial for approximating atan(x) and atan2(y, x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// atan is odd, so approximate with an odd polynomial: +// x + ax^3 + bx^5 + cx^7 + ... +// We generate a, b, c, ... such that we can approximate atan(x) by: +// x + x^3 * (a + bx^2 + cx^4 + ...) 
+ +// Assemble monomials +deg = 20; +mons = [|1,...,deg|]; +for i from 0 to deg-1 do mons[i] = mons[i] * 2 + 1; + +a = 0x1.0p-1022; +b = 1; + +poly = fpminimax(atan(x)-x, mons, [|double ...|], [a;b]); + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg-1 do coeff(poly,mons[i]); diff --git a/contrib/arm-optimized-routines/pl/math/tools/atanf.sollya b/contrib/arm-optimized-routines/pl/math/tools/atanf.sollya new file mode 100644 index 000000000000..ed88d0ba90f9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/atanf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating atanf(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Generate list of monomials: +// Taylor series of atan is of the form x + ax^3 + bx^5 + cx^7 + ... +// So generate a, b, c, ... such that we can approximate atan(x) by: +// x + x^3 * (a + bx^2 + cx^4 + ...) + +deg = 7; + +a = 1.1754943508222875e-38; +b = 1; + +poly = fpminimax((atan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/cbrt.sollya b/contrib/arm-optimized-routines/pl/math/tools/cbrt.sollya new file mode 100644 index 000000000000..1d43dc73d8cd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/cbrt.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in double precision +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), D, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/cbrtf.sollya b/contrib/arm-optimized-routines/pl/math/tools/cbrtf.sollya new file mode 100644 index 000000000000..4e0cc69b46a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/cbrtf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in single precision +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), SG, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya new file mode 100644 index 000000000000..8c40b4b5db6b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya @@ -0,0 +1,23 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; // poly degree + +// interval bounds +a = 0x1.60dfc14636e2ap0; +b = 0x1.d413cccfe779ap0; + +f = proc(y) { + t = y + a; + return erfc(t) * exp(t*t); +}; + +poly = remez(f(x), deg, [0;b-a], 1, 1e-16); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), 52, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya new file mode 100644 index 000000000000..69c683647af7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya @@ -0,0 +1,31 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 15; // poly degree + +// interval bounds +a = 0x1.0p-26; +b = 2; + +f = proc(y) { + return erfc(y) * exp(y*y); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = 0; +for i from 0 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); + print(i); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/expm1.sollya b/contrib/arm-optimized-routines/pl/math/tools/expm1.sollya new file mode 100644 index 000000000000..7b6f324eb247 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/expm1.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating exp(x)-1 in double precision +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; + +a = -log(2)/2; +b = log(2)/2; + +f = proc(y) { + return exp(y)-1; +}; + +poly = fpminimax(f(x), deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), D, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/expm1f.sollya b/contrib/arm-optimized-routines/pl/math/tools/expm1f.sollya new file mode 100644 index 000000000000..efdf1bd301e0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/expm1f.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating exp(x)-1 in single precision +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 5; + +a = -log(2)/2; +b = log(2)/2; + +f = proc(y) { + return exp(y)-1; +}; + +poly = fpminimax(f(x), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), SG, RN); diff --git a/contrib/arm-optimized-routines/math/tools/v_log.sollya b/contrib/arm-optimized-routines/pl/math/tools/log10.sollya similarity index 50% copy from contrib/arm-optimized-routines/math/tools/v_log.sollya copy to contrib/arm-optimized-routines/pl/math/tools/log10.sollya index cc3d2c4ae72a..85d1d15c1698 100644 --- a/contrib/arm-optimized-routines/math/tools/v_log.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/log10.sollya @@ -1,34 +1,44 @@ -// polynomial used for __v_log(x) +// polynomial for approximating log10(1+x) // -// Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree -a = -0x1.fc1p-9; -b = 0x1.009p-8; +// |log10(1+x)| > 0x1p-5 outside the interval +a = -0x1.p-5; +b = 0x1.p-5; -// find log(1+x)/x polynomial with minimal relative error -// (minimal relative error polynomial for log(1+x) is the same * x) +ln10 = evaluate(log(10),0); +invln10hi = double(1/ln10 + 0x1p21) - 0x1p21; // round away last 21 bits +invln10lo = double(1/ln10 - invln10hi); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs -poly = 1; +poly = invln10hi + invln10lo; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; - display = hexadecimal; +print("invln10hi:", invln10hi); +print("invln10lo:", invln10lo); print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); + +display = decimal; +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/math/tools/v_log.sollya b/contrib/arm-optimized-routines/pl/math/tools/log10f.sollya similarity index 66% copy from contrib/arm-optimized-routines/math/tools/v_log.sollya copy to contrib/arm-optimized-routines/pl/math/tools/log10f.sollya index cc3d2c4ae72a..94bf32f2c449 100644 --- a/contrib/arm-optimized-routines/math/tools/v_log.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/log10f.sollya @@ -1,34 +1,37 @@ -// polynomial used for __v_log(x) +// polynomial for approximating log10f(1+x) // -// Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// Copyright (c) 2019-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 6; // poly degree -a = -0x1.fc1p-9; -b = 0x1.009p-8; +// Computation of log10f(1+x) will be carried out in double precision + +deg = 4; // poly degree +// [OFF; 2*OFF] is divided in 2^4 intervals with OFF~0.7 +a = -0.04375; +b = 0.04375; // find log(1+x)/x polynomial with minimal relative error // (minimal relative error polynomial for log(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); -for i from 0 to deg do coeff(poly,i); +for i from 0 to deg do double(coeff(poly,i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/log1p.sollya b/contrib/arm-optimized-routines/pl/math/tools/log1p.sollya new file mode 100644 index 000000000000..598a36af0339 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/log1p.sollya @@ -0,0 +1,30 @@ +// polynomial for approximating log(1+x) in double precision +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 20; + +a = sqrt(2)/2-1; +b = sqrt(2)-1; + +f = proc(y) { + return log(1+y); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = x; +for i from 2 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + + +print("coeffs:"); +display = hexadecimal; +for i from 2 to deg do coeff(poly,i); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/pl/math/tools/log1pf.sollya b/contrib/arm-optimized-routines/pl/math/tools/log1pf.sollya new file mode 100644 index 000000000000..cc1db10e4c0c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/log1pf.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating log(1+x) in single precision +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 10; + +a = -0.25; +b = 0.5; + +f = proc(y) { + return log(1+y); +}; + +poly = fpminimax(f(x), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), SG, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/tan.sollya b/contrib/arm-optimized-routines/pl/math/tools/tan.sollya new file mode 100644 index 000000000000..bb0bb28270e3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/tan.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating double precision tan(x) +// +// Copyright (c) 2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 8; + +// interval bounds +a = 0x1.0p-126; +b = pi / 8; + +display = hexadecimal; + +f = (tan(sqrt(x))-sqrt(x))/x^(3/2); +poly = fpminimax(f, deg, [|double ...|], [a*a;b*b]); + +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/tanf.sollya b/contrib/arm-optimized-routines/pl/math/tools/tanf.sollya new file mode 100644 index 000000000000..f4b49b40ae64 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/tanf.sollya @@ -0,0 +1,78 @@ +// polynomial for approximating single precision tan(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +dtype = single; + +mthd = 0; // approximate tan +deg = 5; // poly degree + +// // Uncomment for cotan +// mthd = 1; // approximate cotan +// deg = 3; // poly degree + +// interval bounds +a = 0x1.0p-126; +b = pi / 4; + +print("Print some useful constants"); +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; + +print("pi/4"); +pi/4; + +// Setup precisions (display and computation) +display = decimal!; +prec=128!; +save_prec=prec; + +// +// Select function to approximate with Sollya +// +if(mthd==0) then { + s = "x + x^3 * P(x^2)"; + g = tan(x); + F = proc(P) { return x + x^3 * P(x^2); }; + f = (g(sqrt(x))-sqrt(x))/(x*sqrt(x)); + init_poly = 0; + // Display info + print("Approximate g(x) =", g, "as F(x)=", s, "."); + poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]); +} +else if (mthd==1) then { + s = "1/x + x * P(x^2)"; + g = 1 / tan(x); + F = proc(P) { return 1/x + x * P(x^2); }; + f = (g(sqrt(x))-1/sqrt(x))/(sqrt(x)); + init_poly = 0; + deg_init_poly = -1; // a value such that we actually start by building constant coefficient + // Display info + print("Approximate 
g(x) =", g, "as F(x)=", s, "."); + // Fpminimax used to minimise absolute error + approx_fpminimax = proc(func, poly, d) { + return fpminimax(func - poly / x^-(deg-d), 0, [|dtype|], [a;b], absolute, floating); + }; + // Optimise all coefficients at once + poly = fpminimax(f, [|0,...,deg|], [|dtype ...|], [a;b], absolute, floating); +}; + + +// +// Display coefficients in Sollya +// +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; +print("_coeffs :_ hex"); +for i from 0 to deg do coeff(poly, i); + +// Compute errors +display = hexadecimal!; +d_rel_err = dirtyinfnorm(1-F(poly)/g(x), [a;b]); +d_abs_err = dirtyinfnorm(g(x)-F(poly), [a;b]); +print("dirty rel error:", d_rel_err); +print("dirty abs error:", d_abs_err); +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_erf.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_erf.sollya new file mode 100644 index 000000000000..394ba377df12 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_erf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating erf(x). +// To generate coefficients for interval i (0 to 47) do: +// $ sollya v_erf.sollya $i +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +scale = 1/8; +deg = 9; + +itv = parse(__argv[0]); +if (itv == 0) then { a = 0x1p-1022; } +else { a = itv * scale; }; + +prec=256; + +poly = fpminimax(erf(scale*x+a), deg, [|D ...|], [0; 1]); + +display = hexadecimal; +for i from 0 to deg do coeff(poly, i); \ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_erfc.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_erfc.sollya new file mode 100644 index 000000000000..3b03ba07863d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_erfc.sollya @@ -0,0 +1,46 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; // poly degree + +itv = parse(__argv[0]); + +bounds = [|3.725290298461914e-9, + 0.18920711500272103, + 0.41421356237309515, + 0.681792830507429, + 1, + 1.378414230005442, + 1.8284271247461903, + 2.363585661014858, + 3, + 3.756828460010884, + 4.656854249492381, + 5.727171322029716, + 7, + 8.513656920021768, + 10.313708498984761, + 12.454342644059432, + 15, + 18.027313840043536, + 21.627416997969522, + 25.908685288118864, + 31|]; + +a = bounds[itv]; +b = bounds[itv + 1]; + +f = proc(y) { + t = y + a; + return erfc(t) * exp(t*t); +}; + +poly = fpminimax(f(x), deg, [|double ...|], [0;b-a]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly, i); diff --git a/contrib/arm-optimized-routines/math/tools/v_log.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_log10.sollya similarity index 74% copy from contrib/arm-optimized-routines/math/tools/v_log.sollya copy to contrib/arm-optimized-routines/pl/math/tools/v_log10.sollya index cc3d2c4ae72a..e2df4364ada0 100644 --- a/contrib/arm-optimized-routines/math/tools/v_log.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/v_log10.sollya @@ -1,34 +1,38 @@ -// polynomial used for __v_log(x) +// polynomial used for __v_log10(x) // -// Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree a = -0x1.fc1p-9; b = 0x1.009p-8; // find log(1+x)/x polynomial with minimal relative error // (minimal relative error polynomial for log(1+x) is the same * x) deg = deg-1; // because of /x // f = log(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs poly = 1; for i from 1 to deg do { p = roundcoefficients(approx(poly,i), [|D ...|]); poly = poly + x^i*coeff(p,0); }; +// scale coefficients by 1/ln(10) +ln10 = evaluate(log(10),0); +poly = poly/ln10; + display = hexadecimal; print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); -for i from 0 to deg do coeff(poly,i); +for i from 0 to deg do double(coeff(poly,i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_log10f.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_log10f.sollya new file mode 100644 index 000000000000..396d5a92302b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_log10f.sollya @@ -0,0 +1,45 @@ +// polynomial for approximating v_log10f(1+x) +// +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; // poly degree +// |log10(1+x)| > 0x1p-4 outside the interval +a = -1/3; +b = 1/3; + +display = hexadecimal; +print("log10(2) = ", single(log10(2))); + +ln10 = evaluate(log(10),0); +invln10 = single(1/ln10); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln10; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; +display = hexadecimal; +print("invln10:", invln10); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do single(coeff(poly,i)); + +display = decimal; +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/math/tools/log2.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_log2f.sollya similarity index 57% copy from contrib/arm-optimized-routines/math/tools/log2.sollya copy to contrib/arm-optimized-routines/pl/math/tools/v_log2f.sollya index 4a364c0f111f..99e050c91b03 100644 --- a/contrib/arm-optimized-routines/math/tools/log2.sollya +++ b/contrib/arm-optimized-routines/pl/math/tools/v_log2f.sollya @@ -1,42 +1,38 @@ -// polynomial for approximating log2(1+x) +// polynomial used for __v_log2f(x) // -// Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -deg = 11; // poly degree -// |log2(1+x)| > 0x1p-4 outside the interval -a = -0x1.5b51p-5; -b = 0x1.6ab2p-5; +deg = 9; // poly degree +a = -1/3; +b = 1/3; ln2 = evaluate(log(2),0); -invln2hi = double(1/ln2 + 0x1p21) - 0x1p21; // round away last 21 bits -invln2lo = double(1/ln2 - invln2hi); +invln2 = single(1/ln2); // find log2(1+x)/x polynomial with minimal relative error // (minimal relative error polynomial for log2(1+x) is the same * x) deg = deg-1; // because of /x -// f = log(1+x)/x; using taylor series +// f = log2(1+x)/x; using taylor series f = 0; for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; -f = f/ln2; +f = f * invln2; // return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| approx = proc(poly,d) { return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); }; // first coeff is fixed, iteratively find optimal double prec coeffs -poly = invln2hi + invln2lo; +poly = invln2; for i from 1 to deg do { - p = roundcoefficients(approx(poly,i), [|D ...|]); + p = roundcoefficients(approx(poly,i), [|SG ...|]); poly = poly + x^i*coeff(p,0); }; display = hexadecimal; -print("invln2hi:", invln2hi); -print("invln2lo:", invln2lo); print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); print("in [",a,b,"]"); print("coeffs:"); for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c new file mode 100644 index 000000000000..22f69d7636e4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c @@ -0,0 +1,51 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 1 +#include "v_log1p_inline.h" + +#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). 
*/ + +#if V_SUPPORTED + +static NOINLINE VPCS_ATTR v_f64_t +special_case (v_f64_t x) +{ + return v_call_f64 (acosh, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision acosh, based on log1p. + The largest observed error is 3.02 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 + want 0x1.f2d6d823bc9e2p-5. */ +VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x) +{ + v_u64_t itop = v_as_u64_f64 (x) >> 52; + v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop)); + + /* Fall back to scalar routine for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + v_f64_t xm1 = x - 1; + v_f64_t u = xm1 * (x + 1); + return log1p_inline (xm1 + v_sqrt_f64 (u)); +} +VPCS_ALIAS + +PL_SIG (V, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (V_NAME (acosh), 2.53) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh)) +PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000) +PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000) +PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000) +PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c new file mode 100644 index 000000000000..2b5aff591a74 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define SignMask 0x80000000 +#define One 0x3f800000 +#define SquareLim 0x5f800000 /* asuint(0x1p64). 
*/ + +#if V_SUPPORTED + +#include "v_log1pf_inline.h" + +static NOINLINE VPCS_ATTR v_f32_t +special_case (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (acoshf, x, y, special); +} + +/* Vector approximation for single-precision acosh, based on log1p. Maximum + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it + is 2.78 ULP: + __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 + want 0x1.ef9ea2p-3. + With exceptions disabled, we can compute u with a shorter dependency chain, + which gives maximum error of 3.07 ULP: + __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 + want 0x1.fbc7f4p-4. */ + +VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One)); + +#if WANT_SIMD_EXCEPT + /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use + only xm1 to calculate u, as operating on x will trigger invalid for NaN. */ + v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1); + v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1); +#else + v_f32_t xm1 = x - 1; + v_f32_t u = xm1 * (x + 1.0f); +#endif + v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u)); + + if (unlikely (v_any_u32 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, acosh, 1.0, 10.0) +#if WANT_SIMD_EXCEPT +PL_TEST_ULP (V_NAME (acoshf), 2.29) +#else +PL_TEST_ULP (V_NAME (acoshf), 2.58) +#endif +PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500) +PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000) +PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000) +PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c new file mode 100644 index 000000000000..fd329b6b7f69 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c @@ -0,0 +1,175 @@ +/* + * Double-precision 
vector asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ +#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ +#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */ +#define AbsMask v_u64 (0x7fffffffffffffff) +#define C(i) v_f64 (__asinh_data.poly[i]) + +/* Constants & data for log. */ +#define OFF 0x3fe6000000000000 +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define A(i) v_f64 (__sv_log_data.poly[i]) +#define T(i) __log_data.tab[i] +#define N (1 << LOG_TABLE_BITS) + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (asinh, x, y, special); +} + +struct entry +{ + v_f64_t invc; + v_f64_t logc; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = T (i).invc; + e.logc = T (i).logc; +#else + e.invc[0] = T (i[0]).invc; + e.logc[0] = T (i[0]).logc; + e.invc[1] = T (i[1]).invc; + e.logc[1] = T (i[1]).logc; +#endif + return e; +} + +static inline v_f64_t +log_inline (v_f64_t x) +{ + /* Double-precision vector log, copied from math/v_log.c with some cosmetic + modification and special-cases removed. See that file for details of the + algorithm used. 
*/ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; + v_u64_t iz = ix - (tmp & 0xfffULL << 52); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); + v_f64_t r2 = r * r; + v_f64_t y = v_fma_f64 (A (3), r, A (2)); + v_f64_t p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + return y; +} + +/* Double-precision implementation of vector asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error 3.29 ULP, in + |x| >= 1: + __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 + want 0x1.ffffcfd0e2352p-1. */ +VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_f64_t ax = v_as_f64_u64 (iax); + v_u64_t top12 = iax >> 52; + + v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); + v_u64_t special = v_cond_u64 (top12 >= HugeBound); + +#if WANT_SIMD_EXCEPT + v_u64_t tiny = v_cond_u64 (top12 < TinyBound); + special |= tiny; +#endif + + /* Option 1: |x| >= 1. + Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. 
*/ + v_f64_t option_1 = v_f64 (0); + if (likely (v_any_u64 (gt1))) + { +#if WANT_SIMD_EXCEPT + v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); +#else + v_f64_t xm = ax; +#endif + option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1)); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: + __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. */ + v_f64_t option_2 = v_f64 (0); + if (likely (v_any_u64 (~gt1))) + { +#if WANT_SIMD_EXCEPT + ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); +#endif + v_f64_t x2 = ax * ax; + v_f64_t z2 = x2 * x2; + v_f64_t z4 = z2 * z2; + v_f64_t z8 = z4 * z4; + v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + option_2 = v_fma_f64 (p, x2 * ax, ax); +#if WANT_SIMD_EXCEPT + option_2 = v_sel_f64 (tiny, x, option_2); +#endif + } + + /* Choose the right option for each lane. */ + v_f64_t y = v_sel_f64 (gt1, option_1, option_2); + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinh), 2.80) +PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. 
*/ +#define V_ASINH_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) +V_ASINH_INTERVAL (0, 0x1p-26, 50000) +V_ASINH_INTERVAL (0x1p-26, 1, 50000) +V_ASINH_INTERVAL (1, 0x1p511, 50000) +V_ASINH_INTERVAL (0x1p511, inf, 40000) +V_ASINH_INTERVAL (-0, -0x1p-26, 50000) +V_ASINH_INTERVAL (-0x1p-26, -1, 50000) +V_ASINH_INTERVAL (-1, -0x1p511, 50000) +V_ASINH_INTERVAL (-0x1p511, -inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c new file mode 100644 index 000000000000..9d8c8a936ae3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c @@ -0,0 +1,70 @@ +/* + * Single-precision vector asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define SignMask v_u32 (0x80000000) +#define One v_f32 (1.0f) +#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */ +#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */ + +#include "v_log1pf_inline.h" + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (asinhf, x, y, special); +} + +/* Single-precision implementation of vector asinh(x), using vector log1p. + Worst-case error is 2.66 ULP, at roughly +/-0.25: + __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ +VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & ~SignMask; + v_u32_t sign = ix & SignMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= BigBound); + +#if WANT_SIMD_EXCEPT + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. 
*/ + special |= v_cond_u32 (iax < TinyBound); + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, One, ax); +#endif + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + v_f32_t d = One + v_sqrt_f32 (ax * ax + One); + v_f32_t y = log1pf_inline (ax + ax * ax / d); + y = v_as_f32_u32 (sign | v_as_u32_f32 (y)); + + if (unlikely (v_any_u32 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinhf), 2.17) +PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c new file mode 100644 index 000000000000..6327fea8eb2c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c @@ -0,0 +1,90 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define SignMask v_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). 
*/ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp) +{ + return v_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u64_t +zeroinfnan (v_u64_t i) +{ + return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of vector atan2. + Maximum observed error is 2.8 ulps: + v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + got 0x1.92d628ab678ccp-1 + want 0x1.92d628ab678cfp-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iy = v_as_u64_f64 (y); + + v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u64_t sign_x = ix & SignMask; + v_u64_t sign_y = iy & SignMask; + v_u64_t sign_xy = sign_x ^ sign_y; + + v_f64_t ax = v_abs_f64 (x); + v_f64_t ay = v_abs_f64 (y); + + v_u64_t pred_xlt0 = x < 0.0; + v_u64_t pred_aygtax = ay > ax; + + /* Set up z for call to atan. */ + v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay); + v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax); + v_f64_t z = v_div_f64 (n, d); + + /* Work out the correct shift. */ + v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0)); + shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift); + shift *= PiOver2; + + v_f64_t ret = eval_poly (z, z, shift); + + /* Account for the sign of x and y. */ + ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy); + + if (unlikely (v_any_u64 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (V, D, 2, atan2) +// TODO tighten this once __v_atan2 is fixed +PL_TEST_ULP (V_NAME (atan2), 2.9) +PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c new file mode 100644 index 000000000000..5d1e6ca4488e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c @@ -0,0 +1,89 @@ +/* + * Single-precision vector atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atanf_common.h" + +/* Useful constants. */ +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u32_t +zeroinfnan (v_u32_t i) +{ + return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. 
*/ +VPCS_ATTR +v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iy = v_as_u32_f32 (y); + + v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u32_t sign_x = ix & SignMask; + v_u32_t sign_y = iy & SignMask; + v_u32_t sign_xy = sign_x ^ sign_y; + + v_f32_t ax = v_abs_f32 (x); + v_f32_t ay = v_abs_f32 (y); + + v_u32_t pred_xlt0 = x < 0.0f; + v_u32_t pred_aygtax = ay > ax; + + /* Set up z for call to atanf. */ + v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); + v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); + v_f32_t z = v_div_f32 (n, d); + + /* Work out the correct shift. */ + v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); + shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); + shift *= PiOver2; + + v_f32_t ret = eval_poly (z, z, shift); + + /* Account for the sign of y. */ + ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (V, F, 2, atan2) +PL_TEST_ULP (V_NAME (atan2f), 2.46) +PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c new file mode 100644 index 000000000000..0f3c2ccf2606 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c @@ -0,0 +1,74 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define AbsMask v_u64 (0x7fffffffffffffff) +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ + +/* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan) (v_f64_t x) +{ + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + +#if WANT_SIMD_EXCEPT + v_u64_t ia12 = (ix >> 52) & 0x7ff; + v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); + v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); + /* Use absolute value only when needed (odd powers of z). */ + v_f64_t az = v_abs_f64 (z); + az = v_sel_f64 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f64_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, atan, -10.0, 10.0) +PL_TEST_ULP (V_NAME (atan), 1.78) +PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000) +PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000) + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c new file mode 100644 index 000000000000..67d90b94f5d3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atanf_common.h" + +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define AbsMask v_u32 (0x7fffffff) +#define TinyBound 0x308 /* top12(asuint(0x1p-30)). */ +#define BigBound 0x4e8 /* top12(asuint(0x1p30)). */ + +#if WANT_SIMD_EXCEPT +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (atanf, x, y, special); +} +#endif + +/* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: + v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +VPCS_ATTR +v_f32_t V_NAME (atanf) (v_f32_t x) +{ + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. 
*/ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t sign = ix & ~AbsMask; + +#if WANT_SIMD_EXCEPT + v_u32_t ia12 = (ix >> 20) & 0x7ff; + v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u32 (special))) + return specialcase (x, x, v_u32 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u32_t red = v_cagt_f32 (x, v_f32 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x); + v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f)); + /* Use absolute value only when needed (odd powers of z). */ + v_f32_t az = v_abs_f32 (z); + az = v_sel_f32 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f32_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, atan, -10.0, 10.0) +PL_TEST_ULP (V_NAME (atanf), 2.5) +PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000) +PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000) +PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000) +PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c new file mode 100644 index 000000000000..bfaf5c2b917f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector atanh(x) function. 
+ * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_horner.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (atanh, x, y, special); +} + +/* Approximation for vector double-precision atanh(x) using modified log1p. + The greatest observed error is 3.31 ULP: + __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ +VPCS_ATTR +v_f64_t V_NAME (atanh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + v_u64_t ia = ix & AbsMask; + v_u64_t special = v_cond_u64 (ia >= One); + v_f64_t halfsign = v_as_f64_u64 (sign | Half); + + /* Mask special lanes with 0 to prevent spurious underflow. 
*/ + v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia)); + v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, atanh, -1.0, 1.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh)) +PL_TEST_ULP (V_NAME (atanh), 3.32) +PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c new file mode 100644 index 000000000000..cd3069661142 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c @@ -0,0 +1,62 @@ +/* + * Single-precision vector atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "v_log1pf_inline.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ + +/* Approximation for vector single-precision atanh(x) using modified log1p. + The maximum error is 3.08 ULP: + __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 + want 0x1.ffcb82p-5. 
*/ +VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_f32_t halfsign + = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix)); + v_u32_t iax = ix & AbsMask; + + v_f32_t ax = v_as_f32_u32 (iax); + +#if WANT_SIMD_EXCEPT + v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); + /* Side-step special cases by setting those lanes to 0, which will trigger no + exceptions. These will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, v_f32 (0), ax); +#else + v_u32_t special = v_cond_u32 (iax >= One); +#endif + + v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u32 (special))) + return v_call_f32 (atanhf, x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (V_NAME (atanhf), 2.59) +PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c new file mode 100644 index 000000000000..d5abe41024bc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c @@ -0,0 +1,98 @@ +/* + * Double-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds v_f64 (0x1.5555555555555p-1) +#define TinyBound 0x001 /* top12 (smallest_normal). */ +#define BigBound 0x7ff /* top12 (infinity). 
*/ +#define MantissaMask v_u64 (0x000fffffffffffff) +#define HalfExp v_u64 (0x3fe0000000000000) + +#define C(i) v_f64 (__cbrt_data.poly[i]) +#define T(i) v_lookup_f64 (__cbrt_data.table, i) + +static NOINLINE v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (cbrt, x, y, special); +} + +/* Approximation for double-precision vector cbrt(x), using low-order polynomial + and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t ia12 = iax >> 52; + + /* Subnormal, +/-0 and special values. */ + v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp)); + v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022; + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + v_f64_t p_01 = v_fma_f64 (C (1), m, C (0)); + v_f64_t p_23 = v_fma_f64 (C (3), m, C (2)); + v_f64_t p = v_fma_f64 (m * m, p_23, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + v_f64_t m_by_3 = m / 3; + v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p)); + a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. 
+ + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s64_t ey = e / 3; + v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2)); + + /* Vector version of ldexp. */ + v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my; + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_TEST_ULP (V_NAME (cbrt), 1.30) +PL_SIG (V, D, 1, cbrt, -10.0, 10.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt)) +PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c new file mode 100644 index 000000000000..62fa37505834 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c @@ -0,0 +1,96 @@ +/* + * Single-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define SignMask v_u32 (0x80000000) +#define TwoThirds v_f32 (0x1.555556p-1f) +#define SmallestNormal 0x00800000 +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +#define C(i) v_f32 (__cbrtf_data.poly[i]) +#define T(i) v_lookup_f32 (__cbrtf_data.table, i) + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (cbrtf, x, y, special); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration with + initial guess obtained by a low-order polynomial. 
Greatest error is 1.5 ULP. + This is observed for every value where the mantissa is 0x1.81410e and the + exponent is a multiple of 3, for example: + __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* Subnormal, +/-0 and special values. */ + v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp); + v_s32_t e = v_as_s32_u32 (iax >> 23) - 126; + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + v_f32_t p_01 = v_fma_f32 (C (1), m, C (0)); + v_f32_t p_23 = v_fma_f32 (C (3), m, C (2)); + v_f32_t p = v_fma_f32 (m * m, p_23, p_01); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + v_f32_t m_by_3 = m / 3; + v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s32_t ey = e / 3; + v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2)); + + /* Vector version of ldexpf. */ + v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my; + /* Copy sign. 
*/ + y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y))); + + if (unlikely (v_any_u32 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (V_NAME (cbrtf), 1.03) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf)) +PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c new file mode 100644 index 000000000000..0a9fbf817a10 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c @@ -0,0 +1,96 @@ +/* + * Double-precision vector cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "v_exp_tail.h" + +#define C1 v_f64 (C1_scal) +#define C2 v_f64 (C2_scal) +#define C3 v_f64 (C3_scal) +#define InvLn2 v_f64 (InvLn2_scal) +#define Ln2hi v_f64 (Ln2hi_scal) +#define Ln2lo v_f64 (Ln2lo_scal) +#define IndexMask v_u64 (IndexMask_scal) +#define Shift v_f64 (Shift_scal) +#define Thres v_f64 (Thres_scal) + +#define AbsMask 0x7fffffffffffffff +#define Half v_f64 (0.5) +#define SpecialBound \ + 0x4086000000000000 /* 0x1.6p9, above which exp overflows. */ + +#if V_SUPPORTED + +static inline v_f64_t +exp_inline (v_f64_t x) +{ + /* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ + + /* n = round(x/(ln2/N)). */ + v_f64_t z = v_fma_f64 (x, InvLn2, Shift); + v_u64_t u = v_as_u64_f64 (z); + v_f64_t n = z - Shift; + + /* r = x - n*ln2/N. */ + v_f64_t r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS); + v_u64_t i = u & IndexMask; + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. 
*/ + v_f64_t y = v_fma_f64 (C3, r, C2); + y = v_fma_f64 (y, r, C1); + y = v_fma_f64 (y, r, v_f64 (1)) * r; + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + v_f64_t s = v_as_f64_u64 (u + e); + + return v_fma_f64 (y, s, s); +} + +/* Approximation for vector double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the same + as the scalar routine, 1.93 ULP: + __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t special = v_cond_u64 (iax > SpecialBound); + + /* If any inputs are special, fall back to scalar for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (cosh, x, x, v_u64 (-1)); + + v_f64_t ax = v_as_f64_u64 (iax); + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + v_f64_t t = exp_inline (ax); + return t * Half + Half / t; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (cosh), 1.43) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh)) +PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000) +PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000) +PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000) +PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c new file mode 100644 index 000000000000..1422d4d12b31 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +#define SpecialBound \ + 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ + special case. */ +#define Half v_f32 (0.5) + +#if V_SUPPORTED + +v_f32_t V_NAME (expf) (v_f32_t); + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.38 ULP: + __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ +VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= SpecialBound); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + + v_u32_t tiny = v_cond_u32 (iax <= TinyBound); + /* If any input is tiny, avoid underflow exception by fixing tiny lanes of + input to 1, which will generate no exceptions, and then also fixing tiny + lanes of output to 1 just before return. */ + if (unlikely (v_any_u32 (tiny))) + ax = v_sel_f32 (tiny, v_f32 (1), ax); +#endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ + v_f32_t t = V_NAME (expf) (ax); + v_f32_t y = t * Half + Half / t; + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (tiny))) + return v_sel_f32 (tiny, v_f32 (1), y); +#else + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, y, special); +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (coshf), 1.89) +PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100) +PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000) +PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100) +PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (V_NAME (coshf), -0x1.5a92d8p+6, -inf, 2000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_2u.c b/contrib/arm-optimized-routines/pl/math/v_erf_2u.c new file mode 100644 index 000000000000..1d7ddbb1ee3e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erf_2u.c @@ -0,0 +1,116 @@ +/* + * Double-precision vector erf(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask v_u64 (0x7fffffffffffffff) +#define AbsXMax v_f64 (0x1.8p+2) +#define Scale v_f64 (0x1p+3) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (erf, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter tables. 
*/ +struct entry +{ + v_f64_t P[V_ERF_NCOEFFS]; + v_f64_t shift; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j < V_ERF_NCOEFFS; ++j) + e.P[j] = __v_erf_data.coeffs[j][i]; + e.shift = __v_erf_data.shifts[i]; +#else + for (int j = 0; j < V_ERF_NCOEFFS; ++j) + { + e.P[j][0] = __v_erf_data.coeffs[j][i[0]]; + e.P[j][1] = __v_erf_data.coeffs[j][i[1]]; + } + e.shift[0] = __v_erf_data.shifts[i[0]]; + e.shift[1] = __v_erf_data.shifts[i[1]]; +#endif + return e; +} + +/* Optimized double precision vector error function erf. Maximum + observed error is 1.75 ULP, in [0.110, 0.111]: + verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4 + want 0x1.fe0ed62a54985p-4. */ +VPCS_ATTR +v_f64_t V_NAME (erf) (v_f64_t x) +{ + /* Handle both inf/nan as well as small values (|x|<2^-28) + If any condition in the lane is true then a loop over + scalar calls will be performed. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t atop = (ix >> 48) & v_u64 (0x7fff); + v_u64_t special_case + = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30)); + + /* Get sign and absolute value. */ + v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask; + v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax); + + /* Compute index by truncating 8 * a with a=|x| saturated to 6.0. */ + +#ifdef SCALAR + v_u64_t i = v_trunc_u64 (a * Scale); +#else + v_u64_t i = vcvtq_n_u64_f64 (a, 3); +#endif + /* Get polynomial coefficients and shift parameter using lookup. */ + struct entry dat = lookup (i); + + /* Evaluate polynomial on transformed argument. 
*/ + v_f64_t z = v_fma_f64 (a, Scale, dat.shift); + + v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]); + v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]); + v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]); + v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]); + v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]); + + v_f64_t z2 = z * z; + v_f64_t y = v_fma_f64 (z2, r5, r4); + y = v_fma_f64 (z2, y, r3); + y = v_fma_f64 (z2, y, r2); + y = v_fma_f64 (z2, y, r1); + + /* y=erf(x) if x>0, -erf(-x) otherwise. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + + if (unlikely (v_any_u64 (special_case))) + return specialcase (x, y, special_case); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (V_NAME (erf), 1.26) +PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_data.c b/contrib/arm-optimized-routines/pl/math/v_erf_data.c new file mode 100644 index 000000000000..7bbb281ad912 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erf_data.c @@ -0,0 +1,119 @@ +/* + * Polynomial coefficients and shifts for double-precision erf(x) vector + * function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for + i=1,...,47 (x_0 = 2^-1022). There is an extra dummy interval for + [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1 + above 6. + + Coefficients for each interval generated using fpminimax algorithm. See + v_erf.sollya for details. 
Note the array is transposed, so for a set of + coefficients C generated on interval i, C[j] is at coeffs[j][i]. */ + +const struct v_erf_data __v_erf_data + = {.shifts + = {-0x1p-1019, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, + -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, + -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, + -39, -40, -41, -42, -43, -44, -45, -46, -47, 0}, + .coeffs = { + // clang-format off + +{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1, + 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1, + 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1, + 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1, + 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1, + 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1, + 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1, + 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0}, + +{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4, + 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6, + 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10, + 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 
0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15, + 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22, + 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31, + 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41, + 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0}, + +{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8, + -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9, + -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12, + -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17, + -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, -0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23, + -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32, + -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42, + -0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0}, + +{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14, + 0x1.b6e85963275c5p-15, 
0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12, + 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14, + 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19, + 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25, + 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33, + 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49, + 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0}, + +{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15, + 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18, + -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18, + -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, -0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22, + -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27, + -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35, + -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41, + -0x1.2aef07c393759p-40, 
-0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0}, + +{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21, + -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21, + -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22, + 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25, + 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30, + 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38, + 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40, + 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 0x1.72eaeac065cc2p-40, 0}, + +{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23, + -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24, + 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28, + -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28, + -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33, + 
-0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43, + -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39, + -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0}, + +{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28, + 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31, + -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31, + -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32, + 0x1.a9515bcf95bb0p-33, 0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37, + 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40, + 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39, + 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0}, + +{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32, + 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32, + -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 
0x1.6ff53581ac827p-34, + 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37, + -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40, + -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41, + -0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40, + -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0}, + +{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35, + -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37, + 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39, + -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43, + 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48, + 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43, + 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42, + 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0} + // clang-format on + }}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c 
b/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c new file mode 100644 index 000000000000..c30635153a20 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c @@ -0,0 +1,168 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "horner.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +/* Accurate exponential (vector variant of exp_dd). */ +v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); + +#define One v_f64 (1.0) +#define AbsMask v_u64 (0x7fffffffffffffff) +#define Scale v_f64 (0x1.0000002p27) + +/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. */ +#define PX __v_erfc_data.poly +#define xint __v_erfc_data.interval_bounds + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter + tables. */ +struct entry +{ + v_f64_t P[ERFC_POLY_ORDER + 1]; + v_f64_t xi; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j <= ERFC_POLY_ORDER; ++j) + e.P[j] = PX[i][j]; + e.xi = xint[i]; +#else + for (int j = 0; j <= ERFC_POLY_ORDER; ++j) + { + e.P[j][0] = PX[i[0]][j]; + e.P[j][1] = PX[i[1]][j]; + } + e.xi[0] = xint[i[0]]; + e.xi[1] = xint[i[1]]; +#endif + return e; +} + +/* Accurate evaluation of exp(x^2) using compensated product + (x^2 ~ x*x + e2) and custom exp(y+d) routine for small + corrections d<> 63) << 62); + /* Use 12-bit for small, nan and inf case detection. */ + atop = (ix >> 52) & 0x7ff; + cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd)); + + struct entry dat; + + /* All entries of the vector are out of bounds, take a short path. 
+ Use smallest possible number above 28 representable in 12 bits. */ + v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); + + /* Use sign to produce either 0 if x > 0, 2 otherwise. */ + if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) + return fac; + + /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ + + v_f64_t a = v_abs_f64 (x); + + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + v_f64_t xp1 = a + v_f64 (1.0); + xp1 = xp1 * xp1; + xp1 = xp1 * xp1; + v_u64_t ixp1 = v_as_u64_f64 (xp1); + i = (ixp1 >> 52) - v_u64 (1023); + + /* Index cannot exceed number of polynomials. */ +#ifdef SCALAR + i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; +#else + i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, + i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; +#endif + /* Get coeffs of i-th polynomial. */ + dat = lookup (i); + + /* Evaluate Polynomial: P(|x|-x_i). */ + z = a - dat.xi; +#define C(i) dat.P[i] + p = HORNER_12 (z, C); + + /* Evaluate Gaussian: exp(-x^2). */ + v_f64_t e = v_eval_gauss (a); + + /* Copy sign. */ + sign = v_as_u64_f64 (x) & ~AbsMask; + p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); + + /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ + y = v_fma_f64 (p, e, fac); + + /* No need to fix value of y if x is out of bound, as + P[ERFC_NUM_INTERVALS]=0. 
*/ + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfc), 3.15) +PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_data.c b/contrib/arm-optimized-routines/pl/math/v_erfc_data.c new file mode 100644 index 000000000000..3c47033c1170 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfc_data.c @@ -0,0 +1,96 @@ +/* + * Polynomial coefficients for double-precision erfc(x) vector function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have + the same bounds as the scalar algorithm, with the exception of the lower + bound of the first interval which is larger. This is because the vector + variants fall back to the scalar for tiny arguments, meaning that we can use + a slightly different approach which is more precise for larger inputs but + unacceptably imprecise for tiny inputs. */ + +const struct v_erfc_data __v_erfc_data = { + +/* Bounds for 20 intervals spanning [0x1.0p-28., 31.]. Interval bounds are a + logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the + exception of the first interval. */ +.interval_bounds = { + 0x1p-28, /* If xmin=2^-28, 0 otherwise. */ + 0x1.837f0518db8a9p-3, /* 0.189. */ + 0x1.a827999fcef32p-2, /* 0.414. */ + 0x1.5d13f32b5a75bp-1, /* 0.682. */ + 0x1.0p0, /* 1.000. */ + 0x1.60dfc14636e2ap0, /* 1.378. */ + 0x1.d413cccfe779ap0, /* 1.828. */ + 0x1.2e89f995ad3adp1, /* 2.364. 
*/ + 0x1.8p1, /* 3.000. */ + 0x1.e0dfc14636e2ap1, /* 3.757. */ + 0x1.2a09e667f3bcdp2, /* 4.657. */ + 0x1.6e89f995ad3adp2, /* 5.727. */ + 0x1.cp2, /* 7.000. */ + 0x1.106fe0a31b715p3, /* 8.514. */ + 0x1.4a09e667f3bcdp3, /* 10.31. */ + 0x1.8e89f995ad3adp3, /* 12.45. */ + 0x1.ep3, /* 15.00. */ + 0x1.206fe0a31b715p4, /* 18.03. */ + 0x1.5a09e667f3bcdp4, /* 21.63. */ + 0x1.9e89f995ad3adp4, /* 25.91. */ + 0x1.fp4 /* 31.00. */ +}, + +/* Generated using fpminimax algorithm on each interval separately. The + polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval + [0;b-a], where [a;b] is the interval in which the input lies. Note this is + slightly different from the scalar polynomial, which approximates + erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details. */ +.poly = { +/* 3.725290298461914e-9 < x < 0.18920711500272103. */ +{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9}, +/* 0.18920711500272103 < x < 0.41421356237309515. */ +{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16}, +/* 0.41421356237309515 < x < 0.681792830507429. */ +{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12}, +/* 0.681792830507429 < x < 1. 
*/ +{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15}, +/* 1 < x < 1.378414230005442. */ +{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18}, +/* 1.378414230005442 < x < 1.8284271247461903. */ +{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20}, +/* 1.8284271247461903 < x < 2.363585661014858. */ +{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22}, +/* 2.363585661014858 < x < 3. */ +{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25}, +/* 3 < x < 3.756828460010884. 
*/ +{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28}, +/* 3.756828460010884 < x < 4.656854249492381. */ +{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31}, +/* 4.656854249492381 < x < 5.727171322029716. */ +{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34}, +/* 5.727171322029716 < x < 7. */ +{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37}, +/* 7 < x < 8.513656920021768. */ +{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41}, +/* 8.513656920021768 < x < 10.313708498984761. 
*/ +{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44}, +/* 10.313708498984761 < x < 12.454342644059432. */ +{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47}, +/* 12.454342644059432 < x < 15. */ +{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51}, +/* 15 < x < 18.027313840043536. */ +{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54}, +/* 18.027313840043536 < x < 21.627416997969522. */ +{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58}, +/* 21.627416997969522 < x < 25.908685288118864. 
*/ +{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61}, +/* 25.908685288118864 < x < 31. */ +{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64}, +/* Dummy interval for x>31 */ +{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, + 0x0p0, 0x0p0, 0x0p0} +} +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c new file mode 100644 index 000000000000..963490d789bd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c @@ -0,0 +1,183 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "erfcf.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)] + +VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); + +static VPCS_ATTR NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (erfcf, x, y, special); +} + +static inline uint32_t +interval_index (uint32_t ia12) +{ + // clang-format off + return (ia12 < 0x400 ? 0 : + (ia12 < 0x408 ? 1 : + (ia12 < 0x410 ? 2 : + 3))); + // clang-format on +} + +/* The C macro wraps the coeffs argument in order to make the + poynomial evaluation more readable. In the scalarised variant the + second pointer is ignored. 
*/ +#ifdef SCALAR +#define C(i) coeff1[i] +#else +#define C(i) ((v_f64_t){coeff1[i], coeff2[i]}) +#endif + +static inline v_f64_t +v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, + const double *coeff2) +{ + v_f64_t x2 = x * x; + v_f64_t x4 = x2 * x2; + v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); + v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); + return poly * gauss; +} + +static inline float +approx_poly_gauss (float abs_x, const double *coeff) +{ + return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); +} + +static v_f32_t +v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes) +{ +#ifdef SCALAR + float y = approx_poly_gauss (abs_x, P (ia12)); + return sign ? 2 - y : y; +#else + float32x2_t lo32 = {0, 0}; + float32x2_t hi32 = {0, 0}; + /* The polynomial and Gaussian components must be calculated in + double precision in order to meet the required ULP error. This + means we have to promote low and high halves of the + single-precision input vector to two separate double-precision + input vectors. This incurs some overhead, and there is also + overhead to loading the polynomial coefficients as this cannot be + done in a vector fashion. This would be wasted effort for + elements which lie in the 'boring' zone, as they will be + overwritten later. Hence we use the lanes parameter to only do + the promotion on a pair of lanes if both of those lanes are + interesting and not special cases. If one lane is inactive, we + use a scalar routine which is shared with the scalar variant. 
*/ + if (lanes[0] & lanes[1]) + { + lo32 = vcvt_f32_f64 ( + v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), + P (ia12[0]), P (ia12[1]))); + } + else if (lanes[0]) + { + lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); + } + else if (lanes[1]) + { + lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); + } + + if (lanes[2] & lanes[3]) + { + hi32 + = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), + P (ia12[2]), P (ia12[3]))); + } + else if (lanes[2]) + { + hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); + } + else if (lanes[3]) + { + hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); + } + + v_f32_t y = vcombine_f32 (lo32, hi32); + + if (v_any_u32 (sign)) + { + y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); + } + + return y; +#endif +} + +/* Optimized single-precision vector complementary error function + erfcf. Max measured error: 0.750092 at various values between + -0x1.06521p-20 and -0x1.add1dap-17. For example: + __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 + +0.249908 ulp err 0.250092. */ +VPCS_ATTR +v_f32_t V_NAME (erfcf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ia = ix & 0x7fffffff; + v_u32_t ia12 = ia >> 20; + v_u32_t sign = ix >> 31; + v_u32_t inf_ia12 = v_u32 (0x7f8); + + v_u32_t special_cases + = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); + v_u32_t in_bounds + = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); + v_f32_t boring_zone = v_as_f32_u32 (sign << 30); + +#ifdef SCALAR + if (unlikely (special_cases)) + { + if (ia12 >= 0x7f8) + return (float) (sign << 1) + 1.0f / x; /* Special cases. */ + else + return 1.0f - x; /* Small case. */ + } + else if (likely (!in_bounds)) + { + return sign ? 
boring_zone : __math_uflowf (boring_zone); + } +#endif + + v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, + in_bounds & ~special_cases); + +#ifndef SCALAR + y = vbslq_f32 (~in_bounds, boring_zone, y); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (x, y, special_cases); + } +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfcf), 0.26) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c new file mode 100644 index 000000000000..3a25cc8751d1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c @@ -0,0 +1,116 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t); + +#define AbsMask v_u32 (0x7fffffff) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + return v_call_f32 (erff, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter tables. 
*/ +struct entry +{ + v_f32_t P[V_ERFF_NCOEFFS]; +}; + +static inline struct entry +lookup (v_u32_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j < V_ERFF_NCOEFFS; ++j) + e.P[j] = __v_erff_data.coeffs[j][i]; +#else + for (int j = 0; j < V_ERFF_NCOEFFS; ++j) + { + e.P[j][0] = __v_erff_data.coeffs[j][i[0]]; + e.P[j][1] = __v_erff_data.coeffs[j][i[1]]; + e.P[j][2] = __v_erff_data.coeffs[j][i[2]]; + e.P[j][3] = __v_erff_data.coeffs[j][i[3]]; + } +#endif + return e; +} + +/* Optimized single precision vector error function erf. + Maximum measured at +/- 0.931, 1.25ULP: + v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1 + want -0x1.9f9c8ap-1. */ +VPCS_ATTR +v_f32_t V_NAME (erff) (v_f32_t x) +{ + /* Handle both inf/nan as well as small values (|x|<2^-28). If any condition + in the lane is true then a loop over scalar calls will be performed. */ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t atop = (ix >> 16) & v_u32 (0x7fff); + v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180)); + + /* Get sign and absolute value. */ + v_u32_t sign = ix & ~AbsMask; + /* |x| < 0.921875. */ + v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f)); + /* |x| > 4.0. */ + v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1)); + + /* Get polynomial coefficients. */ + struct entry dat = lookup (i); + + v_f32_t a = v_abs_f32 (x); + v_f32_t z = v_sel_f32 (red, x * x, a); + + /* Evaluate Polynomial of |x| or x^2. */ + v_f32_t r = dat.P[6]; + r = v_fma_f32 (z, r, dat.P[5]); + r = v_fma_f32 (z, r, dat.P[4]); + r = v_fma_f32 (z, r, dat.P[3]); + r = v_fma_f32 (z, r, dat.P[2]); + r = v_fma_f32 (z, r, dat.P[1]); + r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0])); + r = v_fma_f32 (a, r, a); + + /* y = |x| + |x|*P(|x|) if |x| < 0.921875 + 1 - exp (-(|x|+|x|*P(x^2))) otherwise. 
*/ + v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r)); + + /* Boring domain (absolute value is required to get the sign of erf(-nan) + right). */ + y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y)); + + /* y=erf(x) if x>0, -erf(-x) otherwise. */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (V_NAME (erff), 0.76) +PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_data.c b/contrib/arm-optimized-routines/pl/math/v_erff_data.c new file mode 100644 index 000000000000..73ccb5cbcfa8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erff_data.c @@ -0,0 +1,18 @@ +/* + * Data for approximation of vector erff. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Minimax approximation of erff. */ +const struct v_erff_data __v_erff_data + = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f}, + {0x1.06eba6p-03f, 0x1.450aa0p-1}, + {-0x1.8126e0p-02f, 0x1.b55cb0p-4f}, + {0x1.ce1a46p-04f, -0x1.8d6300p-6f}, + {-0x1.b68bd2p-06f, 0x1.fd1336p-9f}, + {0x1.473f48p-08f, -0x1.91d2ccp-12f}, + {-0x1.3a1a82p-11f, 0x1.222900p-16f}}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail.c new file mode 100644 index 000000000000..fd38aa8ae6ea --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail.c @@ -0,0 +1,75 @@ +/* + * Double-precision vector e^(x+tail) function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "math_config.h" +#if V_SUPPORTED +#include "v_exp_tail.h" + +#define C1 v_f64 (C1_scal) +#define C2 v_f64 (C2_scal) +#define C3 v_f64 (C3_scal) +#define InvLn2 v_f64 (InvLn2_scal) +#define Ln2hi v_f64 (Ln2hi_scal) +#define Ln2lo v_f64 (Ln2lo_scal) + +#define IndexMask v_u64 (IndexMask_scal) +#define Shift v_f64 (Shift_scal) +#define Thres v_f64 (Thres_scal) + +VPCS_ATTR +static v_f64_t +specialcase (v_f64_t s, v_f64_t y, v_f64_t n) +{ + v_f64_t absn = v_abs_f64 (n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); + v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); + v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); + v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); + v_f64_t r1 = s1 * s1; + v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; + return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); +} + +VPCS_ATTR +v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail) +{ + v_f64_t n, r, s, y, z; + v_u64_t cmp, u, e, i; + + cmp = v_cond_u64 (v_abs_f64 (x) > Thres); + + /* n = round(x/(ln2/N)). */ + z = v_fma_f64 (x, InvLn2, Shift); + u = v_as_u64_f64 (z); + n = z - Shift; + + /* r = x - n*ln2/N. */ + r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + e = u << (52 - V_EXP_TAIL_TABLE_BITS); + i = u & IndexMask; + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + y = v_fma_f64 (C3, r, C2); + y = v_fma_f64 (y, r, C1); + y = v_fma_f64 (y, r, v_f64 (1.0)); + y = v_fma_f64 (y, r, xtail); + + /* s = 2^(n/N). 
*/ + u = v_lookup_u64 (Tab, i); + s = v_as_f64_u64 (u + e); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (s, y, n); + return v_fma_f64 (y, s, s); +} +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail.h b/contrib/arm-optimized-routines/pl/math/v_exp_tail.h new file mode 100644 index 000000000000..903f1fd95717 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail.h @@ -0,0 +1,21 @@ +/* + * Constants for double-precision e^(x+tail) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define C1_scal 0x1.fffffffffffd4p-2 +#define C2_scal 0x1.5555571d6b68cp-3 +#define C3_scal 0x1.5555576a59599p-5 +#define InvLn2_scal 0x1.71547652b82fep8 /* N/ln2. */ +#define Ln2hi_scal 0x1.62e42fefa39efp-9 /* ln2/N. */ +#define Ln2lo_scal 0x1.abc9e3b39803f3p-64 + +#define N (1 << V_EXP_TAIL_TABLE_BITS) +#define Tab __v_exp_tail_data +#define IndexMask_scal (N - 1) +#define Shift_scal 0x1.8p+52 +#define Thres_scal 704.0 diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c new file mode 100644 index 000000000000..675eb769bf07 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c @@ -0,0 +1,97 @@ +/* + * Lookup table for double-precision e^(x+tail) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 2^(j/N), j=0..N (where N = 256). 
*/ +const uint64_t __v_exp_tail_data[] + = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 
0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 
0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9}; diff --git a/contrib/arm-optimized-routines/math/v_expf.c b/contrib/arm-optimized-routines/pl/math/v_expf.c similarity index 95% copy from contrib/arm-optimized-routines/math/v_expf.c copy to contrib/arm-optimized-routines/pl/math/v_expf.c index d403e00534f0..a422e69feb62 100644 --- a/contrib/arm-optimized-routines/math/v_expf.c +++ b/contrib/arm-optimized-routines/pl/math/v_expf.c @@ -1,83 +1,83 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "v_math.h" +#include "mathlib.h" #if V_SUPPORTED static const float Poly[] = { /* maxerr: 1.45358 +0.5 ulp. */ 0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, 0x1.fffdb6p-2f, 0x1.ffffecp-1f, }; #define C0 v_f32 (Poly[0]) #define C1 v_f32 (Poly[1]) #define C2 v_f32 (Poly[2]) #define C3 v_f32 (Poly[3]) #define C4 v_f32 (Poly[4]) #define Shift v_f32 (0x1.8p23f) #define InvLn2 v_f32 (0x1.715476p+0f) #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) VPCS_ATTR static v_f32_t specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) { /* 2^n may overflow, break it up into s1*s2. */ v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); v_f32_t s2 = v_as_f32_u32 (e - b); v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); v_u32_t r2 = v_as_u32_f32 (s1 * s1); v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); /* Similar to r1 but avoids double rounding in the subnormal range. */ v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); } VPCS_ATTR v_f32_t V_NAME(expf) (v_f32_t x) { v_f32_t n, r, r2, scale, p, q, poly, absn, z; v_u32_t cmp, e; /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ #if 1 z = v_fma_f32 (x, InvLn2, Shift); n = z - Shift; r = v_fma_f32 (n, -Ln2hi, x); r = v_fma_f32 (n, -Ln2lo, r); e = v_as_u32_f32 (z) << 23; #else z = x * InvLn2; n = v_round_f32 (z); r = v_fma_f32 (n, -Ln2hi, x); r = v_fma_f32 (n, -Ln2lo, r); e = v_as_u32_s32 (v_round_s32 (z)) << 23; #endif scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); absn = v_abs_f32 (n); cmp = v_cond_u32 (absn > v_f32 (126.0f)); r2 = r * r; p = v_fma_f32 (C0, r, C1); q = v_fma_f32 (C2, r, C3); q = v_fma_f32 (p, r2, q); p = C4 * r; poly = v_fma_f32 (q, r2, p); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn, cmp, scale); return v_fma_f32 (poly, scale, scale); } VPCS_ALIAS #endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c new file mode 100644 index 000000000000..4b491d17feef --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c @@ -0,0 +1,113 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define TinyBound \ + 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define SpecialBound \ + 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \ + final stage of the algorithm overflows so fall back to \ + scalar. */ +#define AbsMask 0x7fffffffffffffff +#define One 0x3ff0000000000000 + +#define C(i) v_f64 (__expm1_poly[i]) + +static inline v_f64_t +eval_poly (v_f64_t f, v_f64_t f2) +{ + /* Evaluate custom polynomial using Estrin scheme. 
*/ + v_f64_t p_01 = v_fma_f64 (f, C (1), C (0)); + v_f64_t p_23 = v_fma_f64 (f, C (3), C (2)); + v_f64_t p_45 = v_fma_f64 (f, C (5), C (4)); + v_f64_t p_67 = v_fma_f64 (f, C (7), C (6)); + v_f64_t p_89 = v_fma_f64 (f, C (9), C (8)); + + v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01); + v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45); + v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89); + + v_f64_t f4 = f2 * f2; + v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03); + return v_fma_f64 (f4 * f4, p_8a, p_07); +} + +/* Double-precision vector exp(x) - 1 function. + The maximum observed error is 2.18 ULP: + __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 + want 0x1.a8b9ea8d66e2p-2. */ +VPCS_ATTR +v_f64_t V_NAME (expm1) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ax = ix & AbsMask; + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all lanes if any of them should trigger an exception. */ + v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound)); + if (unlikely (v_any_u64 (special))) + return v_call_f64 (expm1, x, x, v_u64 (-1)); +#else + /* Large input, NaNs and Infs. */ + v_u64_t special + = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000)); +#endif + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + v_f64_t f2 = f * f; + v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f); + + /* Assemble the result. 
+ expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) ~= p * t + (t - 1). */ + v_f64_t y = v_fma_f64 (p, t, t - 1); + +#if !WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (special))) + return v_call_f64 (expm1, x, y, special); +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (V_NAME (expm1), 1.68) +PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000) +PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000) +PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000) +PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100) +PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c new file mode 100644 index 000000000000..ab132427e58d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c @@ -0,0 +1,94 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define One (0x3f800000) +#define SpecialBound \ + (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x) \ + should round to -1. */ +#define TinyBound (0x34000000) /* asuint(0x1p-23). */ + +#define C(i) v_f32 (__expm1f_poly[i]) + +/* Single-precision vector exp(x) - 1 function. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. 
*/ +VPCS_ATTR +v_f32_t V_NAME (expm1f) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ax = ix & AbsMask; + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all lanes if any of them should trigger an exception. */ + v_u32_t special + = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound)); + if (unlikely (v_any_u32 (special))) + return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff)); +#else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0. */ + v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000)); +#endif + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + + v_f32_t p = v_fma_f32 (C (4), f, C (3)); + p = v_fma_f32 (p, f, C (2)); + p = v_fma_f32 (p, f, C (1)); + p = v_fma_f32 (p, f, C (0)); + p = v_fma_f32 (f * f, p, f); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). 
*/ + v_f32_t y = v_fma_f32 (p, t, t - 1); + +#if !WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (special))) + return v_call_f32 (expm1f, x, y, special); +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (V_NAME (expm1f), 1.02) +PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000) +PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h new file mode 100644 index 000000000000..c261941ebed6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h @@ -0,0 +1,49 @@ +/* + * Helper for single-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_EXPM1F_INLINE_H +#define PL_MATH_V_EXPM1F_INLINE_H + +#include "v_math.h" +#include "math_config.h" +#include "estrinf.h" + +#define One 0x3f800000 +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) + +#define C(i) v_f32 (__expm1f_poly[i]) + +static inline v_f32_t +expm1f_inline (v_f32_t x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from v_expm1f_1u6.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). 
+ Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ + v_f32_t f2 = f * f; + v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C); + p = v_fma_f32 (f2, p, f); + + /* t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f32 (p, t, t - 1); +} + +#endif // PL_MATH_V_EXPM1F_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c new file mode 100644 index 000000000000..86d398ca13a9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c @@ -0,0 +1,110 @@ +/* + * Double-precision vector log10(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define A(i) v_f64 (__v_log10_data.poly[i]) +#define T(s, i) __v_log10_data.tab[i].s +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define N (1 << V_LOG10_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) + +struct entry +{ + v_f64_t invc; + v_f64_t log10c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = T (invc, i); + e.log10c = T (log10c, i); +#else + e.invc[0] = T (invc, i[0]); + e.log10c[0] = T (log10c, i[0]); + e.invc[1] = T (invc, i[1]); + e.log10c[1] = T (log10c, i[1]); +#endif + return e; +} + +VPCS_ATTR +inline static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log10, x, y, cmp); +} + +/* Our implementation of v_log10 is a slight modification of v_log (1.660ulps). + Max ULP error: < 2.5 ulp (nearest rounding.) + Maximum measured at 2.46 ulp for x in [0.96, 0.97] + __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6 + -0.459999 ulp err 1.96. 
*/ +VPCS_ATTR +v_f64_t V_NAME (log10) (v_f64_t x) +{ + v_f64_t z, r, r2, p, y, kd, hi; + v_u64_t ix, iz, tmp, top, i, cmp; + v_s64_t k; + struct entry e; + + ix = v_as_u64_f64 (x); + top = ix >> 48; + cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N; + k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + iz = ix - (tmp & v_u64 (0xfffULL << 52)); + z = v_as_f64_u64 (iz); + e = lookup (i); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + kd = v_to_f64_s64 (k); + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in `v_log10_data.c` are computed (in extended precision) as + e.log10c := e.logc * ivln10. */ + v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c); + + /* y = log10(1+r) + n * log10(2). */ + hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ + r2 = r * r; + y = v_fma_f64 (A (3), r, A (2)); + p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log10), 1.97) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10)) +PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000) +PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000) +PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_data.c b/contrib/arm-optimized-routines/pl/math/v_log10_data.c new file mode 100644 index 000000000000..fda85c886963 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10_data.c @@ -0,0 +1,167 @@ +/* + * Lookup table for double-precision log10(x) vector function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_LOG10_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10) + +where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) +and log(c) and 1/c for the ith subinterval comes from a lookup table: + + tab[i].invc = 1/c + tab[i].log10c = (double)log10(c) + +where c is near the center of the subinterval and is chosen by trying several +floating point invc candidates around 1/center and selecting one for which +the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval +that contains 1 and the previous one got tweaked to avoid cancellation. +NB: invc should be optimized to minimize error in (double)log10(c) instead. 
*/ +const struct v_log10_data __v_log10_data + = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, + {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, + {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, + {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, + {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, + {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, + {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, + {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, + {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, + {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, + {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, + {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, + {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, + {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, + {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, + {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, + {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, + {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, + {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, + {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, + {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, + {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, + {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, + {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, + {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, + {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, + {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, + {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, + {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, + {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, + {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, + {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, + {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, + {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, + {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, + {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, + {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, + {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, + {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, + {0x1.29e3b1211b25cp+0, 
-0x1.0d94269d1a30dp-4}, + {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, + {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, + {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, + {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, + {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, + {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, + {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, + {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, + {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, + {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, + {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, + {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, + {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, + {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, + {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, + {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, + {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, + {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, + {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, + {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, + {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, + {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, + {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, + {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, + {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, + {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, + {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, + {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, + {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, + {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, + {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7}, + {0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8}, + {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, + {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, + {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, + {1.0, 0.0}, + {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, + {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, + {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, + {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, + {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, + 
{0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, + {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, + {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, + {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, + {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, + {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, + {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, + {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, + {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, + {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, + {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, + {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, + {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, + {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, + {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, + {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, + {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, + {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, + {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, + {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, + {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, + {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, + {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, + {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, + {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, + {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, + {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, + {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, + {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, + {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, + {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, + {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, + {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, + {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, + {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, + {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, + {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, + {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, + {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, + {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, + {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, + {0x1.767d342f76944p-1, 
0x1.162e761c10d1cp-3}, + {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, + {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, + {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, + {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, + {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}}, + + /* Computed from log coeffs div by log(10) then rounded to double + precision. */ + .poly + = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, + 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4}, + + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2 + +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c new file mode 100644 index 000000000000..e9f7f0346ca2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector log10 function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define P(i) v_f32 (__v_log10f_poly[i]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ +#define InvLn10 v_f32 (0x1.bcb7b2p-2f) +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667. */ + +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log10f, x, y, cmp); +} + +/* Our fast implementation of v_log10f uses a similar approach as v_logf. + With the same offset as v_logf (i.e., 2/3) it delivers about 3.3ulps with + order 9. This is more efficient than using a low order polynomial computed in + double precision. + Maximum error: 3.305ulps (nearest rounding.) + __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492. 
*/ +VPCS_ATTR +v_f32_t V_NAME (log10f) (v_f32_t x) +{ + v_f32_t n, o, p, q, r, r2, y; + v_u32_t u, cmp; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log10(1+r) + n*log10(2). */ + r2 = r * r; + /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 + + r2*(P6+r*P7))). */ + o = v_fma_f32 (P (7), r, P (6)); + p = v_fma_f32 (P (5), r, P (4)); + q = v_fma_f32 (P (3), r, P (2)); + y = v_fma_f32 (P (1), r, P (0)); + p = v_fma_f32 (o, r2, p); + q = v_fma_f32 (p, r2, q); + y = v_fma_f32 (q, r2, y); + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster + but less accurate. */ + p = v_fma_f32 (Ln2, n, r); + y = v_fma_f32 (y, r2, p * InvLn10); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log10f), 2.81) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f)) +PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_data.c b/contrib/arm-optimized-routines/pl/math/v_log10f_data.c new file mode 100644 index 000000000000..537482a92017 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10f_data.c @@ -0,0 +1,13 @@ +/* + * Coefficients for single-precision vector log10 function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const float __v_log10f_poly[] = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. 
*/ + -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, + -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c new file mode 100644 index 000000000000..e48291081ab3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c @@ -0,0 +1,120 @@ +/* + * Double-precision vector log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) +#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) +#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ +#define OneMHfRt2Top \ + 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ + << 32. */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define AbsMask 0x7fffffffffffffff +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +static inline v_f64_t +eval_poly (v_f64_t f) +{ + v_f64_t f2 = f * f; + v_f64_t f4 = f2 * f2; + v_f64_t f8 = f4 * f4; + return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (log1p, x, y, special); +} + +/* Vector log1p approximation using polynomial on reduced interval. Routine is a + modification of the algorithm used in scalar log1p, with no shortcut for k=0 + and no narrowing for f and k. Maximum observed error is 2.46 ULP: + __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 + want 0x1.fd5565fb590f6p+2 . 
*/ +VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ia = ix & AbsMask; + v_u64_t special + = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) + | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (special))) + x = v_sel_f64 (special, v_f64 (0), x); +#endif + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + /* Obtain correctly scaled k by manipulation in the exponent. + The scalar algorithm casts down to 32-bit at this point to calculate k and + u_red. We stay in double-width to obtain f and k, using the same constants + as the scalar algorithm but shifted left by 32. */ + v_f64_t m = x + 1; + v_u64_t mi = v_as_u64_f64 (m); + v_u64_t u = mi + OneMHfRt2Top; + + v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; + v_f64_t k = v_to_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; + v_u64_t u_red = utop | (mi & BottomMask); + v_f64_t f = v_as_f64_u64 (u_red) - 1; + + /* Correction term c/m. */ + v_f64_t cm = (x - (m - 1)) / m; + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. 
*/ + v_f64_t p = eval_poly (f); + + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); + + if (unlikely (v_any_u64 (special))) + return specialcase (v_as_f64_u64 (ix), y, special); + + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (V_NAME (log1p), 1.97) +PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h new file mode 100644 index 000000000000..e5c733964bc0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h @@ -0,0 +1,77 @@ +/* + * Helper for vector double-precision routines which calculate log(1 + x) and do + * not need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef PL_MATH_V_LOG1P_INLINE_H +#define PL_MATH_V_LOG1P_INLINE_H + +#include "v_math.h" +#include "pairwise_horner.h" + +#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) +#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) +#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ +#define OneMHfRt2Top \ + 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ + << 32. */ +#define OneTop 0x3ff +#define BottomMask 0xffffffff +#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). 
*/ + +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +static inline v_f64_t +log1p_inline (v_f64_t x) +{ + /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several + modifications: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using v_sel, for improved accuracy when the argument to log1p is close to + 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in + the source of the caller before including this file. + See v_log1pf_2u1.c for details of the algorithm. */ + v_f64_t m = x + 1; + v_u64_t mi = v_as_u64_f64 (m); + v_u64_t u = mi + OneMHfRt2Top; + + v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop; + v_f64_t k = v_to_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; + v_u64_t u_red = utop | (mi & BottomMask); + v_f64_t f = v_as_f64_u64 (u_red) - 1; + + /* Correction term c/m. */ + v_f64_t cm = (x - (m - 1)) / m; + +#ifndef WANT_V_LOG1P_K0_SHORTCUT +#error \ + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_V_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + v_u64_t k0 = k == 0; + if (unlikely (v_any_u64 (k0))) + { + cm = v_sel_f64 (k0, v_f64 (0), cm); + f = v_sel_f64 (k0, x, f); + } +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + v_f64_t f2 = f * f; + v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
*/ + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + return v_fma_f64 (f2, p, ylo + yhi); +} + +#endif // PL_MATH_V_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c new file mode 100644 index 000000000000..4a7732b403ec --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c @@ -0,0 +1,160 @@ +/* + * Single-precision vector log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define TinyBound 0x340 /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ +#define MinusOne 0xbf800000 +#define Ln2 (0x1.62e43p-1f) +#define Four 0x40800000 +#define ThreeQuarters v_u32 (0x3f400000) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ +#ifdef V_LOG1PF_1U3 + + /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. */ + v_f32_t p = v_fma_f32 (C (8), m, C (7)); + p = v_fma_f32 (p, m, C (6)); + p = v_fma_f32 (p, m, C (5)); + p = v_fma_f32 (p, m, C (4)); + p = v_fma_f32 (p, m, C (3)); + p = v_fma_f32 (p, m, C (2)); + p = v_fma_f32 (p, m, C (1)); + p = v_fma_f32 (p, m, C (0)); + return v_fma_f32 (m, m * p, m); + +#elif defined(V_LOG1PF_2U5) + + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
*/ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); + +#else +#error No precision specified for v_log1pf +#endif +} + +static inline float +handle_special (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000) + { + /* x == -Inf => log1pf(x) = NaN. + x < -1.0 => log1pf(x) = NaN. + x == +/-NaN => log1pf(x) = NaN. */ +#if WANT_SIMD_EXCEPT + return __math_invalidf (asfloat (ia)); +#else + return NAN; +#endif + } + if (ix == 0xbf800000) + { + /* x == -1.0 => log1pf(x) = -Inf. */ +#if WANT_SIMD_EXCEPT + return __math_divzerof (ix); +#else + return -INFINITY; +#endif + } + /* |x| < TinyBound => log1p(x) = x. */ + return x; +} + +/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is + the same as for the scalar algorithm, i.e. worst-case error when using Estrin + is roughly 2.02 ULP: + log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ +VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8); + v_u32_t special_cases + = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound)) + | v_cond_u32 (ix >= MinusOne); + v_f32_t special_arg = x; + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (special_cases))) + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. 
*/ + x = v_sel_f32 (special_cases, v_f32 (1), x); +#endif + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + + v_f32_t m = x + v_f32 (1.0f); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000); + + /* Scale x by exponent manipulation. */ + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k)); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + + /* Evaluate polynomial on the reduced interval. */ + v_f32_t p = eval_poly (m_scale); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f); + + /* Apply the scaling back. 
*/ + v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p); + + if (unlikely (v_any_u32 (special_cases))) + return v_call_f32 (handle_special, special_arg, y, special_cases); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (V_NAME (log1pf), 1.53) +PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h new file mode 100644 index 000000000000..e3048e667c26 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h @@ -0,0 +1,55 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_LOG1PF_INLINE_H +#define PL_MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "math_config.h" + +#define Four 0x40800000 +#define Ln2 v_f32 (0x1.62e43p-1f) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
*/ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); +} + +static inline v_f32_t +log1pf_inline (v_f32_t x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + v_f32_t m = x + 1.0f; + v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) + + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + v_f32_t p = eval_poly (m_scale); + v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; + return v_fma_f32 (scale_back, Ln2, p); +} + +#endif // PL_MATH_V_LOG1PF_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_3u.c b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c new file mode 100644 index 000000000000..fac73f60c600 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) +#define P(i) v_f64 (__v_log2_data.poly[i]) + +struct entry +{ + v_f64_t invc; + v_f64_t log2c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log2_data.tab[i].invc; + e.log2c = __v_log2_data.tab[i].log2c; +#else + e.invc[0] = __v_log2_data.tab[i[0]].invc; + e.log2c[0] = __v_log2_data.tab[i[0]].log2c; + e.invc[1] = __v_log2_data.tab[i[1]].invc; + e.log2c[1] = __v_log2_data.tab[i[1]].log2c; +#endif + return e; +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log2, x, y, cmp); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +VPCS_ATTR +v_f64_t V_NAME (log2) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t top = ix >> 48; + v_u64_t special + = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); + + v_f64_t r2 = r * r; + v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); + v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); + v_f64_t y = v_fma_f64 (P (4), r2, p_23); + y = v_fma_f64 (r2, y, p_01); + y = v_fma_f64 (r2, y, kd + w); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2), 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2)) +PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_data.c b/contrib/arm-optimized-routines/pl/math/v_log2_data.c new file mode 100644 index 000000000000..2a1da6823fbc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2_data.c @@ -0,0 +1,155 @@ +/* + * Coefficients and table entries for vector log2 + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_LOG2_TABLE_BITS) + +// clang-format off + +const struct v_log2_data __v_log2_data = { + +/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6. + Each coefficient was scaled by log2(e) in extended precision and rounded back to + double. */ +.poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, + 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, + +/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was + calculated by scaling log10(c) by log2(10) in extended precision and rounding + back. 
*/ +.tab = { +{ 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, +{ 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, +{ 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, +{ 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, +{ 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, +{ 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, +{ 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, +{ 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, +{ 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, +{ 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, +{ 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, +{ 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, +{ 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, +{ 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, +{ 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, +{ 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, +{ 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, +{ 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, +{ 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, +{ 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, +{ 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, +{ 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, +{ 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, +{ 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, +{ 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, +{ 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, +{ 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, +{ 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, +{ 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, +{ 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, +{ 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, +{ 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, +{ 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, +{ 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, +{ 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, +{ 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, +{ 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, +{ 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, +{ 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, +{ 0x1.29e3b1211b25cp+0, 
-0x1.bfc2b38d315f9p-3 }, +{ 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, +{ 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, +{ 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, +{ 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, +{ 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, +{ 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, +{ 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, +{ 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, +{ 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, +{ 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, +{ 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, +{ 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, +{ 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, +{ 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, +{ 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, +{ 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, +{ 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, +{ 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, +{ 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, +{ 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, +{ 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, +{ 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, +{ 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, +{ 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, +{ 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, +{ 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, +{ 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, +{ 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, +{ 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, +{ 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, +{ 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, +{ 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, +{ 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, +{ 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, +{ 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, +{ 1.0, 0.0 }, +{ 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, +{ 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, +{ 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, +{ 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, +{ 
0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, +{ 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, +{ 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, +{ 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, +{ 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, +{ 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, +{ 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, +{ 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, +{ 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, +{ 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, +{ 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, +{ 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, +{ 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, +{ 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, +{ 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, +{ 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, +{ 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, +{ 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, +{ 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, +{ 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, +{ 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, +{ 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, +{ 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, +{ 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, +{ 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, +{ 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, +{ 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, +{ 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, +{ 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, +{ 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, +{ 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, +{ 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, +{ 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, +{ 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, +{ 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, +{ 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, +{ 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, +{ 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, +{ 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, +{ 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, +{ 0x1.7ad182e54f65ap-1, 
0x1.bd119575364c1p-2 }, +{ 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, +{ 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, +{ 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, +{ 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, +{ 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, +{ 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, +{ 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 }} +}; +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c new file mode 100644 index 000000000000..8f9241bed8e6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_hornerf.h" +#include "pl_sig.h" +#include "pl_test.h" +#if V_SUPPORTED + +#define C(i) v_f32 (__v_log2f_data.poly[i]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ + +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log2f, x, y, cmp); +} + +/* Fast implementation for single precision log2, + relies on same argument reduction as Neon logf. + Maximum error: 2.48 ULPs + __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +VPCS_ATTR +v_f32_t V_NAME (log2f) (v_f32_t x) +{ + v_u32_t u = v_as_u32_f32 (x); + v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log2(1+r) + n. 
*/ + v_f32_t r2 = r * r; + v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C); + v_f32_t y = v_fma_f32 (p, r, n); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2f), 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f)) +PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_data.c b/contrib/arm-optimized-routines/pl/math/v_log2f_data.c new file mode 100644 index 000000000000..b144e8f4992d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2f_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients for vector log2f + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* See tools/v_log2f.sollya for the algorithm used to generate these + coefficients. */ +const struct v_log2f_data __v_log2f_data + = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */ + -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f, + -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}}; diff --git a/contrib/arm-optimized-routines/math/v_math.h b/contrib/arm-optimized-routines/pl/math/v_math.h similarity index 72% copy from contrib/arm-optimized-routines/math/v_math.h copy to contrib/arm-optimized-routines/pl/math/v_math.h index f2cc4670bb9b..a8fa091a7cbf 100644 --- a/contrib/arm-optimized-routines/math/v_math.h +++ b/contrib/arm-optimized-routines/pl/math/v_math.h @@ -1,641 +1,855 @@ /* * Vector math abstractions. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _V_MATH_H #define _V_MATH_H #ifndef WANT_VMATH /* Enable the build of vector math code. */ # define WANT_VMATH 1 #endif #if WANT_VMATH -/* The goal of this header is to allow vector and scalar - build of the same algorithm, the provided intrinsic - wrappers are also vector length agnostic so they can - be implemented for SVE too (or other simd architectures) - and then the code should work on those targets too. */ +/* The goal of this header is to allow vector (only Neon for now) + and scalar build of the same algorithm. */ #if SCALAR #define V_NAME(x) __s_##x #elif VPCS && __aarch64__ #define V_NAME(x) __vn_##x #define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) #else #define V_NAME(x) __v_##x #endif #ifndef VPCS_ATTR #define VPCS_ATTR #endif #ifndef VPCS_ALIAS #define VPCS_ALIAS #endif #include #include "math_config.h" typedef float f32_t; typedef uint32_t u32_t; typedef int32_t s32_t; typedef double f64_t; typedef uint64_t u64_t; typedef int64_t s64_t; /* reinterpret as type1 from type2. 
*/ static inline u32_t as_u32_f32 (f32_t x) { union { f32_t f; u32_t u; } r = {x}; return r.u; } static inline f32_t as_f32_u32 (u32_t x) { union { u32_t u; f32_t f; } r = {x}; return r.f; } static inline s32_t as_s32_u32 (u32_t x) { union { u32_t u; s32_t i; } r = {x}; return r.i; } static inline u32_t as_u32_s32 (s32_t x) { union { s32_t i; u32_t u; } r = {x}; return r.u; } static inline u64_t as_u64_f64 (f64_t x) { union { f64_t f; u64_t u; } r = {x}; return r.u; } static inline f64_t as_f64_u64 (u64_t x) { union { u64_t u; f64_t f; } r = {x}; return r.f; } static inline s64_t as_s64_u64 (u64_t x) { union { u64_t u; s64_t i; } r = {x}; return r.i; } static inline u64_t as_u64_s64 (s64_t x) { union { s64_t i; u64_t u; } r = {x}; return r.u; } #if SCALAR #define V_SUPPORTED 1 typedef f32_t v_f32_t; typedef u32_t v_u32_t; typedef s32_t v_s32_t; typedef f64_t v_f64_t; typedef u64_t v_u64_t; typedef s64_t v_s64_t; static inline int v_lanes32 (void) { return 1; } static inline v_f32_t v_f32 (f32_t x) { return x; } static inline v_u32_t v_u32 (u32_t x) { return x; } static inline v_s32_t v_s32 (s32_t x) { return x; } static inline f32_t v_get_f32 (v_f32_t x, int i) { return x; } static inline u32_t v_get_u32 (v_u32_t x, int i) { return x; } static inline s32_t v_get_s32 (v_s32_t x, int i) { return x; } static inline void v_set_f32 (v_f32_t *x, int i, f32_t v) { *x = v; } static inline void v_set_u32 (v_u32_t *x, int i, u32_t v) { *x = v; } static inline void v_set_s32 (v_s32_t *x, int i, s32_t v) { *x = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u32 (v_u32_t x) { return x != 0; } /* to wrap the result of relational operators. */ static inline v_u32_t v_cond_u32 (v_u32_t x) { return x ? 
-1 : 0; } static inline v_f32_t v_abs_f32 (v_f32_t x) { return __builtin_fabsf (x); } +static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u32_t +v_cagt_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) > fabsf (y); +} +/* to wrap |x| >= |y|. */ +static inline v_u32_t +v_cage_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) >= fabsf (y); +} +static inline v_u32_t +v_calt_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) < fabsf (y); +} +static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return x / y; +} static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return __builtin_fmaf (x, y, z); } static inline v_f32_t v_round_f32 (v_f32_t x) { return __builtin_roundf (x); } static inline v_s32_t v_round_s32 (v_f32_t x) { return __builtin_lroundf (x); /* relies on -fno-math-errno. */ } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return p ? x : y; +} +static inline v_u32_t +v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) +{ + return p ? x : y; +} +static inline v_f32_t +v_sqrt_f32 (v_f32_t x) +{ + return __builtin_sqrtf (x); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) { return x; } +static inline v_s32_t +v_to_s32_f32 (v_f32_t x) +{ + return x; +} static inline v_f32_t v_to_f32_u32 (v_u32_t x) { return x; } /* reinterpret as type1 from type2. 
*/ static inline v_u32_t v_as_u32_f32 (v_f32_t x) { union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; } +static inline v_s32_t +v_as_s32_f32 (v_f32_t x) +{ + union + { + v_f32_t f; + v_s32_t u; + } r = {x}; + return r.u; +} static inline v_f32_t v_as_f32_u32 (v_u32_t x) { union { v_u32_t u; v_f32_t f; } r = {x}; return r.f; } static inline v_s32_t v_as_s32_u32 (v_u32_t x) { union { v_u32_t u; v_s32_t i; } r = {x}; return r.i; } static inline v_u32_t v_as_u32_s32 (v_s32_t x) { union { v_s32_t i; v_u32_t u; } r = {x}; return r.u; } static inline v_f32_t v_lookup_f32 (const f32_t *tab, v_u32_t idx) { return tab[idx]; } static inline v_u32_t v_lookup_u32 (const u32_t *tab, v_u32_t idx) { return tab[idx]; } static inline v_f32_t v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) { return f (x); } static inline v_f32_t v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, v_u32_t p) { return f (x1, x2); } static inline int v_lanes64 (void) { return 1; } static inline v_f64_t v_f64 (f64_t x) { return x; } static inline v_u64_t v_u64 (u64_t x) { return x; } static inline v_s64_t v_s64 (s64_t x) { return x; } static inline f64_t v_get_f64 (v_f64_t x, int i) { return x; } static inline void v_set_f64 (v_f64_t *x, int i, f64_t v) { *x = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) { return x != 0; } +/* true if all elements of a v_cond result is non-zero. */ +static inline int +v_all_u64 (v_u64_t x) +{ + return x; +} /* to wrap the result of relational operators. */ static inline v_u64_t v_cond_u64 (v_u64_t x) { return x ? 
-1 : 0; } static inline v_f64_t v_abs_f64 (v_f64_t x) { return __builtin_fabs (x); } +static inline v_u64_t +v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u64_t +v_cagt_f64 (v_f64_t x, v_f64_t y) +{ + return fabs (x) > fabs (y); +} +static inline v_f64_t +v_div_f64 (v_f64_t x, v_f64_t y) +{ + return x / y; +} static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) { return __builtin_fma (x, y, z); } static inline v_f64_t +v_min_f64(v_f64_t x, v_f64_t y) { + return x < y ? x : y; +} +static inline v_f64_t v_round_f64 (v_f64_t x) { return __builtin_round (x); } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return p ? x : y; +} +static inline v_f64_t +v_sqrt_f64 (v_f64_t x) +{ + return __builtin_sqrt (x); +} static inline v_s64_t v_round_s64 (v_f64_t x) { return __builtin_lround (x); /* relies on -fno-math-errno. */ } +static inline v_u64_t +v_trunc_u64 (v_f64_t x) +{ + return __builtin_trunc (x); +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) { return x; } static inline v_f64_t v_to_f64_u64 (v_u64_t x) { return x; } + +static inline v_s64_t +v_to_s64_f64 (v_f64_t x) +{ + return x; +} /* reinterpret as type1 from type2. 
*/ static inline v_u64_t v_as_u64_f64 (v_f64_t x) { union { v_f64_t f; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_as_f64_u64 (v_u64_t x) { union { v_u64_t u; v_f64_t f; } r = {x}; return r.f; } static inline v_s64_t v_as_s64_u64 (v_u64_t x) { union { v_u64_t u; v_s64_t i; } r = {x}; return r.i; } static inline v_u64_t v_as_u64_s64 (v_s64_t x) { union { v_s64_t i; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_lookup_f64 (const f64_t *tab, v_u64_t idx) { return tab[idx]; } static inline v_u64_t v_lookup_u64 (const u64_t *tab, v_u64_t idx) { return tab[idx]; } static inline v_f64_t v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) { return f (x); } +static inline v_f64_t +v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, + v_u64_t p) +{ + return f (x1, x2); +} #elif __aarch64__ #define V_SUPPORTED 1 #include typedef float32x4_t v_f32_t; typedef uint32x4_t v_u32_t; typedef int32x4_t v_s32_t; typedef float64x2_t v_f64_t; typedef uint64x2_t v_u64_t; typedef int64x2_t v_s64_t; static inline int v_lanes32 (void) { return 4; } static inline v_f32_t v_f32 (f32_t x) { return (v_f32_t){x, x, x, x}; } static inline v_u32_t v_u32 (u32_t x) { return (v_u32_t){x, x, x, x}; } static inline v_s32_t v_s32 (s32_t x) { return (v_s32_t){x, x, x, x}; } static inline f32_t v_get_f32 (v_f32_t x, int i) { return x[i]; } static inline u32_t v_get_u32 (v_u32_t x, int i) { return x[i]; } static inline s32_t v_get_s32 (v_s32_t x, int i) { return x[i]; } static inline void v_set_f32 (v_f32_t *x, int i, f32_t v) { (*x)[i] = v; } static inline void v_set_u32 (v_u32_t *x, int i, u32_t v) { (*x)[i] = v; } static inline void v_set_s32 (v_s32_t *x, int i, s32_t v) { (*x)[i] = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u32 (v_u32_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; } /* to wrap the result of relational operators. 
*/ static inline v_u32_t v_cond_u32 (v_u32_t x) { return x; } static inline v_f32_t v_abs_f32 (v_f32_t x) { return vabsq_f32 (x); } +static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (m, x, y); +} +static inline v_u32_t +v_cagt_f32 (v_f32_t x, v_f32_t y) +{ + return vcagtq_f32 (x, y); +} +/* to wrap |x| >= |y|. */ +static inline v_u32_t +v_cage_f32 (v_f32_t x, v_f32_t y) +{ + return vcageq_f32 (x, y); +} +static inline v_u32_t +v_calt_f32 (v_f32_t x, v_f32_t y) +{ + return vcaltq_f32 (x, y); +} +static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return vdivq_f32 (x, y); +} static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) { return vfmaq_f32 (z, x, y); } static inline v_f32_t v_round_f32 (v_f32_t x) { return vrndaq_f32 (x); } static inline v_s32_t v_round_s32 (v_f32_t x) { return vcvtaq_s32_f32 (x); } +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return vbslq_f32 (p, x, y); +} +static inline v_u32_t +v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (p, x, y); +} +static inline v_f32_t +v_sqrt_f32 (v_f32_t x) +{ + return vsqrtq_f32 (x); +} /* convert to type1 from type2. */ static inline v_f32_t v_to_f32_s32 (v_s32_t x) { return (v_f32_t){x[0], x[1], x[2], x[3]}; } +static inline v_s32_t +v_to_s32_f32 (v_f32_t x) +{ + return vcvtq_s32_f32 (x); +} static inline v_f32_t v_to_f32_u32 (v_u32_t x) { return (v_f32_t){x[0], x[1], x[2], x[3]}; } /* reinterpret as type1 from type2. 
*/ static inline v_u32_t v_as_u32_f32 (v_f32_t x) { union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; } +static inline v_s32_t +v_as_s32_f32 (v_f32_t x) +{ + union + { + v_f32_t f; + v_s32_t u; + } r = {x}; + return r.u; +} static inline v_f32_t v_as_f32_u32 (v_u32_t x) { union { v_u32_t u; v_f32_t f; } r = {x}; return r.f; } static inline v_s32_t v_as_s32_u32 (v_u32_t x) { union { v_u32_t u; v_s32_t i; } r = {x}; return r.i; } static inline v_u32_t v_as_u32_s32 (v_s32_t x) { union { v_s32_t i; v_u32_t u; } r = {x}; return r.u; } static inline v_f32_t v_lookup_f32 (const f32_t *tab, v_u32_t idx) { return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; } static inline v_u32_t v_lookup_u32 (const u32_t *tab, v_u32_t idx) { return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; } static inline v_f32_t v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) { return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; } static inline v_f32_t v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, v_u32_t p) { return ( v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; } static inline int v_lanes64 (void) { return 2; } static inline v_f64_t v_f64 (f64_t x) { return (v_f64_t){x, x}; } static inline v_u64_t v_u64 (u64_t x) { return (v_u64_t){x, x}; } static inline v_s64_t v_s64 (s64_t x) { return (v_s64_t){x, x}; } static inline f64_t v_get_f64 (v_f64_t x, int i) { return x[i]; } static inline void v_set_f64 (v_f64_t *x, int i, f64_t v) { (*x)[i] = v; } /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (x) != 0; } +/* true if all elements of a v_cond result is 1. */ +static inline int +v_all_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. 
*/ + return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; +} /* to wrap the result of relational operators. */ static inline v_u64_t v_cond_u64 (v_u64_t x) { return x; } static inline v_f64_t v_abs_f64 (v_f64_t x) { return vabsq_f64 (x); } +static inline v_u64_t +v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) +{ + return vbslq_u64 (m, x, y); +} +static inline v_u64_t +v_cagt_f64 (v_f64_t x, v_f64_t y) +{ + return vcagtq_f64 (x, y); +} +static inline v_f64_t +v_div_f64 (v_f64_t x, v_f64_t y) +{ + return vdivq_f64 (x, y); +} static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) { return vfmaq_f64 (z, x, y); } static inline v_f64_t +v_min_f64(v_f64_t x, v_f64_t y) { + return vminq_f64(x, y); +} +static inline v_f64_t v_round_f64 (v_f64_t x) { return vrndaq_f64 (x); } +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return vbslq_f64 (p, x, y); +} +static inline v_f64_t +v_sqrt_f64 (v_f64_t x) +{ + return vsqrtq_f64 (x); +} static inline v_s64_t v_round_s64 (v_f64_t x) { return vcvtaq_s64_f64 (x); } +static inline v_u64_t +v_trunc_u64 (v_f64_t x) +{ + return vcvtq_u64_f64 (x); +} /* convert to type1 from type2. */ static inline v_f64_t v_to_f64_s64 (v_s64_t x) { return (v_f64_t){x[0], x[1]}; } static inline v_f64_t v_to_f64_u64 (v_u64_t x) { return (v_f64_t){x[0], x[1]}; } +static inline v_s64_t +v_to_s64_f64 (v_f64_t x) +{ + return vcvtq_s64_f64 (x); +} /* reinterpret as type1 from type2. 
*/ static inline v_u64_t v_as_u64_f64 (v_f64_t x) { union { v_f64_t f; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_as_f64_u64 (v_u64_t x) { union { v_u64_t u; v_f64_t f; } r = {x}; return r.f; } static inline v_s64_t v_as_s64_u64 (v_u64_t x) { union { v_u64_t u; v_s64_t i; } r = {x}; return r.i; } static inline v_u64_t v_as_u64_s64 (v_s64_t x) { union { v_s64_t i; v_u64_t u; } r = {x}; return r.u; } static inline v_f64_t v_lookup_f64 (const f64_t *tab, v_u64_t idx) { return (v_f64_t){tab[idx[0]], tab[idx[1]]}; } static inline v_u64_t v_lookup_u64 (const u64_t *tab, v_u64_t idx) { return (v_u64_t){tab[idx[0]], tab[idx[1]]}; } static inline v_f64_t v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) { return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; } +static inline v_f64_t +v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, + v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1]}; +} #endif #endif #endif diff --git a/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c new file mode 100644 index 000000000000..57ec66ecc282 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define BigBound \ + 0x4080000000000000 /* 2^9. expm1 helper overflows for large input. */ +#define TinyBound \ + 0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x. 
*/ +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define One 0x3ff0000000000000 +#define C(i) v_f64 (__expm1_poly[i]) + +#if V_SUPPORTED + +static inline v_f64_t +expm1_inline (v_f64_t x) +{ + /* Reduce argument: + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where i = round(x / ln2) + and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + /* Approximate expm1(f) using polynomial. */ + v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4; + v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f); + /* t = 2^i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f64 (p, t, t - 1); +} + +static NOINLINE VPCS_ATTR v_f64_t +special_case (v_f64_t x) +{ + return v_call_f64 (sinh, x, x, v_u64 (-1)); +} + +/* Approximation for vector double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. */ +VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_f64_t ax = v_as_f64_u64 (iax); + v_u64_t sign = ix & ~AbsMask; + v_f64_t halfsign = v_as_f64_u64 (sign | Half); + +#if WANT_SIMD_EXCEPT + v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound)); +#else + v_u64_t special = v_cond_u64 (iax >= BigBound); +#endif + + /* Fall back to scalar variant for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. 
This allows us to + retain acceptable accuracy for very small inputs. */ + v_f64_t t = expm1_inline (ax); + return (t + t / (t + 1)) * halfsign; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (sinh), 2.08) +PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000) +PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000) +PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c new file mode 100644 index 000000000000..49cf078d0651 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c @@ -0,0 +1,69 @@ +/* + * Single-precision vector sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "v_expm1f_inline.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define BigBound \ + 0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows. */ +#define TinyBound \ + 0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows. */ + +static NOINLINE VPCS_ATTR v_f32_t +special_case (v_f32_t x) +{ + return v_call_f32 (sinhf, x, x, v_u32 (-1)); +} + +/* Approximation for vector single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. 
*/ +VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t sign = ix & ~AbsMask; + v_f32_t halfsign = v_as_f32_u32 (sign | Half); + +#if WANT_SIMD_EXCEPT + v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound)); +#else + v_u32_t special = v_cond_u32 (iax >= BigBound); +#endif + + /* Fall back to the scalar variant for all lanes if any of them should trigger + an exception. */ + if (unlikely (v_any_u32 (special))) + return special_case (x); + + /* Up to the point that expm1f overflows, we can use it to calculate sinhf + using a slight rearrangement of the definition of asinh. This allows us to + retain acceptable accuracy for very small inputs. */ + v_f32_t t = expm1f_inline (ax); + return (t + t / (t + 1)) * halfsign; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (sinhf), 1.76) +PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000) +PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000) +PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c new file mode 100644 index 000000000000..f87baccc4fd7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c @@ -0,0 +1,102 @@ +/* + * Double-precision vector tan(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi) +#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo) +#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1) +#define Shift v_f64 (0x1.8p52) +#define AbsMask 0x7fffffffffffffff +#define RangeVal 0x4160000000000000 /* asuint64(2^23). */ +#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ +#define C(i) v_f64 (__v_tan_data.poly[i]) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x) +{ + return v_call_f64 (tan, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +VPCS_ATTR +v_f64_t V_NAME (tan) (v_f64_t x) +{ + v_u64_t iax = v_as_u64_f64 (x) & AbsMask; + + /* Our argument reduction cannot calculate q with sufficient accuracy for very + large inputs. Fall back to scalar routine for all lanes if any are too + large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny + input to avoid underflow. Note pl does not supply a scalar double-precision + tan, so the fallback will be statically linked from the system libm. */ +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound))) +#else + if (unlikely (v_any_u64 (iax > RangeVal))) +#endif + return specialcase (x); + + /* q = nearest integer to 2 * x / pi. */ + v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift; + v_s64_t qi = v_to_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + v_f64_t r = x; + r = v_fma_f64 (q, MHalfPiHi, r); + r = v_fma_f64 (q, MHalfPiLo, r); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. 
*/ + r = r * 0.5; + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: + tan(x) ~= x + C0 * x^3 + C1 * x^5 + C3 * x^7 + ... + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ + v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4; + /* Use offset version of Estrin wrapper to evaluate from C1 onwards. */ + v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1); + p = v_fma_f64 (p, r2, C (0)); + p = v_fma_f64 (r2, p * r, r); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */ + v_f64_t n = v_fma_f64 (p, p, v_f64 (-1)); + v_f64_t d = p * 2; + + v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0); + + return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d); +} +VPCS_ALIAS + +PL_SIG (V, D, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tan), 2.99) +PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000) +PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000) +PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000) +PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000) +PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000) +PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_data.c b/contrib/arm-optimized-routines/pl/math/v_tan_data.c new file mode 100644 index 000000000000..04e25169bd88 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tan_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and helpers for double-precision vector tan(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const struct v_tan_data __v_tan_data + = {.neg_half_pi_hi = -0x1.921fb54442d18p0, + .neg_half_pi_lo = -0x1.1a62633145c07p-54, + .poly + = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, + 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, + 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c new file mode 100644 index 000000000000..828466b03182 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c @@ -0,0 +1,131 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrinf.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +/* Constants. */ +#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (v_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (v_f32 (0x1.ee59dap-50f)) +#define InvPio2 (v_f32 (0x1.45f306p-1f)) +#define RangeVal (0x47000000) /* asuint32(0x1p15f). */ +#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */ +#define Shift (v_f32 (0x1.8p+23f)) +#define AbsMask (v_u32 (0x7fffffff)) + +#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i]) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + return v_call_f32 (tanf, x, y, cmp); +} + +/* Use a full Estrin scheme to evaluate polynomial. */ +static inline v_f32_t +eval_poly (v_f32_t z) +{ + v_f32_t z2 = z * z; +#if WANT_SIMD_EXCEPT + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions + are to be triggered correctly, sidestep this by fixing such lanes to 0. 
*/ + v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound); + if (unlikely (v_any_u32 (will_uflow))) + z2 = v_sel_f32 (will_uflow, v_f32 (0), z2); +#endif + v_f32_t z4 = z2 * z2; + return ESTRIN_5 (z, z2, z4, poly); +} + +/* Fast implementation of Neon tanf. + Maximum error is 3.45 ULP: + __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +VPCS_ATTR +v_f32_t V_NAME (tanf) (v_f32_t x) +{ + v_f32_t special_arg = x; + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast + regression. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, also special-case tiny + input, as this will load to overflow later. Fix any special lanes to 1 to + prevent any exceptions being triggered. */ + v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound); + if (unlikely (v_any_u32 (special))) + x = v_sel_f32 (special, v_f32 (1.0f), x); +#else + /* Otherwise, special-case large and special values. */ + v_u32_t special = v_cond_u32 (iax >= RangeVal); +#endif + + /* n = rint(x/(pi/2)). */ + v_f32_t q = v_fma_f32 (InvPio2, x, Shift); + v_f32_t n = q - Shift; + /* n is representable as a signed integer, simply convert it. */ + v_s32_t in = v_round_s32 (n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + v_s32_t alt = in & 1; + v_u32_t pred_alt = (alt != 0); + + /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ + v_f32_t r; + r = v_fma_f32 (NegPio2_1, n, x); + r = v_fma_f32 (NegPio2_2, n, r); + r = v_fma_f32 (NegPio2_3, n, r); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. 
*/ + + /* Perform additional reduction if required. */ + v_f32_t z = v_sel_f32 (pred_alt, -r, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + v_f32_t z2 = r * r; + v_f32_t p = eval_poly (z2); + v_f32_t y = v_fma_f32 (z * z2, p, z); + + /* Compute reciprocal and apply if required. */ + v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y); + y = v_sel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. */ + y = v_sel_f32 (x == v_f32 (-0.0), x, y); + + if (unlikely (v_any_u32 (special))) + return specialcase (special_arg, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tanf), 2.96) +PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c new file mode 100644 index 000000000000..c8b6c251d453 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector tanh(x) function. + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask v_u64 (0x7fffffffffffffff) +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define C(i) v_f64 (__expm1_poly[i]) + +#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ +#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ +#define One v_u64 (0x3ff0000000000000) + +static inline v_f64_t +expm1_inline (v_f64_t x) +{ + /* Helper routine for calculating exp(x) - 1. Vector port of the helper from + the scalar variant of tanh. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. */ + v_f64_t f2 = f * f; + v_f64_t f4 = f2 * f2; + v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* t = 2 ^ i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) = p * t + (t - 1). */ + return v_fma_f64 (p, t, t - 1); +} + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (tanh, x, y, special); +} + +/* Vector approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.75 ULP: + __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 + want -0x1.ba31ba4691ab4p-3. */ +VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ia = ix & AbsMask; + + /* Trigger special-cases for tiny, boring and infinity/NaN. 
*/ + v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound)); + v_f64_t u; + + /* To trigger fp exceptions correctly, set special lanes to a neutral value. + They will be fixed up later by the special-case handler. */ + if (unlikely (v_any_u64 (special))) + u = v_sel_f64 (special, v_f64 (1), x) * 2; + else + u = x * 2; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + v_f64_t q = expm1_inline (u); + v_f64_t y = q / (q + 2); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (tanh), 2.26) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh)) +PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000) +PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000) +PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c new file mode 100644 index 000000000000..36166118c0f0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c @@ -0,0 +1,69 @@ +/* + * Single-precision vector tanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "v_expm1f_inline.h" + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff + +static NOINLINE v_f32_t +special_case (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision vector tanh(x), using a simplified version + of expm1f. 
The maximum error is 2.58 ULP: + __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_u32_t sign = ix & ~AbsMask; + v_u32_t is_boring = v_cond_u32 (iax > BoringBound); + v_f32_t boring = v_as_f32_u32 (sign | One); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered properly, set all special and boring + lanes to 1, which will trigger no exceptions, and fix them up later. */ + v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000)); + ix = v_sel_u32 (is_boring, v_u32 (One), ix); + if (unlikely (v_any_u32 (special))) + ix = v_sel_u32 (special, v_u32 (One), ix); +#else + v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0)); +#endif + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix)); + v_f32_t y = q / (q + 2); + y = v_sel_f32 (is_boring, boring, y); + if (unlikely (v_any_u32 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (tanhf), 2.09) +PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000) +PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100) +PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c new file mode 100644 index 000000000000..649735b140f3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_acosh. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh) +#include "v_acosh_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c new file mode 100644 index 000000000000..8c5f106992a7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_acoshf. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf) +#include "v_acoshf_3u1.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c new file mode 100644 index 000000000000..0d2373b5e4b2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) +#include "v_asinh_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c new file mode 100644 index 000000000000..6c8927f0875b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinhf. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf) +#include "v_asinhf_2u7.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c new file mode 100644 index 000000000000..925b5b4ef324 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2) +#include "v_atan2_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c new file mode 100644 index 000000000000..51d33d50f6ef --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2f. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f) +#include "v_atan2f_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c new file mode 100644 index 000000000000..ccebce2dc2ed --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan) +#include "v_atan_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c new file mode 100644 index 000000000000..b8797276d981 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanf. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf) +#include "v_atanf_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c new file mode 100644 index 000000000000..19429b209b3a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh) +#include "v_atanh_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c new file mode 100644 index 000000000000..7de226dda054 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanhf. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf) +#include "v_atanhf_3u1.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c new file mode 100644 index 000000000000..4cb0dc8cefb5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrt. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt) +#include "v_cbrt_2u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c new file mode 100644 index 000000000000..40a72d8c301e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrtf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf) +#include "v_cbrtf_1u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c new file mode 100644 index 000000000000..9bf7f026447a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cosh. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh) +#include "v_cosh_2u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c new file mode 100644 index 000000000000..b149cb34df61 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_coshf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf) +#include "v_coshf_2u4.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c b/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c new file mode 100644 index 000000000000..95bd141554e4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erf. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf) +#include "v_erf_2u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c new file mode 100644 index 000000000000..1cf6546ce715 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfc. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc) +#include "v_erfc_4u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c new file mode 100644 index 000000000000..ef5a21d6336c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfcf. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf) +#include "v_erfcf_1u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c new file mode 100644 index 000000000000..ee8848ee24ed --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erff. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff) +#include "v_erff_1u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c b/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c new file mode 100644 index 000000000000..52a57feefbff --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_erfc. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_exp_tail.c" +#endif diff --git a/contrib/arm-optimized-routines/math/vn_expf.c b/contrib/arm-optimized-routines/pl/math/vn_expf.c similarity index 63% copy from contrib/arm-optimized-routines/math/vn_expf.c copy to contrib/arm-optimized-routines/pl/math/vn_expf.c index 0652907225d9..83e7f0a2070b 100644 --- a/contrib/arm-optimized-routines/math/vn_expf.c +++ b/contrib/arm-optimized-routines/pl/math/vn_expf.c @@ -1,12 +1,12 @@ /* * AdvSIMD vector PCS variant of __v_expf. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #ifdef __vpcs #define VPCS 1 #define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) #include "v_expf.c" #endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c new file mode 100644 index 000000000000..35111e2fc221 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expm1. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1) +#include "v_expm1_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c new file mode 100644 index 000000000000..bea491f4898e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expm1f. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f) +#include "v_expm1f_1u6.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c new file mode 100644 index 000000000000..5f32c33e059f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log10. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10) +#include "v_log10_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c new file mode 100644 index 000000000000..2673ef515df7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log10f. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f) +#include "v_log10f_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c new file mode 100644 index 000000000000..3f4f8d1bd297 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log1p. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p) +#include "v_log1p_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c new file mode 100644 index 000000000000..a319bc98f491 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log1pf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf) +#include "v_log1pf_2u1.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c new file mode 100644 index 000000000000..a87039204439 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2) +#include "v_log2_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c new file mode 100644 index 000000000000..b4a9cb708bae --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2f. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) +#include "v_log2f_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c new file mode 100644 index 000000000000..7c881de21688 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh) +#include "v_sinh_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c new file mode 100644 index 000000000000..251e73232d01 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinhf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf) +#include "v_sinhf_2u3.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c new file mode 100644 index 000000000000..a4efb065bc08 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tan. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan) +#include "v_tan_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c new file mode 100644 index 000000000000..a88cb4077b3d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanf. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) +#include "v_tanf_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c new file mode 100644 index 000000000000..cb2746cf22a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanh. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh) +#include "v_tanh_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c new file mode 100644 index 000000000000..47f0a7f57d05 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanhf. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf) +#include "v_tanhf_2u6.c" +#endif diff --git a/contrib/arm-optimized-routines/string/Dir.mk b/contrib/arm-optimized-routines/string/Dir.mk index cf3453f7580d..40ff5acc093e 100644 --- a/contrib/arm-optimized-routines/string/Dir.mk +++ b/contrib/arm-optimized-routines/string/Dir.mk @@ -1,113 +1,113 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/string B := build/string ifeq ($(ARCH),) all-string bench-string check-string install-string clean-string: @echo "*** Please set ARCH in config.mk. ***" @exit 1 else string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS]) string-test-srcs := $(wildcard $(S)/test/*.c) string-bench-srcs := $(wildcard $(S)/bench/*.c) string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) string-libs := \ build/lib/libstringlib.so \ build/lib/libstringlib.a \ string-tests := \ build/bin/test/memcpy \ build/bin/test/memmove \ build/bin/test/memset \ build/bin/test/memchr \ build/bin/test/memrchr \ build/bin/test/memcmp \ build/bin/test/__mtag_tag_region \ build/bin/test/__mtag_tag_zero_region \ build/bin/test/strcpy \ build/bin/test/stpcpy \ build/bin/test/strcmp \ build/bin/test/strchr \ build/bin/test/strrchr \ build/bin/test/strchrnul \ build/bin/test/strlen \ build/bin/test/strnlen \ build/bin/test/strncmp string-benches := \ build/bin/bench/memcpy \ build/bin/bench/strlen string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs))) string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs))) string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs))) string-objs := \ $(string-lib-objs) \ $(string-lib-objs:%.o=%.os) \ $(string-test-objs) 
\ $(string-bench-objs) string-files := \ $(string-objs) \ $(string-libs) \ $(string-tests) \ $(string-benches) \ $(string-includes) \ all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes) $(string-objs): $(string-includes) $(string-objs): CFLAGS_ALL += $(string-cflags) $(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os) $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ build/lib/libstringlib.a: $(string-lib-objs) rm -f $@ $(AR) rc $@ $^ $(RANLIB) $@ build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/include/%.h: $(S)/include/%.h cp $< $@ build/bin/%.sh: $(S)/test/%.sh cp $< $@ string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out) build/string/test/%.out: build/bin/test/% $(EMULATOR) $^ | tee $@.tmp mv $@.tmp $@ check-string: $(string-tests-out) ! grep FAIL $^ bench-string: $(string-benches) $(EMULATOR) build/bin/bench/strlen $(EMULATOR) build/bin/bench/memcpy install-string: \ $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ $(string-includes:build/include/%=$(DESTDIR)$(includedir)/%) clean-string: rm -f $(string-files) endif .PHONY: all-string bench-string check-string install-string clean-string diff --git a/contrib/arm-optimized-routines/string/README.contributors b/contrib/arm-optimized-routines/string/README.contributors new file mode 100644 index 000000000000..0b4a51b56366 --- /dev/null +++ b/contrib/arm-optimized-routines/string/README.contributors @@ -0,0 +1,30 @@ +STYLE REQUIREMENTS +================== + +1. Most code in this sub-directory is expected to be upstreamed into glibc so + the GNU Coding Standard and glibc specific conventions should be followed + to ease upstreaming. + +2. 
ABI and symbols: the code should be written so it is suitable for inclusion + into a libc with minimal changes. This e.g. means that internal symbols + should be hidden and in the implementation reserved namespace according to + ISO C and POSIX rules. If possible the built shared libraries and static + library archives should be usable to override libc symbols at link time (or + at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI + (other than symbol versioning), this cannot be done reliably for static + linking so this is a best effort requirement. + +3. API: include headers should be suitable for benchmarking and testing code + and should not conflict with libc headers. + + +CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY +================================================ +1. Code: + - The assumptions of the code must be clearly documented. + + - Assembly style should be consistent across different implementations. + + +2. Performance: + - Benchmarking is needed on several microarchitectures. diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S index 84339f73cf23..207e22950c6d 100644 --- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S +++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S @@ -1,100 +1,100 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, MTE, LP64 ABI. * * Interface contract: * Address is 16 byte aligned and size is multiple of 16. * Returns the passed pointer. * The memory region may remain untagged if tagging is not enabled. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING #define dstin x0 #define count x1 #define dst x2 #define dstend x3 #define tmp x4 #define zva_val x4 ENTRY (__mtag_tag_region) PTR_ARG (0) SIZE_ARG (1) add dstend, dstin, count cmp count, 96 b.hi L(set_long) tbnz count, 6, L(set96) /* Set 0, 16, 32, or 48 bytes. */ lsr tmp, count, 5 add tmp, dstin, tmp, lsl 4 cbz count, L(end) stg dstin, [dstin] stg dstin, [tmp] stg dstin, [dstend, -16] L(end): ret .p2align 4 /* Set 64..96 bytes. Write 64 bytes from the start and 32 bytes from the end. */ L(set96): st2g dstin, [dstin] st2g dstin, [dstin, 32] st2g dstin, [dstend, -32] ret .p2align 4 /* Size is > 96 bytes. */ L(set_long): cmp count, 160 b.lo L(no_zva) #ifndef SKIP_ZVA_CHECK mrs zva_val, dczid_el0 and zva_val, zva_val, 31 cmp zva_val, 4 /* ZVA size is 64 bytes. */ b.ne L(no_zva) #endif st2g dstin, [dstin] st2g dstin, [dstin, 32] bic dst, dstin, 63 sub count, dstend, dst /* Count is now 64 too large. */ sub count, count, 128 /* Adjust count and bias for loop. */ .p2align 4 L(zva_loop): add dst, dst, 64 dc gva, dst subs count, count, 64 b.hi L(zva_loop) st2g dstin, [dstend, -64] st2g dstin, [dstend, -32] ret L(no_zva): sub dst, dstin, 32 /* Dst is biased by -32. */ sub count, count, 64 /* Adjust count for loop. */ L(no_zva_loop): st2g dstin, [dst, 32] st2g dstin, [dst, 64]! subs count, count, 64 b.hi L(no_zva_loop) st2g dstin, [dstend, -64] st2g dstin, [dstend, -32] ret END (__mtag_tag_region) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S index f58364ca6fcb..44b8e0114f42 100644 --- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S +++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S @@ -1,100 +1,100 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, MTE, LP64 ABI. * * Interface contract: * Address is 16 byte aligned and size is multiple of 16. * Returns the passed pointer. * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING #define dstin x0 #define count x1 #define dst x2 #define dstend x3 #define tmp x4 #define zva_val x4 ENTRY (__mtag_tag_zero_region) PTR_ARG (0) SIZE_ARG (1) add dstend, dstin, count cmp count, 96 b.hi L(set_long) tbnz count, 6, L(set96) /* Set 0, 16, 32, or 48 bytes. */ lsr tmp, count, 5 add tmp, dstin, tmp, lsl 4 cbz count, L(end) stzg dstin, [dstin] stzg dstin, [tmp] stzg dstin, [dstend, -16] L(end): ret .p2align 4 /* Set 64..96 bytes. Write 64 bytes from the start and 32 bytes from the end. */ L(set96): stz2g dstin, [dstin] stz2g dstin, [dstin, 32] stz2g dstin, [dstend, -32] ret .p2align 4 /* Size is > 96 bytes. */ L(set_long): cmp count, 160 b.lo L(no_zva) #ifndef SKIP_ZVA_CHECK mrs zva_val, dczid_el0 and zva_val, zva_val, 31 cmp zva_val, 4 /* ZVA size is 64 bytes. */ b.ne L(no_zva) #endif stz2g dstin, [dstin] stz2g dstin, [dstin, 32] bic dst, dstin, 63 sub count, dstend, dst /* Count is now 64 too large. */ sub count, count, 128 /* Adjust count and bias for loop. */ .p2align 4 L(zva_loop): add dst, dst, 64 dc gzva, dst subs count, count, 64 b.hi L(zva_loop) stz2g dstin, [dstend, -64] stz2g dstin, [dstend, -32] ret L(no_zva): sub dst, dstin, 32 /* Dst is biased by -32. */ sub count, count, 64 /* Adjust count for loop. */ L(no_zva_loop): stz2g dstin, [dst, 32] stz2g dstin, [dst, 64]! 
subs count, count, 64 b.hi L(no_zva_loop) stz2g dstin, [dstend, -64] stz2g dstin, [dstend, -32] ret END (__mtag_tag_zero_region) #endif diff --git a/contrib/arm-optimized-routines/string/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h similarity index 83% rename from contrib/arm-optimized-routines/string/asmdefs.h rename to contrib/arm-optimized-routines/string/aarch64/asmdefs.h index 340b427a505b..069b146f4a69 100644 --- a/contrib/arm-optimized-routines/string/asmdefs.h +++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h @@ -1,98 +1,92 @@ /* - * Macros for asm code. + * Macros for asm code. AArch64 version. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _ASMDEFS_H #define _ASMDEFS_H -#if defined(__aarch64__) - /* Branch Target Identitication support. */ #define BTI_C hint 34 #define BTI_J hint 36 /* Return address signing support (pac-ret). */ #define PACIASP hint 25; .cfi_window_save #define AUTIASP hint 29; .cfi_window_save /* GNU_PROPERTY_AARCH64_* macros from elf.h. */ #define FEATURE_1_AND 0xc0000000 #define FEATURE_1_BTI 1 #define FEATURE_1_PAC 2 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ #define GNU_PROPERTY(type, value) \ .section .note.gnu.property, "a"; \ .p2align 3; \ .word 4; \ .word 16; \ .word 5; \ .asciz "GNU"; \ .word type; \ .word 4; \ .word value; \ .word 0; \ .text /* If set then the GNU Property Note section will be added to mark objects to support BTI and PAC-RET. */ #ifndef WANT_GNU_PROPERTY #define WANT_GNU_PROPERTY 1 #endif #if WANT_GNU_PROPERTY /* Add property note with supported features to all asm files. 
*/ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #endif #define ENTRY_ALIGN(name, alignment) \ .global name; \ .type name,%function; \ .align alignment; \ name: \ .cfi_startproc; \ BTI_C; -#else - -#define END_FILE - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - .cfi_startproc; - -#endif - #define ENTRY(name) ENTRY_ALIGN(name, 6) #define ENTRY_ALIAS(name) \ .global name; \ .type name,%function; \ name: #define END(name) \ .cfi_endproc; \ .size name, .-name; #define L(l) .L ## l #ifdef __ILP32__ /* Sanitize padding bits of pointer arguments as per aapcs64 */ #define PTR_ARG(n) mov w##n, w##n #else #define PTR_ARG(n) #endif #ifdef __ILP32__ /* Sanitize padding bits of size arguments as per aapcs64 */ #define SIZE_ARG(n) mov w##n, w##n #else #define SIZE_ARG(n) #endif +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/check-arch.S b/contrib/arm-optimized-routines/string/aarch64/check-arch.S index 5a54242d7de6..131b7fa36ec2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/check-arch.S +++ b/contrib/arm-optimized-routines/string/aarch64/check-arch.S @@ -1,13 +1,13 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ # error ARCH setting does not match the compiler. #endif /* Include for GNU property notes. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S index c2e967d1004e..948c3cbc7dd4 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S @@ -1,116 +1,110 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 #define cntin x2 #define result x0 #define src x3 #define cntrem x4 #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) SIZE_ARG (2) bic src, srcin, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - add result, srcin, synd, lsr 2 cmp cntin, synd, lsr 2 + add result, srcin, synd, lsr 2 csel result, result, xzr, hi ret + .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 16 + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 4 L(loop32): - ldr qdata, [src, 16]! + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16]! 
- subs cntrem, cntrem, 32 + ldr qdata, [src, 16] cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + subs cntrem, cntrem, 32 + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub cntrem, src, srcin fmov synd, dend - add tmp, srcin, cntin - sub cntrem, tmp, src + sub cntrem, cntin, cntrem #ifndef __AARCH64EB__ rbit synd, synd #endif clz synd, synd cmp cntrem, synd, lsr 2 add result, src, synd, lsr 2 csel result, result, xzr, hi ret L(nomatch): mov result, 0 ret END (__memchr_aarch64_mte) diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S index c22e6596f19b..b851cf31f238 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S @@ -1,64 +1,64 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__memchr_aarch64_sve) PTR_ARG (0) SIZE_ARG (2) dup z1.b, w1 /* duplicate c to a vector */ setffr /* initialize FFR */ mov x3, 0 /* initialize off */ .p2align 4 0: whilelo p1.b, x3, x2 /* make sure off < max */ b.none 9f /* Read a vector's worth of bytes, bounded by max, stopping on first fault. */ ldff1b z0.b, p1/z, [x0, x3] rdffrs p0.b, p1/z b.nlast 2f /* First fault did not fail: the vector bounded by max is valid. Avoid depending on the contents of FFR beyond the branch. 
*/ incb x3 /* speculate increment */ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */ b.none 0b decb x3 /* undo speculate */ /* Found C. */ 1: brkb p2.b, p1/z, p2.b /* find the first c */ add x0, x0, x3 /* form partial pointer */ incp x0, p2.b /* form final pointer to c */ ret /* First fault failed: only some of the vector is valid. Perform the comparision only on the valid bytes. */ 2: cmpeq p2.b, p0/z, z0.b, z1.b b.any 1b /* No C found. Re-init FFR, increment, and loop. */ setffr incp x3, p0.b b 0b /* Found end of count. */ 9: mov x0, 0 /* return null */ ret END (__memchr_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr.S b/contrib/arm-optimized-routines/string/aarch64/memchr.S index 353f0d1eac53..fe6cfe2bc0e2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr.S @@ -1,146 +1,146 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64 * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 #define chrin w1 #define cntin x2 #define result x0 #define src x3 #define tmp x4 #define wtmp2 w5 #define synd x6 #define soff x9 #define cntrem x10 #define vrepchr v0 #define vdata1 v1 #define vdata2 v2 #define vhas_chr1 v3 #define vhas_chr2 v4 #define vrepmask v5 #define vend v6 /* * Core algorithm: * * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits * per byte. For each tuple, bit 0 is set if the relevant byte matched the * requested character and bit 1 is not used (faster than using a 32bit * syndrome). 
Since the bits in the syndrome reflect exactly the order in which * things occur in the original string, counting trailing zeros allows to * identify exactly which byte has matched. */ ENTRY (__memchr_aarch64) PTR_ARG (0) SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. */ cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. */ mov wtmp2, #0x0401 movk wtmp2, #0x4010, lsl #16 dup vrepchr.16b, chrin /* Work with aligned 32-byte chunks */ bic src, srcin, #31 dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome * value for the aligned 32 bytes block containing the first bytes * and mask the irrelevant part. */ ld1 {vdata1.16b, vdata2.16b}, [src], #32 sub tmp, soff, #32 adds cntin, cntin, tmp cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Clear the soff*2 lower bits */ lsl tmp, soff, #1 lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ b.ls L(masklast) /* Have we found something already? 
*/ cbnz synd, L(tail) L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ cbz synd, L(loop) L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ b.hs L(tail) L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 sub tmp, tmp, #32 neg tmp, tmp, lsl #1 lsl synd, synd, tmp lsr synd, synd, tmp L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ sub src, src, #32 /* Check that we have found a character */ cmp synd, #0 /* And count the leading zeros */ clz synd, synd /* Compute the potential result */ add result, src, synd, lsr #1 /* Select result or NULL */ csel result, xzr, result, eq ret L(zero_length): mov result, #0 ret END (__memchr_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S index 78c5ecaa4cdc..d52ce4555344 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S @@ -1,51 +1,51 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__memcmp_aarch64_sve) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) mov x3, 0 /* initialize off */ 0: whilelo p0.b, x3, x2 /* while off < max */ b.none 9f ld1b z0.b, p0/z, [x0, x3] /* read vectors bounded by max. */ ld1b z1.b, p0/z, [x1, x3] /* Increment for a whole vector, even if we've only read a partial. This is significantly cheaper than INCP, and since OFF is not used after the loop it is ok to increment OFF past MAX. */ incb x3 cmpne p1.b, p0/z, z0.b, z1.b /* while no inequalities */ b.none 0b /* Found inequality. */ 1: brkb p1.b, p0/z, p1.b /* find first such */ lasta w0, p1, z0.b /* extract each byte */ lasta w1, p1, z1.b sub x0, x0, x1 /* return comparison */ ret /* Found end-of-count. */ 9: mov x0, 0 /* return equality */ ret END (__memcmp_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp.S b/contrib/arm-optimized-routines/string/aarch64/memcmp.S index 7ca1135edec7..35135e72cc8e 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcmp.S @@ -1,190 +1,190 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define src1 x0 #define src2 x1 #define limit x2 #define result w0 #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 #define data3 x5 #define data3w w5 #define data4 x6 #define data4w w6 #define tmp x6 #define src1end x7 #define src2end x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) cmp limit, 16 b.lo L(less16) ldp data1, data3, [src1] ldp data2, data4, [src2] ccmp data1, data2, 0, ne ccmp data3, data4, 0, eq b.ne L(return2) add src1end, src1, limit add src2end, src2, limit cmp limit, 32 b.ls L(last_bytes) cmp limit, 160 b.hs L(loop_align) sub limit, limit, 32 .p2align 4 L(loop32): ldp data1, data3, [src1, 16] ldp data2, data4, [src2, 16] cmp data1, data2 ccmp data3, data4, 0, eq b.ne L(return2) cmp limit, 16 b.ls L(last_bytes) ldp data1, data3, [src1, 32] ldp data2, data4, [src2, 32] cmp data1, data2 ccmp data3, data4, 0, eq b.ne L(return2) add src1, src1, 32 add src2, src2, 32 L(last64): subs limit, limit, 32 b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): ldp data1, data3, [src1end, -16] ldp data2, data4, [src2end, -16] L(return2): cmp data1, data2 csel data1, data1, data3, ne csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. 
*/ L(return): #ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 L(less16): add src1end, src1, limit add src2end, src2, limit tbz limit, 3, L(less8) ldr data1, [src1] ldr data2, [src2] ldr data3, [src1end, -8] ldr data4, [src2end, -8] b L(return2) .p2align 4 L(less8): tbz limit, 2, L(less4) ldr data1w, [src1] ldr data2w, [src2] ldr data3w, [src1end, -4] ldr data4w, [src2end, -4] b L(return2) L(less4): tbz limit, 1, L(less2) ldrh data1w, [src1] ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) L(less2): mov result, 0 tbz limit, 0, L(return_zero) ldrb data1w, [src1end, -1] ldrb data2w, [src2end, -1] sub result, data1w, data2w L(return_zero): ret L(loop_align): ldp data1, data3, [src1, 16] ldp data2, data4, [src2, 16] cmp data1, data2 ccmp data3, data4, 0, eq b.ne L(return2) /* Align src2 and adjust src1, src2 and limit. */ and tmp, src2, 15 sub tmp, tmp, 16 sub src2, src2, tmp add limit, limit, tmp sub src1, src1, tmp sub limit, limit, 64 + 16 .p2align 4 L(loop64): ldr q0, [src1, 16] ldr q1, [src2, 16] subs limit, limit, 64 ldr q2, [src1, 32] ldr q3, [src2, 32] eor v0.16b, v0.16b, v1.16b eor v1.16b, v2.16b, v3.16b ldr q2, [src1, 48] ldr q3, [src2, 48] umaxp v0.16b, v0.16b, v1.16b ldr q4, [src1, 64]! ldr q5, [src2, 64]! eor v1.16b, v2.16b, v3.16b eor v2.16b, v4.16b, v5.16b umaxp v1.16b, v1.16b, v2.16b umaxp v0.16b, v0.16b, v1.16b umaxp v0.16b, v0.16b, v0.16b fmov tmp, d0 ccmp tmp, 0, 0, hi b.eq L(loop64) /* If equal, process last 1-64 bytes using scalar loop. */ add limit, limit, 64 + 16 cbz tmp, L(last64) /* Determine the 8-byte aligned offset of the first difference. 
*/ #ifdef __AARCH64EB__ rev16 tmp, tmp #endif rev tmp, tmp clz tmp, tmp bic tmp, tmp, 7 sub tmp, tmp, 48 ldr data1, [src1, tmp] ldr data2, [src2, tmp] #ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif mov result, 1 cmp data1, data2 cneg result, result, lo ret END (__memcmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S index f97f2c3047b9..e6527d0dac2c 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S @@ -1,206 +1,206 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 #define count x2 #define dst x3 #define srcend x4 #define dstend x5 #define A_l x6 #define A_lw w6 #define A_h x7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_lw w10 #define tmp1 x14 #define A_q q0 #define B_q q1 #define C_q q2 #define D_q q3 #define E_q q4 #define F_q q5 #define G_q q6 #define H_q q7 /* This implementation handles overlaps and supports both memcpy and memmove from a single entry point. It uses unaligned accesses and branchless sequences to keep the code small, simple and improve performance. Copies are split into 3 main cases: small copies of up to 32 bytes, medium copies of up to 128 bytes, and large copies. The overhead of the overlap check is negligible since it is only required for large copies. Large copies use a software pipelined loop processing 64 bytes per iteration. The source pointer is 16-byte aligned to minimize unaligned accesses. The loop tail is handled by always copying 64 bytes from the end. 
*/ ENTRY_ALIAS (__memmove_aarch64_simd) ENTRY (__memcpy_aarch64_simd) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) add srcend, src, count add dstend, dstin, count cmp count, 128 b.hi L(copy_long) cmp count, 32 b.hi L(copy32_128) /* Small copies: 0..32 bytes. */ cmp count, 16 b.lo L(copy16) ldr A_q, [src] ldr B_q, [srcend, -16] str A_q, [dstin] str B_q, [dstend, -16] ret /* Copy 8-15 bytes. */ L(copy16): tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) ldr A_lw, [src] ldr B_lw, [srcend, -4] str A_lw, [dstin] str B_lw, [dstend, -4] ret /* Copy 0..3 bytes using a branchless sequence. */ L(copy4): cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] strb C_lw, [dstend, -1] L(copy0): ret .p2align 4 /* Medium copies: 33..128 bytes. */ L(copy32_128): ldp A_q, B_q, [src] ldp C_q, D_q, [srcend, -32] cmp count, 64 b.hi L(copy128) stp A_q, B_q, [dstin] stp C_q, D_q, [dstend, -32] ret .p2align 4 /* Copy 65..128 bytes. */ L(copy128): ldp E_q, F_q, [src, 32] cmp count, 96 b.ls L(copy96) ldp G_q, H_q, [srcend, -64] stp G_q, H_q, [dstend, -64] L(copy96): stp A_q, B_q, [dstin] stp E_q, F_q, [dstin, 32] stp C_q, D_q, [dstend, -32] ret /* Copy more than 128 bytes. */ L(copy_long): /* Use backwards copy if there is an overlap. */ sub tmp1, dstin, src cmp tmp1, count b.lo L(copy_long_backwards) /* Copy 16 bytes and then align src to 16-byte alignment. */ ldr D_q, [src] and tmp1, src, 15 bic src, src, 15 sub dst, dstin, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_q, B_q, [src, 16] str D_q, [dstin] ldp C_q, D_q, [src, 48] subs count, count, 128 + 16 /* Test and readjust count. 
*/ b.ls L(copy64_from_end) L(loop64): stp A_q, B_q, [dst, 16] ldp A_q, B_q, [src, 80] stp C_q, D_q, [dst, 48] ldp C_q, D_q, [src, 112] add src, src, 64 add dst, dst, 64 subs count, count, 64 b.hi L(loop64) /* Write the last iteration and copy 64 bytes from the end. */ L(copy64_from_end): ldp E_q, F_q, [srcend, -64] stp A_q, B_q, [dst, 16] ldp A_q, B_q, [srcend, -32] stp C_q, D_q, [dst, 48] stp E_q, F_q, [dstend, -64] stp A_q, B_q, [dstend, -32] ret /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): cbz tmp1, L(copy0) ldr D_q, [srcend, -16] and tmp1, srcend, 15 bic srcend, srcend, 15 sub count, count, tmp1 ldp A_q, B_q, [srcend, -32] str D_q, [dstend, -16] ldp C_q, D_q, [srcend, -64] sub dstend, dstend, tmp1 subs count, count, 128 b.ls L(copy64_from_start) L(loop64_backwards): str B_q, [dstend, -16] str A_q, [dstend, -32] ldp A_q, B_q, [srcend, -96] str D_q, [dstend, -48] str C_q, [dstend, -64]! ldp C_q, D_q, [srcend, -128] sub srcend, srcend, 64 subs count, count, 64 b.hi L(loop64_backwards) /* Write the last iteration and copy 64 bytes from the start. */ L(copy64_from_start): ldp E_q, F_q, [src, 32] stp A_q, B_q, [dstend, -32] ldp A_q, B_q, [src] stp C_q, D_q, [dstend, -64] stp E_q, F_q, [dstin, 32] stp A_q, B_q, [dstin] ret END (__memcpy_aarch64_simd) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S index f85e8009f3c5..e8a946d7db37 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S @@ -1,180 +1,177 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. 
* */ -#if __ARM_FEATURE_SVE +#include "asmdefs.h" -#include "../asmdefs.h" +#ifdef HAVE_SVE + +.arch armv8-a+sve #define dstin x0 #define src x1 #define count x2 #define dst x3 #define srcend x4 #define dstend x5 #define tmp1 x6 #define vlen x6 #define A_q q0 #define B_q q1 #define C_q q2 #define D_q q3 #define E_q q4 #define F_q q5 #define G_q q6 #define H_q q7 /* This implementation handles overlaps and supports both memcpy and memmove from a single entry point. It uses unaligned accesses and branchless sequences to keep the code small, simple and improve performance. SVE vectors are used to speedup small copies. Copies are split into 3 main cases: small copies of up to 32 bytes, medium copies of up to 128 bytes, and large copies. The overhead of the overlap check is negligible since it is only required for large copies. Large copies use a software pipelined loop processing 64 bytes per iteration. The source pointer is 16-byte aligned to minimize unaligned accesses. The loop tail is handled by always copying 64 bytes from the end. */ ENTRY_ALIAS (__memmove_aarch64_sve) ENTRY (__memcpy_aarch64_sve) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) cmp count, 128 b.hi L(copy_long) - cmp count, 32 + cntb vlen + cmp count, vlen, lsl 1 b.hi L(copy32_128) whilelo p0.b, xzr, count - cntb vlen - tbnz vlen, 4, L(vlen128) - ld1b z0.b, p0/z, [src] - st1b z0.b, p0, [dstin] + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] ret /* Medium copies: 33..128 bytes. */ L(copy32_128): add srcend, src, count add dstend, dstin, count ldp A_q, B_q, [src] ldp C_q, D_q, [srcend, -32] cmp count, 64 b.hi L(copy128) stp A_q, B_q, [dstin] stp C_q, D_q, [dstend, -32] ret /* Copy 65..128 bytes. 
*/ L(copy128): ldp E_q, F_q, [src, 32] cmp count, 96 b.ls L(copy96) ldp G_q, H_q, [srcend, -64] stp G_q, H_q, [dstend, -64] L(copy96): stp A_q, B_q, [dstin] stp E_q, F_q, [dstin, 32] stp C_q, D_q, [dstend, -32] ret /* Copy more than 128 bytes. */ L(copy_long): add srcend, src, count add dstend, dstin, count /* Use backwards copy if there is an overlap. */ sub tmp1, dstin, src cmp tmp1, count b.lo L(copy_long_backwards) /* Copy 16 bytes and then align src to 16-byte alignment. */ ldr D_q, [src] and tmp1, src, 15 bic src, src, 15 sub dst, dstin, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_q, B_q, [src, 16] str D_q, [dstin] ldp C_q, D_q, [src, 48] subs count, count, 128 + 16 /* Test and readjust count. */ b.ls L(copy64_from_end) L(loop64): stp A_q, B_q, [dst, 16] ldp A_q, B_q, [src, 80] stp C_q, D_q, [dst, 48] ldp C_q, D_q, [src, 112] add src, src, 64 add dst, dst, 64 subs count, count, 64 b.hi L(loop64) /* Write the last iteration and copy 64 bytes from the end. */ L(copy64_from_end): ldp E_q, F_q, [srcend, -64] stp A_q, B_q, [dst, 16] ldp A_q, B_q, [srcend, -32] stp C_q, D_q, [dst, 48] stp E_q, F_q, [dstend, -64] stp A_q, B_q, [dstend, -32] ret -L(vlen128): - whilelo p1.b, vlen, count - ld1b z0.b, p0/z, [src, 0, mul vl] - ld1b z1.b, p1/z, [src, 1, mul vl] - st1b z0.b, p0, [dstin, 0, mul vl] - st1b z1.b, p1, [dstin, 1, mul vl] - ret - /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): cbz tmp1, L(return) ldr D_q, [srcend, -16] and tmp1, srcend, 15 bic srcend, srcend, 15 sub count, count, tmp1 ldp A_q, B_q, [srcend, -32] str D_q, [dstend, -16] ldp C_q, D_q, [srcend, -64] sub dstend, dstend, tmp1 subs count, count, 128 b.ls L(copy64_from_start) L(loop64_backwards): str B_q, [dstend, -16] str A_q, [dstend, -32] ldp A_q, B_q, [srcend, -96] str D_q, [dstend, -48] str C_q, [dstend, -64]! 
ldp C_q, D_q, [srcend, -128] sub srcend, srcend, 64 subs count, count, 64 b.hi L(loop64_backwards) /* Write the last iteration and copy 64 bytes from the start. */ L(copy64_from_start): ldp E_q, F_q, [src, 32] stp A_q, B_q, [dstend, -32] ldp A_q, B_q, [src] stp C_q, D_q, [dstend, -64] stp E_q, F_q, [dstin, 32] stp A_q, B_q, [dstin] L(return): ret END (__memcpy_aarch64_sve) + #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy.S b/contrib/arm-optimized-routines/string/aarch64/memcpy.S index dd254f6f9929..7c0606e2104a 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy.S @@ -1,243 +1,243 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, unaligned accesses. * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 #define count x2 #define dst x3 #define srcend x4 #define dstend x5 #define A_l x6 #define A_lw w6 #define A_h x7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_l x10 #define C_lw w10 #define C_h x11 #define D_l x12 #define D_h x13 #define E_l x14 #define E_h x15 #define F_l x16 #define F_h x17 #define G_l count #define G_h dst #define H_l src #define H_h srcend #define tmp1 x14 /* This implementation handles overlaps and supports both memcpy and memmove from a single entry point. It uses unaligned accesses and branchless sequences to keep the code small, simple and improve performance. Copies are split into 3 main cases: small copies of up to 32 bytes, medium copies of up to 128 bytes, and large copies. The overhead of the overlap check is negligible since it is only required for large copies. Large copies use a software pipelined loop processing 64 bytes per iteration. 
The destination pointer is 16-byte aligned to minimize unaligned accesses. The loop tail is handled by always copying 64 bytes from the end. */ ENTRY_ALIAS (__memmove_aarch64) ENTRY (__memcpy_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) add srcend, src, count add dstend, dstin, count cmp count, 128 b.hi L(copy_long) cmp count, 32 b.hi L(copy32_128) /* Small copies: 0..32 bytes. */ cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] stp D_l, D_h, [dstend, -16] ret /* Copy 8-15 bytes. */ L(copy16): tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) ldr A_lw, [src] ldr B_lw, [srcend, -4] str A_lw, [dstin] str B_lw, [dstend, -4] ret /* Copy 0..3 bytes using a branchless sequence. */ L(copy4): cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] strb C_lw, [dstend, -1] L(copy0): ret .p2align 4 /* Medium copies: 33..128 bytes. */ L(copy32_128): ldp A_l, A_h, [src] ldp B_l, B_h, [src, 16] ldp C_l, C_h, [srcend, -32] ldp D_l, D_h, [srcend, -16] cmp count, 64 b.hi L(copy128) stp A_l, A_h, [dstin] stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstend, -32] stp D_l, D_h, [dstend, -16] ret .p2align 4 /* Copy 65..128 bytes. */ L(copy128): ldp E_l, E_h, [src, 32] ldp F_l, F_h, [src, 48] cmp count, 96 b.ls L(copy96) ldp G_l, G_h, [srcend, -64] ldp H_l, H_h, [srcend, -48] stp G_l, G_h, [dstend, -64] stp H_l, H_h, [dstend, -48] L(copy96): stp A_l, A_h, [dstin] stp B_l, B_h, [dstin, 16] stp E_l, E_h, [dstin, 32] stp F_l, F_h, [dstin, 48] stp C_l, C_h, [dstend, -32] stp D_l, D_h, [dstend, -16] ret .p2align 4 /* Copy more than 128 bytes. */ L(copy_long): /* Use backwards copy if there is an overlap. 
*/ sub tmp1, dstin, src cbz tmp1, L(copy0) cmp tmp1, count b.lo L(copy_long_backwards) /* Copy 16 bytes and then align dst to 16-byte alignment. */ ldp D_l, D_h, [src] and tmp1, dstin, 15 bic dst, dstin, 15 sub src, src, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] stp D_l, D_h, [dstin] ldp B_l, B_h, [src, 32] ldp C_l, C_h, [src, 48] ldp D_l, D_h, [src, 64]! subs count, count, 128 + 16 /* Test and readjust count. */ b.ls L(copy64_from_end) L(loop64): stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] ldp B_l, B_h, [src, 32] stp C_l, C_h, [dst, 48] ldp C_l, C_h, [src, 48] stp D_l, D_h, [dst, 64]! ldp D_l, D_h, [src, 64]! subs count, count, 64 b.hi L(loop64) /* Write the last iteration and copy 64 bytes from the end. */ L(copy64_from_end): ldp E_l, E_h, [srcend, -64] stp A_l, A_h, [dst, 16] ldp A_l, A_h, [srcend, -48] stp B_l, B_h, [dst, 32] ldp B_l, B_h, [srcend, -32] stp C_l, C_h, [dst, 48] ldp C_l, C_h, [srcend, -16] stp D_l, D_h, [dst, 64] stp E_l, E_h, [dstend, -64] stp A_l, A_h, [dstend, -48] stp B_l, B_h, [dstend, -32] stp C_l, C_h, [dstend, -16] ret .p2align 4 /* Large backwards copy for overlapping copies. Copy 16 bytes and then align dst to 16-byte alignment. */ L(copy_long_backwards): ldp D_l, D_h, [srcend, -16] and tmp1, dstend, 15 sub srcend, srcend, tmp1 sub count, count, tmp1 ldp A_l, A_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ldp B_l, B_h, [srcend, -32] ldp C_l, C_h, [srcend, -48] ldp D_l, D_h, [srcend, -64]! sub dstend, dstend, tmp1 subs count, count, 128 b.ls L(copy64_from_start) L(loop64_backwards): stp A_l, A_h, [dstend, -16] ldp A_l, A_h, [srcend, -16] stp B_l, B_h, [dstend, -32] ldp B_l, B_h, [srcend, -32] stp C_l, C_h, [dstend, -48] ldp C_l, C_h, [srcend, -48] stp D_l, D_h, [dstend, -64]! ldp D_l, D_h, [srcend, -64]! subs count, count, 64 b.hi L(loop64_backwards) /* Write the last iteration and copy 64 bytes from the start. 
*/ L(copy64_from_start): ldp G_l, G_h, [src, 48] stp A_l, A_h, [dstend, -16] ldp A_l, A_h, [src, 32] stp B_l, B_h, [dstend, -32] ldp B_l, B_h, [src, 16] stp C_l, C_h, [dstend, -48] ldp C_l, C_h, [src] stp D_l, D_h, [dstend, -64] stp G_l, G_h, [dstin, 48] stp A_l, A_h, [dstin, 32] stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstin] ret END (__memcpy_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/memrchr.S b/contrib/arm-optimized-routines/string/aarch64/memrchr.S index 7b4be847cecb..6418bdf56f41 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memrchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/memrchr.S @@ -1,117 +1,112 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 #define cntin x2 #define result x0 #define src x3 #define cntrem x4 #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. 
We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) add end, srcin, cntin sub endm1, end, 1 bic src, endm1, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) clz synd, synd sub result, endm1, synd, lsr 2 cmp cntin, synd, lsr 2 csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! 
+ ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 #ifdef __AARCH64EB__ rbit synd, synd #endif clz synd, synd sub tmp, tmp, synd, lsr 2 cmp tmp, srcin csel result, tmp, xzr, hs ret L(nomatch): mov result, 0 ret END (__memrchr_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/memset.S b/contrib/arm-optimized-routines/string/aarch64/memset.S index 9fcd97579913..553b0fcaefea 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memset.S +++ b/contrib/arm-optimized-routines/string/aarch64/memset.S @@ -1,117 +1,117 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 #define valw w1 #define count x2 #define dst x3 #define dstend x4 #define zva_val x5 ENTRY (__memset_aarch64) PTR_ARG (0) SIZE_ARG (2) dup v0.16B, valw add dstend, dstin, count cmp count, 96 b.hi L(set_long) cmp count, 16 b.hs L(set_medium) mov val, v0.D[0] /* Set 0..15 bytes. */ tbz count, 3, 1f str val, [dstin] str val, [dstend, -8] ret .p2align 4 1: tbz count, 2, 2f str valw, [dstin] str valw, [dstend, -4] ret 2: cbz count, 3f strb valw, [dstin] tbz count, 1, 3f strh valw, [dstend, -2] 3: ret /* Set 17..96 bytes. 
*/ L(set_medium): str q0, [dstin] tbnz count, 6, L(set96) str q0, [dstend, -16] tbz count, 5, 1f str q0, [dstin, 16] str q0, [dstend, -32] 1: ret .p2align 4 /* Set 64..96 bytes. Write 64 bytes from the start and 32 bytes from the end. */ L(set96): str q0, [dstin, 16] stp q0, q0, [dstin, 32] stp q0, q0, [dstend, -32] ret .p2align 4 L(set_long): and valw, valw, 255 bic dst, dstin, 15 str q0, [dstin] cmp count, 160 ccmp valw, 0, 0, hs b.ne L(no_zva) #ifndef SKIP_ZVA_CHECK mrs zva_val, dczid_el0 and zva_val, zva_val, 31 cmp zva_val, 4 /* ZVA size is 64 bytes. */ b.ne L(no_zva) #endif str q0, [dst, 16] stp q0, q0, [dst, 32] bic dst, dst, 63 sub count, dstend, dst /* Count is now 64 too large. */ sub count, count, 128 /* Adjust count and bias for loop. */ .p2align 4 L(zva_loop): add dst, dst, 64 dc zva, dst subs count, count, 64 b.hi L(zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret L(no_zva): sub count, dstend, dst /* Count is 16 too large. */ sub dst, dst, 16 /* Dst is biased by -32. */ sub count, count, 64 + 16 /* Adjust count and bias for loop. */ L(no_zva_loop): stp q0, q0, [dst, 32] stp q0, q0, [dst, 64]! subs count, count, 64 b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret END (__memset_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S index 82dd9717b0a0..5d3f14b86026 100644 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S @@ -1,10 +1,10 @@ /* * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 #include "strcpy-sve.S" diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S index 4f62aa462389..155c68d75a7b 100644 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S @@ -1,10 +1,10 @@ /* * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 #include "strcpy.S" diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S index dcb0e4625870..6ec08f7acc76 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S @@ -1,105 +1,101 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. 
For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 clz tmp1, tmp1 /* Tmp1 is an even multiple of 2 if the target character was found first. Otherwise we've found the end of string. */ tst tmp1, 2 add result, srcin, tmp1, lsr 2 csel result, result, xzr, eq ret .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! 
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq ret END (__strchr_aarch64_mte) diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S index 13ba9f44f9c5..ff075167bfef 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S @@ -1,70 +1,70 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ /* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. 
*/ #ifdef BUILD_STRCHRNUL #define FUNC __strchrnul_aarch64_sve #else #define FUNC __strchr_aarch64_sve #endif ENTRY (FUNC) PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ .p2align 4 /* Read a vector's worth of bytes, stopping on first fault. */ 0: ldff1b z0.b, p1/z, [x0, xzr] rdffrs p0.b, p1/z b.nlast 2f /* First fault did not fail: the whole vector is valid. Avoid depending on the contents of FFR beyond the branch. */ incb x0 /* speculate increment */ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */ orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */ b.none 0b decb x0 /* undo speculate */ /* Found C or 0. */ 1: brka p4.b, p1/z, p4.b /* find first such */ sub x0, x0, 1 /* adjust pointer for that byte */ incp x0, p4.b #ifndef BUILD_STRCHRNUL ptest p4, p2.b /* was first in c? */ csel x0, xzr, x0, none /* if there was no c, return null */ #endif ret /* First fault failed: only some of the vector is valid. Perform the comparision only on the valid bytes. */ 2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */ cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */ orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */ b.any 1b /* No C or 0 found. Re-init FFR, increment, and loop. */ setffr incp x0, p0.b b 0b END (FUNC) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr.S b/contrib/arm-optimized-routines/string/aarch64/strchr.S index 1063cbfd77aa..37193bd947a7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr.S @@ -1,126 +1,126 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64 * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. 
*/ #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp1 x3 #define wtmp2 w4 #define tmp3 x5 #define vrepchr v0 #define vdata1 v1 #define vdata2 v2 #define vhas_nul1 v3 #define vhas_nul2 v4 #define vhas_chr1 v5 #define vhas_chr2 v6 #define vrepmask_0 v7 #define vrepmask_c v16 #define vend1 v17 #define vend2 v18 /* Core algorithm. For each 32-byte hunk we calculate a 64-bit syndrome value, with two bits per byte (LSB is always in bits 0 and 1, for both big and little-endian systems). For each tuple, bit 0 is set iff the relevant byte matched the requested character; bit 1 is set iff the relevant byte matched the NUL end of string (we trigger off bit0 for the special case of looking for NUL). Since the bits in the syndrome reflect exactly the order in which things occur in the original string a count_trailing_zeros() operation will identify exactly which byte is causing the termination, and why. */ /* Locals and temporaries. */ ENTRY (__strchr_aarch64) PTR_ARG (0) /* Magic constant 0xc0300c03 to allow us to identify which lane matches the requested byte. Even bits are set if the character matches, odd bits if either the char is NUL or matches. */ mov wtmp2, 0x0c03 movk wtmp2, 0xc030, lsl 16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome for all the bytes, but then mask off those bits of the syndrome that are related to the padding. 
*/ ld1 {vdata1.16b, vdata2.16b}, [src], #32 neg tmp1, tmp1 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b and vend1.16b, vhas_nul1.16b, vrepmask_c.16b and vend2.16b, vhas_nul2.16b, vrepmask_c.16b lsl tmp1, tmp1, #1 addp vend1.16b, vend1.16b, vend2.16b // 256->128 mov tmp3, #~0 addp vend1.16b, vend1.16b, vend2.16b // 128->64 lsr tmp1, tmp3, tmp1 mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. cbnz tmp1, L(tail) .p2align 4 L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b umaxp vend1.16b, vend1.16b, vend1.16b mov tmp1, vend1.d[0] cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b and vend1.16b, vhas_nul1.16b, vrepmask_c.16b and vend2.16b, vhas_nul2.16b, vrepmask_c.16b addp vend1.16b, vend1.16b, vend2.16b // 256->128 addp vend1.16b, vend1.16b, vend2.16b // 128->64 mov tmp1, vend1.d[0] L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ sub src, src, #32 clz tmp1, tmp1 /* And counting the leading zeros. */ /* Tmp1 is even if the target charager was found first. Otherwise we've found the end of string and we weren't looking for NUL. 
*/ tst tmp1, #1 add result, src, tmp1, lsr #1 csel result, result, xzr, eq ret END (__strchr_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S index 1b0d0a63094c..543ee88bb285 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S @@ -1,84 +1,85 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp1 x1 #define tmp2 x3 -#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vrepmask v4 -#define vend v5 -#define dend d5 +#define vend v4 +#define dend d4 -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ ENTRY (__strchrnul_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov tmp2w, 0xf00f - dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) rbit tmp1, tmp1 clz tmp1, tmp1 add result, srcin, tmp1, lsr 2 ret .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + add src, src, 16 fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 #endif clz tmp1, tmp1 add result, src, tmp1, lsr 2 ret END (__strchrnul_aarch64_mte) diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S index 428ff1a3d008..0005f9177514 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S @@ -1,9 +1,9 @@ /* * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL #include "strchr-sve.S" diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S index a4230d919b47..666e8d0304c1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S @@ -1,114 +1,114 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64 * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp1 x3 #define wtmp2 w4 #define tmp3 x5 #define vrepchr v0 #define vdata1 v1 #define vdata2 v2 #define vhas_nul1 v3 #define vhas_nul2 v4 #define vhas_chr1 v5 #define vhas_chr2 v6 #define vrepmask v7 #define vend1 v16 /* Core algorithm. For each 32-byte hunk we calculate a 64-bit syndrome value, with two bits per byte (LSB is always in bits 0 and 1, for both big and little-endian systems). For each tuple, bit 0 is set iff the relevant byte matched the requested character or nul. Since the bits in the syndrome reflect exactly the order in which things occur in the original string a count_trailing_zeros() operation will identify exactly which byte is causing the termination. */ /* Locals and temporaries. */ ENTRY (__strchrnul_aarch64) PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 movk wtmp2, #0x4010, lsl #16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. 
*/ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome for all the bytes, but then mask off those bits of the syndrome that are related to the padding. */ ld1 {vdata1.16b, vdata2.16b}, [src], #32 neg tmp1, tmp1 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b lsl tmp1, tmp1, #1 addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 mov tmp3, #~0 addp vend1.16b, vend1.16b, vend1.16b // 128->64 lsr tmp1, tmp3, tmp1 mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. cbnz tmp1, L(tail) .p2align 4 L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b umaxp vend1.16b, vend1.16b, vend1.16b mov tmp1, vend1.d[0] cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 addp vend1.16b, vend1.16b, vend1.16b // 128->64 mov tmp1, vend1.d[0] L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ sub src, src, #32 clz tmp1, tmp1 /* ... and counting the leading zeros. */ /* tmp1 is twice the offset into the fragment. 
*/ add result, src, tmp1, lsr #1 ret END (__strchrnul_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S index e6d2da5411ca..eaf909a378f1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S @@ -1,59 +1,59 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__strcmp_aarch64_sve) PTR_ARG (0) PTR_ARG (1) setffr /* initialize FFR */ ptrue p1.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ /* Read a vector's worth of bytes, stopping on first fault. */ .p2align 4 0: ldff1b z0.b, p1/z, [x0, x2] ldff1b z1.b, p1/z, [x1, x2] rdffrs p0.b, p1/z b.nlast 2f /* First fault did not fail: the whole vector is valid. Avoid depending on the contents of FFR beyond the branch. */ incb x2, all /* skip bytes for next round */ cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings */ cmpne p3.b, p1/z, z0.b, 0 /* search for ~zero */ nands p2.b, p1/z, p2.b, p3.b /* ~(eq & ~zero) -> ne | zero */ b.none 0b /* Found end-of-string or inequality. */ 1: brkb p2.b, p1/z, p2.b /* find first such */ lasta w0, p2, z0.b /* extract each char */ lasta w1, p2, z1.b sub x0, x0, x1 /* return comparison */ ret /* First fault failed: only some of the vector is valid. Perform the comparison only on the valid bytes. 
*/ 2: incp x2, p0.b /* skip bytes for next round */ setffr /* re-init FFR for next round */ cmpeq p2.b, p0/z, z0.b, z1.b /* compare strings, as above */ cmpne p3.b, p0/z, z0.b, 0 nands p2.b, p0/z, p2.b, p3.b b.none 0b b 1b END (__strcmp_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp.S b/contrib/arm-optimized-routines/string/aarch64/strcmp.S index 6e77845ae6ff..137a9aa06681 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcmp.S @@ -1,189 +1,189 @@ /* * strcmp - compare two strings * * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define src1 x0 #define src2 x1 #define result x0 #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 #define off1 x5 #define syndrome x6 #define tmp x6 #define data3 x7 #define zeroones x8 #define shift x9 #define off2 x10 /* On big-endian early bytes are at MSB and on little-endian LSB. LS_FW means shifting towards early bytes. */ #ifdef __AARCH64EB__ # define LS_FW lsl #else # define LS_FW lsr #endif /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. Since carry propagation makes 0x1 bytes before a NUL byte appear NUL too in big-endian, byte-reverse the data before the NUL check. 
*/ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) sub off2, src2, src1 mov zeroones, REP8_01 and tmp, src1, 7 tst off2, 7 b.ne L(misaligned8) cbnz tmp, L(mutual_align) .p2align 4 L(loop_aligned): ldr data2, [src1, off2] ldr data1, [src1], 8 L(start_realigned): #ifdef __AARCH64EB__ rev tmp, data1 sub has_nul, tmp, zeroones orr tmp, tmp, REP8_7f #else sub has_nul, data1, zeroones orr tmp, data1, REP8_7f #endif bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ ccmp data1, data2, 0, eq b.eq L(loop_aligned) #ifdef __AARCH64EB__ rev has_nul, has_nul #endif eor diff, data1, data2 orr syndrome, diff, has_nul L(end): #ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 rev data2, data2 #endif clz shift, syndrome /* The most-significant-non-zero bit of the syndrome marks either the first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ lsl data1, data1, shift lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, 56 sub result, data1, data2, lsr 56 ret .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. */ bic src1, src1, 7 ldr data2, [src1, off2] ldr data1, [src1], 8 neg shift, src2, lsl 3 /* Bits to alignment -64. */ mov tmp, -1 LS_FW tmp, tmp, shift orr data1, data1, tmp orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always checking to make sure that we don't access beyond the end of SRC2. */ cbz tmp, L(src1_aligned) L(do_misaligned): ldrb data1w, [src1], 1 ldrb data2w, [src2], 1 cmp data1w, 0 ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
*/ b.ne L(done) tst src1, 7 b.ne L(do_misaligned) L(src1_aligned): neg shift, src2, lsl 3 bic src2, src2, 7 ldr data3, [src2], 8 #ifdef __AARCH64EB__ rev data3, data3 #endif lsr tmp, zeroones, shift orr data3, data3, tmp sub has_nul, data3, zeroones orr tmp, data3, REP8_7f bics has_nul, has_nul, tmp b.ne L(tail) sub off1, src2, src1 .p2align 4 L(loop_unaligned): ldr data3, [src1, off1] ldr data2, [src1, off2] #ifdef __AARCH64EB__ rev data3, data3 #endif sub has_nul, data3, zeroones orr tmp, data3, REP8_7f ldr data1, [src1], 8 bics has_nul, has_nul, tmp ccmp data1, data2, 0, eq b.eq L(loop_unaligned) lsl tmp, has_nul, shift #ifdef __AARCH64EB__ rev tmp, tmp #endif eor diff, data1, data2 orr syndrome, diff, tmp cbnz syndrome, L(end) L(tail): ldr data1, [src1] neg shift, shift lsr data2, data3, shift lsr has_nul, has_nul, shift #ifdef __AARCH64EB__ rev data2, data2 rev has_nul, has_nul #endif eor diff, data1, data2 orr syndrome, diff, has_nul b L(end) L(done): sub result, data1, data2 ret END (__strcmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S index f515462e09ae..00e72dce4451 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S @@ -1,71 +1,71 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ /* To build as stpcpy, define BUILD_STPCPY before compiling this file. 
*/ #ifdef BUILD_STPCPY #define FUNC __stpcpy_aarch64_sve #else #define FUNC __strcpy_aarch64_sve #endif ENTRY (FUNC) PTR_ARG (0) PTR_ARG (1) setffr /* initialize FFR */ ptrue p2.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ .p2align 4 /* Read a vector's worth of bytes, stopping on first fault. */ 0: ldff1b z0.b, p2/z, [x1, x2] rdffrs p0.b, p2/z b.nlast 1f /* First fault did not fail: the whole vector is valid. Avoid depending on the contexts of FFR beyond the branch. */ cmpeq p1.b, p2/z, z0.b, 0 /* search for zeros */ b.any 2f /* No zero found. Store the whole vector and loop. */ st1b z0.b, p2, [x0, x2] incb x2, all b 0b /* First fault failed: only some of the vector is valid. Perform the comparison only on the valid bytes. */ 1: cmpeq p1.b, p0/z, z0.b, 0 /* search for zeros */ b.any 2f /* No zero found. Store the valid portion of the vector and loop. */ setffr /* re-init FFR */ st1b z0.b, p0, [x0, x2] incp x2, p0.b b 0b /* Zero found. Crop the vector to the found zero and finish. */ 2: brka p0.b, p2/z, p1.b st1b z0.b, p0, [x0, x2] #ifdef BUILD_STPCPY add x0, x0, x2 sub x0, x0, 1 incp x0, p0.b #endif ret END (FUNC) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy.S b/contrib/arm-optimized-routines/string/aarch64/strcpy.S index b99e49403be8..97ae37ea4229 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcpy.S @@ -1,161 +1,156 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define srcin x1 #define result x0 #define src x2 #define dst x3 #define len x4 #define synd x4 #define tmp x5 -#define wtmp w5 #define shift x5 #define data1 x6 #define dataw1 w6 #define data2 x7 #define dataw2 w7 #define dataq q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 #define dataq2 q1 #ifdef BUILD_STPCPY # define STRCPY __stpcpy_aarch64 # define IFSTPCPY(X,...) X,__VA_ARGS__ #else # define STRCPY __strcpy_aarch64 # define IFSTPCPY(X,...) #endif -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend lsr synd, synd, shift cbnz synd, L(tail) ldr dataq, [src, 16]! 
cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend cbz synd, L(start_loop) #ifndef __AARCH64EB__ rbit synd, synd #endif sub tmp, src, srcin clz len, synd add len, tmp, len, lsr 2 tbz len, 4, L(less16) sub tmp, len, 15 ldr dataq, [srcin] ldr dataq2, [srcin, tmp] str dataq, [dstin] str dataq2, [dstin, tmp] IFSTPCPY (add result, dstin, len) ret - .p2align 4,,8 L(tail): rbit synd, synd clz len, synd lsr len, len, 2 - - .p2align 4 L(less16): tbz len, 3, L(less8) sub tmp, len, 7 ldr data1, [srcin] ldr data2, [srcin, tmp] str data1, [dstin] str data2, [dstin, tmp] IFSTPCPY (add result, dstin, len) ret .p2align 4 L(less8): subs tmp, len, 3 b.lo L(less4) ldr dataw1, [srcin] ldr dataw2, [srcin, tmp] str dataw1, [dstin] str dataw2, [dstin, tmp] IFSTPCPY (add result, dstin, len) ret L(less4): cbz len, L(zerobyte) ldrh dataw1, [srcin] strh dataw1, [dstin] L(zerobyte): strb wzr, [dstin, len] IFSTPCPY (add result, dstin, len) ret .p2align 4 L(start_loop): - sub len, src, srcin + sub tmp, srcin, dstin ldr dataq2, [srcin] - add dst, dstin, len + sub dst, src, tmp str dataq2, [dstin] - - .p2align 5 L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! 
cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend + sub dst, dst, 31 #ifndef __AARCH64EB__ rbit synd, synd #endif clz len, synd lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) ret END (STRCPY) diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S index 7cf41d5c1eac..77235797f7c5 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S @@ -1,80 +1,77 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. 
Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) rbit synd, synd clz result, synd lsr result, result, 2 ret .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub src, src, 16 +L(loop_end): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret END (__strlen_aarch64_mte) diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S index 2392493f1a3c..12ebbdba5c93 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S @@ -1,55 +1,55 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__strlen_aarch64_sve) PTR_ARG (0) setffr /* initialize FFR */ ptrue p2.b /* all ones; loop invariant */ mov x1, 0 /* initialize length */ /* Read a vector's worth of bytes, stopping on first fault. */ .p2align 4 0: ldff1b z0.b, p2/z, [x0, x1] rdffrs p0.b, p2/z b.nlast 2f /* First fault did not fail: the whole vector is valid. Avoid depending on the contents of FFR beyond the branch. */ incb x1, all /* speculate increment */ cmpeq p1.b, p2/z, z0.b, 0 /* loop if no zeros */ b.none 0b decb x1, all /* undo speculate */ /* Zero found. Select the bytes before the first and count them. */ 1: brkb p0.b, p2/z, p1.b incp x1, p0.b mov x0, x1 ret /* First fault failed: only some of the vector is valid. Perform the comparison only on the valid bytes. */ 2: cmpeq p1.b, p0/z, z0.b, 0 b.any 1b /* No zero found. Re-init FFR, increment, and loop. */ setffr incp x1, p0.b b 0b END (__strlen_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen.S b/contrib/arm-optimized-routines/string/aarch64/strlen.S index a1b164a49238..6f6f08f636b2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen.S @@ -1,200 +1,193 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * Not MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 #define src x1 #define data1 x2 #define data2 x3 #define has_nul1 x4 #define has_nul2 x5 #define tmp1 x4 #define tmp2 x5 #define tmp3 x6 #define tmp4 x7 #define zeroones x8 #define maskv v0 #define maskd d0 #define dataq1 q1 #define dataq2 q2 #define datav1 v1 #define datav2 v2 #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a byte is zero, and can be done in parallel across the entire word. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f /* To test the page crossing code path more thoroughly, compile with -DTEST_PAGE_CROSS - this will force all calls through the slower entry path. This option is not intended for production use. */ #ifdef TEST_PAGE_CROSS # define MIN_PAGE_SIZE 32 #else # define MIN_PAGE_SIZE 4096 #endif /* Core algorithm: Since strings are short on average, we check the first 32 bytes of the string for a NUL character without aligning the string. In order to use unaligned loads safely we must do a page cross check first. If there is a NUL byte we calculate the length from the 2 8-byte words using conditional select to reduce branch mispredictions (it is unlikely strlen will be repeatedly called on strings with the same length). If the string is longer than 32 bytes, align src so we don't need further page cross checks, and process 32 bytes per iteration using a fast SIMD loop. If the page cross check fails, we read 32 bytes from an aligned address, and ignore any characters before the string. If it contains a NUL character, return the length, if not, continue in the main loop. */ ENTRY (__strlen_aarch64) PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 cmp tmp1, MIN_PAGE_SIZE - 32 b.hi L(page_cross) /* Look for a NUL byte in the first 16 bytes. 
*/ ldp data1, data2, [srcin] mov zeroones, REP8_01 #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul1/2 directly. Since we expect strings to be small and early-exit, byte-swap the data now so has_null1/2 will be correct. */ rev data1, data1 rev data2, data2 #endif sub tmp1, data1, zeroones orr tmp2, data1, REP8_7f sub tmp3, data2, zeroones orr tmp4, data2, REP8_7f bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq b.eq L(bytes16_31) /* Find the exact offset of the first NUL byte in the first 16 bytes from the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc mov len, 8 rev has_nul1, has_nul1 csel len, xzr, len, cc clz tmp1, has_nul1 add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] #ifdef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif sub tmp1, data1, zeroones orr tmp2, data1, REP8_7f sub tmp3, data2, zeroones orr tmp4, data2, REP8_7f bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq b.eq L(loop_entry) /* Find the exact offset of the first NUL byte at offset 16..31 from the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc mov len, 24 rev has_nul1, has_nul1 mov tmp3, 16 clz tmp1, has_nul1 csel len, tmp3, len, cc add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 .p2align 5 L(loop): ldp dataq1, dataq2, [src, 32]! uminp maskv.16b, datav1.16b, datav2.16b uminp maskv.16b, maskv.16b, maskv.16b cmeq maskv.8b, maskv.8b, 0 fmov synd, maskd cbz synd, L(loop) /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. 
*/ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd #endif clz tmp, synd add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 movk tmpw, 0xc030, lsl 16 ld1 {datav1.16b, datav2.16b}, [src] dup maskv.4s, tmpw cmeq datav1.16b, datav1.16b, 0 cmeq datav2.16b, datav2.16b, 0 and datav1.16b, datav1.16b, maskv.16b and datav2.16b, datav2.16b, maskv.16b addp maskv.16b, datav1.16b, datav2.16b addp maskv.16b, maskv.16b, maskv.16b fmov synd, maskd lsl shift, srcin, 1 lsr synd, synd, shift cbz synd, L(loop) rbit synd, synd clz len, synd lsr len, len, 1 ret END (__strlen_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S index 234190e245b0..6a9e9f7b6437 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S @@ -1,69 +1,69 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__strncmp_aarch64_sve) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) setffr /* initialize FFR */ mov x3, 0 /* initialize off */ 0: whilelo p0.b, x3, x2 /* while off < max */ b.none 9f ldff1b z0.b, p0/z, [x0, x3] ldff1b z1.b, p0/z, [x1, x3] rdffrs p1.b, p0/z b.nlast 2f /* First fault did not fail: the vector up to max is valid. Avoid depending on the contents of FFR beyond the branch. Increment for a whole vector, even if we've only read a partial. 
This is significantly cheaper than INCP, and since OFF is not used after the loop it is ok to increment OFF past MAX. */ incb x3 cmpeq p1.b, p0/z, z0.b, z1.b /* compare strings */ cmpne p2.b, p0/z, z0.b, 0 /* search for ~zero */ nands p2.b, p0/z, p1.b, p2.b /* ~(eq & ~zero) -> ne | zero */ b.none 0b /* Found end-of-string or inequality. */ 1: brkb p2.b, p0/z, p2.b /* find first such */ lasta w0, p2, z0.b /* extract each char */ lasta w1, p2, z1.b sub x0, x0, x1 /* return comparison */ ret /* First fault failed: only some of the vector is valid. Perform the comparison only on the valid bytes. */ 2: cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings, as above */ cmpne p3.b, p1/z, z0.b, 0 nands p2.b, p1/z, p2.b, p3.b b.any 1b /* No inequality or zero found. Re-init FFR, incr and loop. */ setffr incp x3, p1.b b 0b /* Found end-of-count. */ 9: mov x0, 0 /* return equal */ ret END (__strncmp_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp.S b/contrib/arm-optimized-routines/string/aarch64/strncmp.S index 7e636b4a593d..128a10c52bb1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/strncmp.S @@ -1,308 +1,308 @@ /* * strncmp - compare two strings * * Copyright (c) 2013-2022, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 #define result x0 /* Internal variables. 
*/ #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 #define has_nul x5 #define diff x6 #define syndrome x7 #define tmp1 x8 #define tmp2 x9 #define tmp3 x10 #define zeroones x11 #define pos x12 #define mask x13 #define endloop x14 #define count mask #define offset pos #define neg_offset x15 /* Define endian dependent shift operations. On big-endian early bytes are at MSB and on little-endian LSB. LS_FW means shifting towards early bytes. LS_BK means shifting towards later bytes. */ #ifdef __AARCH64EB__ #define LS_FW lsl #define LS_BK lsr #else #define LS_FW lsr #define LS_BK lsl #endif ENTRY (__strncmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ L(full_check): #ifndef __AARCH64EB__ orr syndrome, diff, has_nul add limit, limit, 8 /* Rewind limit to before last subs. */ L(syndrome_check): /* Limit was reached. Check if the NUL byte or the difference is before the limit. */ rev syndrome, syndrome rev data1, data1 clz pos, syndrome rev data2, data2 lsl data1, data1, pos cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. 
*/ lsr data1, data1, #56 sub result, data1, data2, lsr #56 csel result, result, xzr, hi ret #else /* Not reached the limit, must have found the end or a diff. */ tbz limit, #63, L(not_limit) add tmp1, limit, 8 cbz limit, L(not_limit) lsl limit, tmp1, #3 /* Bits -> bytes. */ mov mask, #~0 lsr mask, mask, limit bic data1, data1, mask bic data2, data2, mask /* Make sure that the NUL byte is marked in the syndrome. */ orr has_nul, has_nul, mask L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ /* However, if there is no NUL byte in the dword, we can generate the result directly. We can't just subtract the bytes as the MSB might be significant. */ cbnz has_nul, 1f cmp data1, data2 cset result, ne cneg result, result, lo ret 1: /* Re-compute the NUL-byte detection, using a byte-reversed value. */ rev tmp3, data1 sub tmp1, tmp3, zeroones orr tmp2, tmp3, #REP8_7f bic has_nul, tmp1, tmp2 rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome /* The most-significant-non-zero bit of the syndrome marks either the first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 ret #endif L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. We also need to adjust the limit calculations, but without overflowing if the limit is near ULONG_MAX. */ bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). 
*/ ldr data2, [src2], #8 mov tmp2, #~0 LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ /* Adjust the limit and ensure it doesn't overflow. */ adds limit, limit, count csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 b L(start_realigned) .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 b.hs L(try_misaligned_words) L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ b.eq L(byte_loop) L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ b.ne L(done) subs count, count, #1 b.hi L(page_end_loop) /* The following diagram explains the comparison of misaligned strings. The bytes are shown in natural order. For little-endian, it is reversed in the registers. The "x" bytes are before the string. The "|" separates data that is loaded at one time. src1 | a a a a a a a a | b b b c c c c c | . . . src2 | x x x x x a a a a a a a a b b b | c c c c c . . . After shifting in each step, the data looks like this: STEP_A STEP_B STEP_C data1 a a a a a a a a b b b c c c c c b b b c c c c c data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c The bytes with "0" are eliminated from the syndrome via mask. Align SRC2 down to 16 bytes. This way we can read 16 bytes at a time from SRC2. The comparison happens in 3 steps. After each step the loop can exit, or read from SRC1 or SRC2. */ L(src1_aligned): /* Calculate offset from 8 byte alignment to string start in bits. No need to mask offset since shifts are ignoring upper bits. 
*/ lsl offset, src2, #3 bic src2, src2, #0xf mov mask, -1 neg neg_offset, offset ldr data1, [src1], #8 ldp tmp1, tmp2, [src2], #16 LS_BK mask, mask, neg_offset and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ /* Skip the first compare if data in tmp1 is irrelevant. */ tbnz offset, 6, L(misaligned_mid_loop) L(loop_misaligned): /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ LS_FW data2, tmp1, offset LS_BK tmp1, tmp2, neg_offset subs limit, limit, #8 orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ orr tmp3, data1, #REP8_7f csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ orr tmp3, endloop, has_nul cbnz tmp3, L(full_check) ldr data1, [src1], #8 L(misaligned_mid_loop): /* STEP_B: Compare first part of data1 to second part of tmp2. */ LS_FW data2, tmp2, offset #ifdef __AARCH64EB__ /* For big-endian we do a byte reverse to avoid carry-propagation problem described above. This way we can reuse the has_nul in the next step and also use syndrome value trick at the end. */ rev tmp3, data1 #define data1_fixed tmp3 #else #define data1_fixed data1 #endif sub has_nul, data1_fixed, zeroones orr tmp3, data1_fixed, #REP8_7f eor diff, data2, data1 /* Non-zero if differences found. */ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ #ifdef __AARCH64EB__ rev has_nul, has_nul #endif cmp limit, neg_offset, lsr #3 orr syndrome, diff, has_nul bic syndrome, syndrome, mask /* Ignore later bytes. */ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ cbnz tmp3, L(syndrome_check) /* STEP_C: Compare second part of data1 to first part of tmp1. */ ldp tmp1, tmp2, [src2], #16 cmp limit, #8 LS_BK data2, tmp1, neg_offset eor diff, data2, data1 /* Non-zero if differences found. 
*/ orr syndrome, diff, has_nul and syndrome, syndrome, mask /* Ignore earlier bytes. */ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ cbnz tmp3, L(syndrome_check) ldr data1, [src1], #8 sub limit, limit, #8 b L(loop_misaligned) #ifdef __AARCH64EB__ L(syndrome_check): clz pos, syndrome cmp pos, limit, lsl #3 b.lo L(end_quick) #endif L(ret0): mov result, #0 ret END(__strncmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S index 5b9ebf7763bc..6c43dc427da7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S @@ -1,74 +1,74 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__strnlen_aarch64_sve) PTR_ARG (0) SIZE_ARG (1) setffr /* initialize FFR */ mov x2, 0 /* initialize len */ b 1f .p2align 4 /* We have off + vl <= max, and so may read the whole vector. */ 0: ldff1b z0.b, p0/z, [x0, x2] rdffrs p1.b, p0/z b.nlast 2f /* First fault did not fail: the whole vector is valid. Avoid depending on the contents of FFR beyond the branch. */ cmpeq p2.b, p0/z, z0.b, 0 b.any 8f incb x2 1: whilelo p0.b, x2, x1 b.last 0b /* We have off + vl < max. Test for off == max before proceeding. */ b.none 9f ldff1b z0.b, p0/z, [x0, x2] rdffrs p1.b, p0/z b.nlast 2f /* First fault did not fail: the vector up to max is valid. Avoid depending on the contents of FFR beyond the branch. Compare for end-of-string, but there are no more bytes. */ cmpeq p2.b, p0/z, z0.b, 0 /* Found end-of-string or zero. 
*/ 8: brkb p2.b, p0/z, p2.b mov x0, x2 incp x0, p2.b ret /* First fault failed: only some of the vector is valid. Perform the comparison only on the valid bytes. */ 2: cmpeq p2.b, p1/z, z0.b, 0 b.any 8b /* No inequality or zero found. Re-init FFR, incr and loop. */ setffr incp x2, p1.b b 1b /* End of count. Return max. */ 9: mov x0, x1 ret END (__strnlen_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen.S b/contrib/arm-optimized-routines/string/aarch64/strnlen.S index 48d2495d2082..f2090a7485a5 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strnlen.S +++ b/contrib/arm-optimized-routines/string/aarch64/strnlen.S @@ -1,112 +1,102 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 #define result x0 #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. 
A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) L(finish): rbit synd, synd clz synd, synd lsr result, synd, 2 cmp cntin, result csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! 
cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif clz synd, synd add result, result, synd, lsr 2 cmp cntin, result csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S index 1e4fb1a68f7e..bb61ab9ad4e7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S @@ -1,127 +1,137 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 #define nul_match x5 #define chr_match x6 #define vrepchr v0 #define vdata v1 #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 /* Core algorithm. 
For each 16-byte chunk we calculate a 64-bit syndrome value, with four bits per byte (LSB is always in bits 0 and 1, for both big and little-endian systems). For each tuple, bits 0-1 are set if the relevant byte matched the requested character; bits 2-3 are set if the relevant byte matched the NUL end of string. */ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! 
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne csel chr_match, synd, chr_match, ne ld1 {vdata.16b}, [src], 16 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend tst synd, 0xcccccccccccccccc beq L(loop2) bic vhas_nul.8h, 0x0f, lsl 8 addp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend and nul_match, synd, 0xcccccccccccccccc sub nul_match, nul_match, 1 and tmp, synd, 0x3333333333333333 ands tmp, tmp, nul_match csel chr_match, tmp, chr_match, ne csel src_match, src, src_match, ne sub src_match, src_match, 1 clz tmp, chr_match sub result, src_match, tmp, lsr 2 ret END (__strrchr_aarch64_mte) diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S index d36d69af37fd..825a7384cfc1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S +++ 
b/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S @@ -1,84 +1,84 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 * SVE Available. */ ENTRY (__strrchr_aarch64_sve) PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ mov x2, 0 /* no match found so far */ pfalse p2.b .p2align 4 /* Read a vector's worth of bytes, stopping on first fault. */ 0: ldff1b z0.b, p1/z, [x0, xzr] rdffrs p0.b, p1/z b.nlast 1f /* First fault did not fail: the whole vector is valid. Avoid depending on the contents of FFR beyond the branch. */ incb x0, all /* skip bytes this round */ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */ b.any 3f cmpeq p3.b, p1/z, z0.b, z1.b /* search for c; no eos */ b.none 0b mov x2, x0 /* save advanced base */ mov p2.b, p3.b /* save current search */ b 0b /* First fault failed: only some of the vector is valid. Perform the comparisions only on the valid bytes. */ 1: cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */ b.any 2f cmpeq p3.b, p0/z, z0.b, z1.b /* search for c; no eos */ mov x3, x0 incp x0, p0.b /* skip bytes this round */ setffr /* re-init FFR */ b.none 0b addvl x2, x3, 1 /* save advanced base */ mov p2.b, p3.b /* save current search */ b 0b /* Found end-of-string. */ 2: incb x0, all /* advance base */ 3: brka p3.b, p1/z, p3.b /* mask after first 0 */ cmpeq p3.b, p3/z, z0.b, z1.b /* search for c not after eos */ b.any 4f /* No C within last vector. Did we have one before? */ cbz x2, 5f mov x0, x2 /* restore advanced base */ mov p3.b, p2.b /* restore saved search */ /* Find the *last* match in the predicate. This is slightly more complicated than finding the first match. 
*/ 4: rev p3.b, p3.b /* reverse the bits */ brka p3.b, p1/z, p3.b /* find position of last match */ decp x0, p3.b /* retard pointer to last match */ ret /* No C whatsoever. Return NULL. */ 5: mov x0, 0 ret END (__strrchr_aarch64_sve) #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr.S b/contrib/arm-optimized-routines/string/aarch64/strrchr.S index 56185ff534e3..bf9cb297b6cb 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr.S @@ -1,149 +1,149 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * * ARMv8-a, AArch64 * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp1 x3 #define wtmp2 w4 #define tmp3 x5 #define src_match x6 #define src_offset x7 #define const_m1 x8 #define tmp4 x9 #define nul_match x10 #define chr_match x11 #define vrepchr v0 #define vdata1 v1 #define vdata2 v2 #define vhas_nul1 v3 #define vhas_nul2 v4 #define vhas_chr1 v5 #define vhas_chr2 v6 #define vrepmask_0 v7 #define vrepmask_c v16 #define vend1 v17 #define vend2 v18 /* Core algorithm. For each 32-byte hunk we calculate a 64-bit syndrome value, with two bits per byte (LSB is always in bits 0 and 1, for both big and little-endian systems). For each tuple, bit 0 is set iff the relevant byte matched the requested character; bit 1 is set iff the relevant byte matched the NUL end of string (we trigger off bit0 for the special case of looking for NUL). Since the bits in the syndrome reflect exactly the order in which things occur in the original string a count_trailing_zeros() operation will identify exactly which byte is causing the termination, and why. 
*/ ENTRY (__strrchr_aarch64) PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ mov wtmp2, #0x0401 movk wtmp2, #0x4010, lsl #16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask_c.4s, wtmp2 mov src_offset, #0 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ b.eq L(aligned) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome for all the bytes, but then mask off those bits of the syndrome that are related to the padding. */ ld1 {vdata1.16b, vdata2.16b}, [src], #32 neg tmp1, tmp1 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 mov nul_match, vend1.d[0] lsl tmp1, tmp1, #1 mov const_m1, #~0 lsr tmp3, const_m1, tmp1 mov chr_match, vend1.d[1] bic nul_match, nul_match, tmp3 // Mask padding bits. bic chr_match, chr_match, tmp3 // Mask padding bits. 
cbnz nul_match, L(tail) .p2align 4 L(loop): cmp chr_match, #0 csel src_match, src, src_match, ne csel src_offset, chr_match, src_offset, ne L(aligned): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b uminp vend1.16b, vdata1.16b, vdata2.16b and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b cmeq vend1.16b, vend1.16b, 0 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 mov nul_match, vend1.d[0] mov chr_match, vend1.d[1] cbz nul_match, L(loop) cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_nul2.16b, vdata2.16b, #0 and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b mov nul_match, vhas_nul1.d[0] L(tail): /* Work out exactly where the string ends. */ sub tmp4, nul_match, #1 eor tmp4, tmp4, nul_match ands chr_match, chr_match, tmp4 /* And pick the values corresponding to the last match. */ csel src_match, src, src_match, ne csel src_offset, chr_match, src_offset, ne /* Count down from the top of the syndrome to find the last match. */ clz tmp3, src_offset /* Src_match points beyond the word containing the match, so we can simply subtract half the bit-offset into the syndrome. Because we are counting down, we need to go back one more character. */ add tmp3, tmp3, #2 sub result, src_match, tmp3, lsr #1 /* But if the syndrome shows no match was found, then return NULL. */ cmp src_offset, #0 csel result, result, xzr, ne ret END (__strrchr_aarch64) diff --git a/contrib/arm-optimized-routines/string/arm/asmdefs.h b/contrib/arm-optimized-routines/string/arm/asmdefs.h new file mode 100644 index 000000000000..e31188804716 --- /dev/null +++ b/contrib/arm-optimized-routines/string/arm/asmdefs.h @@ -0,0 +1,477 @@ +/* + * Macros for asm code. Arm version. 
+ * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Check whether leaf function PAC signing has been requested in the + -mbranch-protect compile-time option. */ +#define LEAF_PROTECT_BIT 2 + +#ifdef __ARM_FEATURE_PAC_DEFAULT +# define HAVE_PAC_LEAF \ + ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1) +#else +# define HAVE_PAC_LEAF 0 +#endif + +/* Provide default parameters for PAC-code handling in leaf-functions. */ +#if HAVE_PAC_LEAF +# ifndef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 1 +# endif +#else /* !HAVE_PAC_LEAF */ +# undef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 0 +#endif /* HAVE_PAC_LEAF */ + +#define STACK_ALIGN_ENFORCE 0 + +/****************************************************************************** +* Implementation of the prologue and epilogue assembler macros and their +* associated helper functions. +* +* These functions add support for the following: +* +* - M-profile branch target identification (BTI) landing-pads when compiled +* with `-mbranch-protection=bti'. +* - PAC-signing and verification instructions, depending on hardware support +* and whether the PAC-signing of leaf functions has been requested via the +* `-mbranch-protection=pac-ret+leaf' compiler argument. +* - 8-byte stack alignment preservation at function entry, defaulting to the +* value of STACK_ALIGN_ENFORCE. +* +* Notes: +* - Prologue stack alignment is implemented by detecting a push with an odd +* number of registers and prepending a dummy register to the list. +* - If alignment is attempted on a list containing r0, compilation will result +* in an error. +* - If alignment is attempted in a list containing r1, r0 will be prepended to +* the register list and r0 will be restored prior to function return. for +* functions with non-void return types, this will result in the corruption of +* the result register. 
+* - Stack alignment is enforced via the following helper macro call-chain: +* +* {prologue|epilogue} ->_align8 -> _preprocess_reglist -> +* _preprocess_reglist1 -> {_prologue|_epilogue} +* +* - Debug CFI directives are automatically added to prologues and epilogues, +* assisted by `cfisavelist' and `cfirestorelist', respectively. +* +* Arguments: +* prologue +* -------- +* - first - If `last' specified, this serves as start of general-purpose +* register (GPR) range to push onto stack, otherwise represents +* single GPR to push onto stack. If omitted, no GPRs pushed +* onto stack at prologue. +* - last - If given, specifies inclusive upper-bound of GPR range. +* - push_ip - Determines whether IP register is to be pushed to stack at +* prologue. When pac-signing is requested, this holds the +* the pac-key. Either 1 or 0 to push or not push, respectively. +* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro. +* - push_lr - Determines whether to push lr to the stack on function entry. +* Either 1 or 0 to push or not push, respectively. +* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting +* alignment. +* +* epilogue +* -------- +* The epilogue should be called passing the same arguments as those passed to +* the prologue to ensure the stack is not corrupted on function return. +* +* Usage examples: +* +* prologue push_ip=1 -> push {ip} +* epilogue push_ip=1, align8=1 -> pop {r2, ip} +* prologue push_ip=1, push_lr=1 -> push {ip, lr} +* epilogue 1 -> pop {r1} +* prologue 1, align8=1 -> push {r0, r1} +* epilogue 1, push_ip=1 -> pop {r1, ip} +* prologue 1, 4 -> push {r1-r4} +* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip} +* +******************************************************************************/ + +/* Emit .cfi_restore directives for a consecutive sequence of registers. 
*/ + .macro cfirestorelist first, last + .cfi_restore \last + .if \last-\first + cfirestorelist \first, \last-1 + .endif + .endm + +/* Emit .cfi_offset directives for a consecutive sequence of registers. */ + .macro cfisavelist first, last, index=1 + .cfi_offset \last, -4*(\index) + .if \last-\first + cfisavelist \first, \last-1, \index+1 + .endif + .endm + +.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _prologue \first, \first, \push_ip, \push_lr + .exitm + .endif + .endif +#if HAVE_PAC_LEAF +# if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +# else + pac ip, lr, sp +# endif /* __ARM_FEATURE_BTI_DEFAULT */ + .cfi_register 143, 12 +#else +# if __ARM_FEATURE_BTI_DEFAULT + bti +# endif /* __ARM_FEATURE_BTI_DEFAULT */ +#endif /* HAVE_PAC_LEAF */ + .if \first != -1 + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: push register range, ip and lr registers. */ + push {r\first-r\last, ip, lr} + .cfi_adjust_cfa_offset ((\last-\first)+3)*4 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \last, 3 + .else // !\push_lr + /* Case 2: push register range and ip register. */ + push {r\first-r\last, ip} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 143, -4 + cfisavelist \first, \last, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: push register range and lr register. */ + push {r\first-r\last, lr} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 14, -4 + cfisavelist \first, \last, 2 + .else // !\push_lr + /* Case 4: push register range. 
*/ + push {r\first-r\last} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + cfisavelist \first, \last, 1 + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: push single GP register plus ip and lr registers. */ + push {r\first, ip, lr} + .cfi_adjust_cfa_offset 12 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \first, 3 + .else // !\push_lr + /* Case 6: push single GP register plus ip register. */ + push {r\first, ip} + .cfi_adjust_cfa_offset 8 + .cfi_offset 143, -4 + cfisavelist \first, \first, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: push single GP register plus lr register. */ + push {r\first, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + cfisavelist \first, \first, 2 + .else // !\push_lr + /* Case 8: push single GP register. */ + push {r\first} + .cfi_adjust_cfa_offset 4 + cfisavelist \first, \first, 1 + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: push ip and lr registers. */ + push {ip, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + .else // !\push_lr + /* Case 10: push ip register. */ + push {ip} + .cfi_adjust_cfa_offset 4 + .cfi_offset 143, -4 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: push lr register. */ + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_offset 14, -4 + .endif + .endif + .endif +.endm + +.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _epilogue \first, \first, \push_ip, \push_lr + .exitm + .endif + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: pop register range, ip and lr registers. 
*/ + pop {r\first-r\last, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 2: pop register range and ip register. */ + pop {r\first-r\last, ip} + .cfi_register 143, 12 + cfirestorelist \first, \last + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: pop register range and lr register. */ + pop {r\first-r\last, lr} + .cfi_restore 14 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 4: pop register range. */ + pop {r\first-r\last} + cfirestorelist \first, \last + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: pop single GP register plus ip and lr registers. */ + pop {r\first, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 6: pop single GP register plus ip register. */ + pop {r\first, ip} + .cfi_register 143, 12 + cfirestorelist \first, \first + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: pop single GP register plus lr register. */ + pop {r\first, lr} + .cfi_restore 14 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 8: pop single GP register. */ + pop {r\first} + cfirestorelist \first, \first + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: pop ip and lr registers. */ + pop {ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + .else // !\push_lr + /* Case 10: pop ip register. */ + pop {ip} + .cfi_register 143, 12 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: pop lr register. */ + pop {lr} + .cfi_restore 14 + .endif + .endif + .endif +#if HAVE_PAC_LEAF + aut ip, lr, sp +#endif /* HAVE_PAC_LEAF */ + bx lr +.endm + +/* Clean up expressions in 'last'. 
*/ +.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req + .if \last == 0 + \reglist_op \first, 0, \push_ip, \push_lr + .elseif \last == 1 + \reglist_op \first, 1, \push_ip, \push_lr + .elseif \last == 2 + \reglist_op \first, 2, \push_ip, \push_lr + .elseif \last == 3 + \reglist_op \first, 3, \push_ip, \push_lr + .elseif \last == 4 + \reglist_op \first, 4, \push_ip, \push_lr + .elseif \last == 5 + \reglist_op \first, 5, \push_ip, \push_lr + .elseif \last == 6 + \reglist_op \first, 6, \push_ip, \push_lr + .elseif \last == 7 + \reglist_op \first, 7, \push_ip, \push_lr + .elseif \last == 8 + \reglist_op \first, 8, \push_ip, \push_lr + .elseif \last == 9 + \reglist_op \first, 9, \push_ip, \push_lr + .elseif \last == 10 + \reglist_op \first, 10, \push_ip, \push_lr + .elseif \last == 11 + \reglist_op \first, 11, \push_ip, \push_lr + .else + .error "last (\last) out of range" + .endif +.endm + +/* Clean up expressions in 'first'. */ +.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req + .ifb \last + _preprocess_reglist \first \first \push_ip \push_lr + .else + .if \first > \last + .error "last (\last) must be at least as great as first (\first)" + .endif + .if \first == 0 + _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 1 + _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 2 + _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 3 + _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 4 + _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 5 + _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 6 + _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 7 + _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 8 + _preprocess_reglist1 8, \last, \push_ip, \push_lr, 
\reglist_op + .elseif \first == 9 + _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 10 + _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 11 + _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op + .else + .error "first (\first) out of range" + .endif + .endif +.endm + +.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue + .ifb \first + .ifnb \last + .error "can't have last (\last) without specifying first" + .else // \last not blank + .if ((\push_ip + \push_lr) % 2) == 0 + \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr + .exitm + .else // ((\push_ip + \push_lr) % 2) odd + _align8 2, 2, \push_ip, \push_lr, \reglist_op + .exitm + .endif // ((\push_ip + \push_lr) % 2) == 0 + .endif // .ifnb \last + .endif // .ifb \first + + .ifb \last + _align8 \first, \first, \push_ip, \push_lr, \reglist_op + .else + .if \push_ip & 1 <> \push_ip + .error "push_ip may be 0 or 1" + .endif + .if \push_lr & 1 <> \push_lr + .error "push_lr may be 0 or 1" + .endif + .ifeq (\last - \first + \push_ip + \push_lr) % 2 + .if \first == 0 + .error "Alignment required and first register is r0" + .exitm + .endif + _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op + .else + _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op + .endif + .endif +.endm + +.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, _prologue + .else + _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue + .else + _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type 
name,%function; \ + .align alignment; \ + name: \ + .fnstart; \ + .cfi_startproc; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#if defined (IS_LEAF) +# define END_UNWIND .cantunwind; +#else +# define END_UNWIND +#endif + +#define END(name) \ + .cfi_endproc; \ + END_UNWIND \ + .fnend; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/contrib/arm-optimized-routines/string/arm/check-arch.S b/contrib/arm-optimized-routines/string/arm/check-arch.S index 1cff9345e343..95516710fb85 100644 --- a/contrib/arm-optimized-routines/string/arm/check-arch.S +++ b/contrib/arm-optimized-routines/string/arm/check-arch.S @@ -1,10 +1,13 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__arm__ # error ARCH setting does not match the compiler. #endif + +/* For attributes that may affect ABI. */ +#include "asmdefs.h" diff --git a/contrib/arm-optimized-routines/string/arm/memchr.S b/contrib/arm-optimized-routines/string/arm/memchr.S index 3f1ac4df136f..823d6013eb35 100644 --- a/contrib/arm-optimized-routines/string/arm/memchr.S +++ b/contrib/arm-optimized-routines/string/arm/memchr.S @@ -1,132 +1,168 @@ /* * memchr - scan memory for a character * - * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2010-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Written by Dave Gilbert This __memchr_arm routine is optimised on a Cortex-A9 and should work on all ARMv7 processors. It has a fast past for short sizes, and has an optimised path for large data sets; the worst case is finding the match early in a large data set. 
*/ @ 2011-02-07 david.gilbert@linaro.org @ Extracted from local git a5b438d861 @ 2011-07-14 david.gilbert@linaro.org @ Import endianness fix from local git ea786f1b @ 2011-12-07 david.gilbert@linaro.org @ Removed unneeded cbz from align loop .syntax unified +#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M' + /* keep config inherited from -march= */ +#else .arch armv7-a +#endif @ this lets us check a flag in a 00/ff byte easily in either endianness #ifdef __ARMEB__ #define CHARTSTMASK(c) 1<<(31-(c*8)) #else #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb +#include "asmdefs.h" + @ --------------------------------------------------------------------------- .thumb_func .align 2 .p2align 4,,15 .global __memchr_arm .type __memchr_arm,%function + .fnstart + .cfi_startproc __memchr_arm: @ r0 = start of memory to scan @ r1 = character to look for @ r2 = length @ returns r0 = pointer to character or NULL if not found + prologue and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char cmp r2,#16 @ If it's short don't bother with anything clever blt 20f tst r0, #7 @ If it's already aligned skip the next bit beq 10f @ Work up to an aligned point 5: ldrb r3, [r0],#1 subs r2, r2, #1 cmp r3, r1 beq 50f @ If it matches exit found tst r0, #7 bne 5b @ If not aligned yet then do next byte 10: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 bic r4, r2, #7 @ Number of double words to work with mvns r7, #0 @ all F's movs r3, #0 15: ldmia r0!,{r5,r6} subs r4, r4, #8 eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target eor r6,r6, r1 uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 sel r5, r3, r7 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE 
INVERSION uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION cbnz r6, 60f bne 15b @ (Flags from the subs above) If not run out of bytes then go around again pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 and r1,r1,#0xff @ Get r1 back to a single character from the expansion above and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done 20: cbz r2, 40f @ 0 length or hit the end already then not found 21: @ Post aligned section, or just a short call ldrb r3,[r0],#1 subs r2,r2,#1 eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub cbz r3, 50f bne 21b @ on r2 flags 40: + .cfi_remember_state movs r0,#0 @ not found - bx lr + epilogue 50: + .cfi_restore_state + .cfi_remember_state subs r0,r0,#1 @ found - bx lr + epilogue 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @ r5 has the 00/ff pattern for the first word, r6 has the chained value + .cfi_restore_state @ Standard post-prologue state + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 cmp r5, #0 itte eq moveq r5, r6 @ the end is in the 2nd word subeq r0,r0,#3 @ Points to 2nd byte of 2nd word subne r0,r0,#7 @ or 2nd byte of 1st word @ r0 currently points to the 3rd byte of the word containing the hit tst r5, # CHARTSTMASK(0) @ 1st character bne 61f adds r0,r0,#1 tst r5, # CHARTSTMASK(1) @ 2nd character ittt eq addeq r0,r0,#1 tsteq r5, # (3<<15) @ 2nd & 3rd character @ If not the 3rd must be the last one addeq r0,r0,#1 61: pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 subs r0,r0,#1 - bx lr + epilogue + .cfi_endproc + 
.cantunwind + .fnend .size __memchr_arm, . - __memchr_arm diff --git a/contrib/arm-optimized-routines/string/arm/memcpy.S b/contrib/arm-optimized-routines/string/arm/memcpy.S index 86e64938edb1..2423cfd69061 100644 --- a/contrib/arm-optimized-routines/string/arm/memcpy.S +++ b/contrib/arm-optimized-routines/string/arm/memcpy.S @@ -1,587 +1,587 @@ /* * memcpy - copy memory area * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* This memcpy routine is optimised for Cortex-A15 cores and takes advantage of VFP or NEON when built with the appropriate flags. Assumptions: ARMv6 (ARMv7-a if using Neon) ARM state Unaligned accesses */ -#include "../asmdefs.h" +#include "asmdefs.h" .syntax unified /* This implementation requires ARM state. */ .arm #ifdef __ARM_NEON__ .fpu neon .arch armv7-a # define FRAME_SIZE 4 # define USE_VFP # define USE_NEON #elif !defined (__SOFTFP__) .arch armv6 .fpu vfpv2 # define FRAME_SIZE 32 # define USE_VFP #else .arch armv6 # define FRAME_SIZE 32 #endif /* Old versions of GAS incorrectly implement the NEON align semantics. */ #ifdef BROKEN_ASM_NEON_ALIGN #define ALIGN(addr, align) addr,:align #else #define ALIGN(addr, align) addr:align #endif #define PC_OFFSET 8 /* PC pipeline compensation. */ #define INSN_SIZE 4 /* Call parameters. */ #define dstin r0 #define src r1 #define count r2 /* Locals. */ #define tmp1 r3 #define dst ip #define tmp2 r10 #ifndef USE_NEON /* For bulk copies using GP registers. */ #define A_l r2 /* Call-clobbered. */ #define A_h r3 /* Call-clobbered. */ #define B_l r4 #define B_h r5 #define C_l r6 #define C_h r7 #define D_l r8 #define D_h r9 #endif /* Number of lines ahead to pre-fetch data. If you change this the code below will need adjustment to compensate. 
*/ #define prefetch_lines 5 #ifdef USE_VFP .macro cpy_line_vfp vreg, base vstr \vreg, [dst, #\base] vldr \vreg, [src, #\base] vstr d0, [dst, #\base + 8] vldr d0, [src, #\base + 8] vstr d1, [dst, #\base + 16] vldr d1, [src, #\base + 16] vstr d2, [dst, #\base + 24] vldr d2, [src, #\base + 24] vstr \vreg, [dst, #\base + 32] vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] vstr d0, [dst, #\base + 40] vldr d0, [src, #\base + 40] vstr d1, [dst, #\base + 48] vldr d1, [src, #\base + 48] vstr d2, [dst, #\base + 56] vldr d2, [src, #\base + 56] .endm .macro cpy_tail_vfp vreg, base vstr \vreg, [dst, #\base] vldr \vreg, [src, #\base] vstr d0, [dst, #\base + 8] vldr d0, [src, #\base + 8] vstr d1, [dst, #\base + 16] vldr d1, [src, #\base + 16] vstr d2, [dst, #\base + 24] vldr d2, [src, #\base + 24] vstr \vreg, [dst, #\base + 32] vstr d0, [dst, #\base + 40] vldr d0, [src, #\base + 40] vstr d1, [dst, #\base + 48] vldr d1, [src, #\base + 48] vstr d2, [dst, #\base + 56] vldr d2, [src, #\base + 56] .endm #endif ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. */ cmp count, #64 bhs L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ L(tail63unaligned): #ifdef USE_NEON and tmp1, count, #0x38 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) add pc, pc, tmp1 vld1.8 {d0}, [src]! /* 14 words to go. */ vst1.8 {d0}, [dst]! vld1.8 {d0}, [src]! /* 12 words to go. */ vst1.8 {d0}, [dst]! vld1.8 {d0}, [src]! /* 10 words to go. */ vst1.8 {d0}, [dst]! vld1.8 {d0}, [src]! /* 8 words to go. */ vst1.8 {d0}, [dst]! vld1.8 {d0}, [src]! /* 6 words to go. */ vst1.8 {d0}, [dst]! vld1.8 {d0}, [src]! /* 4 words to go. */ vst1.8 {d0}, [dst]! vld1.8 {d0}, [src]! /* 2 words to go. */ vst1.8 {d0}, [dst]! tst count, #4 ldrne tmp1, [src], #4 strne tmp1, [dst], #4 #else /* Copy up to 15 full words of data. May not be aligned. */ /* Cannot use VFP for unaligned data. 
*/ and tmp1, count, #0x3c add dst, dst, tmp1 add src, src, tmp1 rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) /* Jump directly into the sequence below at the correct offset. */ add pc, pc, tmp1, lsl #1 ldr tmp1, [src, #-60] /* 15 words to go. */ str tmp1, [dst, #-60] ldr tmp1, [src, #-56] /* 14 words to go. */ str tmp1, [dst, #-56] ldr tmp1, [src, #-52] str tmp1, [dst, #-52] ldr tmp1, [src, #-48] /* 12 words to go. */ str tmp1, [dst, #-48] ldr tmp1, [src, #-44] str tmp1, [dst, #-44] ldr tmp1, [src, #-40] /* 10 words to go. */ str tmp1, [dst, #-40] ldr tmp1, [src, #-36] str tmp1, [dst, #-36] ldr tmp1, [src, #-32] /* 8 words to go. */ str tmp1, [dst, #-32] ldr tmp1, [src, #-28] str tmp1, [dst, #-28] ldr tmp1, [src, #-24] /* 6 words to go. */ str tmp1, [dst, #-24] ldr tmp1, [src, #-20] str tmp1, [dst, #-20] ldr tmp1, [src, #-16] /* 4 words to go. */ str tmp1, [dst, #-16] ldr tmp1, [src, #-12] str tmp1, [dst, #-12] ldr tmp1, [src, #-8] /* 2 words to go. */ str tmp1, [dst, #-8] ldr tmp1, [src, #-4] str tmp1, [dst, #-4] #endif lsls count, count, #31 ldrhcs tmp1, [src], #2 ldrbne src, [src] /* Src is dead, use as a scratch. */ strhcs tmp1, [dst], #2 strbne src, [dst] bx lr L(cpy_not_short): /* At least 64 bytes to copy, but don't know the alignment yet. */ str tmp2, [sp, #-FRAME_SIZE]! and tmp2, src, #7 and tmp1, dst, #7 cmp tmp1, tmp2 bne L(cpy_notaligned) #ifdef USE_VFP /* Magic dust alert! Force VFP on Cortex-A9. Experiments show that the FP pipeline is much better at streaming loads and stores. This is outside the critical loop. */ vmov.f32 s0, s0 #endif /* SRC and DST have the same mutual 64-bit alignment, but we may still need to pre-copy some bytes to get to natural alignment. We bring SRC and DST into full 64-bit alignment. 
*/ lsls tmp2, dst, #29 beq 1f rsbs tmp2, tmp2, #0 sub count, count, tmp2, lsr #29 ldrmi tmp1, [src], #4 strmi tmp1, [dst], #4 lsls tmp2, tmp2, #2 ldrhcs tmp1, [src], #2 ldrbne tmp2, [src], #1 strhcs tmp1, [dst], #2 strbne tmp2, [dst], #1 1: subs tmp2, count, #64 /* Use tmp2 for count. */ blo L(tail63aligned) cmp tmp2, #512 bhs L(cpy_body_long) L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP 1: vldr d0, [src, #0] subs tmp2, tmp2, #64 vldr d1, [src, #8] vstr d0, [dst, #0] vldr d0, [src, #16] vstr d1, [dst, #8] vldr d1, [src, #24] vstr d0, [dst, #16] vldr d0, [src, #32] vstr d1, [dst, #24] vldr d1, [src, #40] vstr d0, [dst, #32] vldr d0, [src, #48] vstr d1, [dst, #40] vldr d1, [src, #56] vstr d0, [dst, #48] add src, src, #64 vstr d1, [dst, #56] add dst, dst, #64 bhs 1b tst tmp2, #0x3f beq L(done) L(tail63aligned): /* Count in tmp2. */ and tmp1, tmp2, #0x38 add dst, dst, tmp1 add src, src, tmp1 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) add pc, pc, tmp1 vldr d0, [src, #-56] /* 14 words to go. */ vstr d0, [dst, #-56] vldr d0, [src, #-48] /* 12 words to go. */ vstr d0, [dst, #-48] vldr d0, [src, #-40] /* 10 words to go. */ vstr d0, [dst, #-40] vldr d0, [src, #-32] /* 8 words to go. */ vstr d0, [dst, #-32] vldr d0, [src, #-24] /* 6 words to go. */ vstr d0, [dst, #-24] vldr d0, [src, #-16] /* 4 words to go. */ vstr d0, [dst, #-16] vldr d0, [src, #-8] /* 2 words to go. */ vstr d0, [dst, #-8] #else sub src, src, #8 sub dst, dst, #8 1: ldrd A_l, A_h, [src, #8] strd A_l, A_h, [dst, #8] ldrd A_l, A_h, [src, #16] strd A_l, A_h, [dst, #16] ldrd A_l, A_h, [src, #24] strd A_l, A_h, [dst, #24] ldrd A_l, A_h, [src, #32] strd A_l, A_h, [dst, #32] ldrd A_l, A_h, [src, #40] strd A_l, A_h, [dst, #40] ldrd A_l, A_h, [src, #48] strd A_l, A_h, [dst, #48] ldrd A_l, A_h, [src, #56] strd A_l, A_h, [dst, #56] ldrd A_l, A_h, [src, #64]! strd A_l, A_h, [dst, #64]! 
subs tmp2, tmp2, #64 bhs 1b tst tmp2, #0x3f bne 1f ldr tmp2,[sp], #FRAME_SIZE bx lr 1: add src, src, #8 add dst, dst, #8 L(tail63aligned): /* Count in tmp2. */ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but we know that the src and dest are 64-bit aligned so we can use LDRD/STRD to improve efficiency. */ /* TMP2 is now negative, but we don't care about that. The bottom six bits still tell us how many bytes are left to copy. */ and tmp1, tmp2, #0x38 add dst, dst, tmp1 add src, src, tmp1 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) add pc, pc, tmp1 ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ strd A_l, A_h, [dst, #-56] ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ strd A_l, A_h, [dst, #-48] ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ strd A_l, A_h, [dst, #-40] ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ strd A_l, A_h, [dst, #-32] ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ strd A_l, A_h, [dst, #-24] ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ strd A_l, A_h, [dst, #-16] ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ strd A_l, A_h, [dst, #-8] #endif tst tmp2, #4 ldrne tmp1, [src], #4 strne tmp1, [dst], #4 lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ ldrhcs tmp1, [src], #2 ldrbne tmp2, [src] strhcs tmp1, [dst], #2 strbne tmp2, [dst] L(done): ldr tmp2, [sp], #FRAME_SIZE bx lr L(cpy_body_long): /* Count in tmp2. */ /* Long copy. We know that there's at least (prefetch_lines * 64) bytes to go. */ #ifdef USE_VFP /* Don't use PLD. Instead, read some data in advance of the current copy position into a register. This should act like a PLD operation but we won't have to repeat the transfer. 
*/ vldr d3, [src, #0] vldr d4, [src, #64] vldr d5, [src, #128] vldr d6, [src, #192] vldr d7, [src, #256] vldr d0, [src, #8] vldr d1, [src, #16] vldr d2, [src, #24] add src, src, #32 subs tmp2, tmp2, #prefetch_lines * 64 * 2 blo 2f 1: cpy_line_vfp d3, 0 cpy_line_vfp d4, 64 cpy_line_vfp d5, 128 add dst, dst, #3 * 64 add src, src, #3 * 64 cpy_line_vfp d6, 0 cpy_line_vfp d7, 64 add dst, dst, #2 * 64 add src, src, #2 * 64 subs tmp2, tmp2, #prefetch_lines * 64 bhs 1b 2: cpy_tail_vfp d3, 0 cpy_tail_vfp d4, 64 cpy_tail_vfp d5, 128 add src, src, #3 * 64 add dst, dst, #3 * 64 cpy_tail_vfp d6, 0 vstr d7, [dst, #64] vldr d7, [src, #64] vstr d0, [dst, #64 + 8] vldr d0, [src, #64 + 8] vstr d1, [dst, #64 + 16] vldr d1, [src, #64 + 16] vstr d2, [dst, #64 + 24] vldr d2, [src, #64 + 24] vstr d7, [dst, #64 + 32] add src, src, #96 vstr d0, [dst, #64 + 40] vstr d1, [dst, #64 + 48] vstr d2, [dst, #64 + 56] add dst, dst, #128 add tmp2, tmp2, #prefetch_lines * 64 b L(cpy_body_medium) #else /* Long copy. Use an SMS style loop to maximize the I/O bandwidth of the core. We don't have enough spare registers to synthesise prefetching, so use PLD operations. */ /* Pre-bias src and dst. */ sub src, src, #8 sub dst, dst, #8 pld [src, #8] pld [src, #72] subs tmp2, tmp2, #64 pld [src, #136] ldrd A_l, A_h, [src, #8] strd B_l, B_h, [sp, #8] ldrd B_l, B_h, [src, #16] strd C_l, C_h, [sp, #16] ldrd C_l, C_h, [src, #24] strd D_l, D_h, [sp, #24] pld [src, #200] ldrd D_l, D_h, [src, #32]! b 1f .p2align 6 2: pld [src, #232] strd A_l, A_h, [dst, #40] ldrd A_l, A_h, [src, #40] strd B_l, B_h, [dst, #48] ldrd B_l, B_h, [src, #48] strd C_l, C_h, [dst, #56] ldrd C_l, C_h, [src, #56] strd D_l, D_h, [dst, #64]! ldrd D_l, D_h, [src, #64]! 
subs tmp2, tmp2, #64 1: strd A_l, A_h, [dst, #8] ldrd A_l, A_h, [src, #8] strd B_l, B_h, [dst, #16] ldrd B_l, B_h, [src, #16] strd C_l, C_h, [dst, #24] ldrd C_l, C_h, [src, #24] strd D_l, D_h, [dst, #32] ldrd D_l, D_h, [src, #32] bcs 2b /* Save the remaining bytes and restore the callee-saved regs. */ strd A_l, A_h, [dst, #40] add src, src, #40 strd B_l, B_h, [dst, #48] ldrd B_l, B_h, [sp, #8] strd C_l, C_h, [dst, #56] ldrd C_l, C_h, [sp, #16] strd D_l, D_h, [dst, #64] ldrd D_l, D_h, [sp, #24] add dst, dst, #72 tst tmp2, #0x3f bne L(tail63aligned) ldr tmp2, [sp], #FRAME_SIZE bx lr #endif L(cpy_notaligned): pld [src] pld [src, #64] /* There's at least 64 bytes to copy, but there is no mutual alignment. */ /* Bring DST to 64-bit alignment. */ lsls tmp2, dst, #29 pld [src, #(2 * 64)] beq 1f rsbs tmp2, tmp2, #0 sub count, count, tmp2, lsr #29 ldrmi tmp1, [src], #4 strmi tmp1, [dst], #4 lsls tmp2, tmp2, #2 ldrbne tmp1, [src], #1 ldrhcs tmp2, [src], #2 strbne tmp1, [dst], #1 strhcs tmp2, [dst], #2 1: pld [src, #(3 * 64)] subs count, count, #64 ldrlo tmp2, [sp], #FRAME_SIZE blo L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON vld1.8 {d0-d3}, [src]! vld1.8 {d4-d7}, [src]! subs count, count, #64 blo 2f 1: pld [src, #(4 * 64)] vst1.8 {d0-d3}, [ALIGN (dst, 64)]! vld1.8 {d0-d3}, [src]! vst1.8 {d4-d7}, [ALIGN (dst, 64)]! vld1.8 {d4-d7}, [src]! subs count, count, #64 bhs 1b 2: vst1.8 {d0-d3}, [ALIGN (dst, 64)]! vst1.8 {d4-d7}, [ALIGN (dst, 64)]! ands count, count, #0x3f #else /* Use an SMS style loop to maximize the I/O bandwidth. */ sub src, src, #4 sub dst, dst, #8 subs tmp2, count, #64 /* Use tmp2 for count. */ ldr A_l, [src, #4] ldr A_h, [src, #8] strd B_l, B_h, [sp, #8] ldr B_l, [src, #12] ldr B_h, [src, #16] strd C_l, C_h, [sp, #16] ldr C_l, [src, #20] ldr C_h, [src, #24] strd D_l, D_h, [sp, #24] ldr D_l, [src, #28] ldr D_h, [src, #32]! 
b 1f .p2align 6 2: pld [src, #(5 * 64) - (32 - 4)] strd A_l, A_h, [dst, #40] ldr A_l, [src, #36] ldr A_h, [src, #40] strd B_l, B_h, [dst, #48] ldr B_l, [src, #44] ldr B_h, [src, #48] strd C_l, C_h, [dst, #56] ldr C_l, [src, #52] ldr C_h, [src, #56] strd D_l, D_h, [dst, #64]! ldr D_l, [src, #60] ldr D_h, [src, #64]! subs tmp2, tmp2, #64 1: strd A_l, A_h, [dst, #8] ldr A_l, [src, #4] ldr A_h, [src, #8] strd B_l, B_h, [dst, #16] ldr B_l, [src, #12] ldr B_h, [src, #16] strd C_l, C_h, [dst, #24] ldr C_l, [src, #20] ldr C_h, [src, #24] strd D_l, D_h, [dst, #32] ldr D_l, [src, #28] ldr D_h, [src, #32] bcs 2b /* Save the remaining bytes and restore the callee-saved regs. */ strd A_l, A_h, [dst, #40] add src, src, #36 strd B_l, B_h, [dst, #48] ldrd B_l, B_h, [sp, #8] strd C_l, C_h, [dst, #56] ldrd C_l, C_h, [sp, #16] strd D_l, D_h, [dst, #64] ldrd D_l, D_h, [sp, #24] add dst, dst, #72 ands count, tmp2, #0x3f #endif ldr tmp2, [sp], #FRAME_SIZE bne L(tail63unaligned) bx lr END (__memcpy_arm) diff --git a/contrib/arm-optimized-routines/string/arm/memset.S b/contrib/arm-optimized-routines/string/arm/memset.S index 11e927368fd1..487b9d6a8f6c 100644 --- a/contrib/arm-optimized-routines/string/arm/memset.S +++ b/contrib/arm-optimized-routines/string/arm/memset.S @@ -1,98 +1,98 @@ /* * memset - fill memory with a constant * * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Written by Dave Gilbert This memset routine is optimised on a Cortex-A9 and should work on all ARMv7 processors. 
*/ .syntax unified .arch armv7-a @ 2011-08-30 david.gilbert@linaro.org @ Extracted from local git 2f11b436 @ this lets us check a flag in a 00/ff byte easily in either endianness #ifdef __ARMEB__ #define CHARTSTMASK(c) 1<<(31-(c*8)) #else #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb @ --------------------------------------------------------------------------- .thumb_func .align 2 .p2align 4,,15 .global __memset_arm .type __memset_arm,%function __memset_arm: @ r0 = address @ r1 = character @ r2 = count @ returns original address in r0 mov r3, r0 @ Leave r0 alone cbz r2, 10f @ Exit if 0 length tst r0, #7 beq 2f @ Already aligned @ Ok, so we're misaligned here 1: strb r1, [r3], #1 subs r2,r2,#1 tst r3, #7 cbz r2, 10f @ Exit if we hit the end bne 1b @ go round again if still misaligned 2: @ OK, so we're aligned push {r4,r5,r6,r7} bics r4, r2, #15 @ if less than 16 bytes then need to finish it off beq 5f 3: @ POSIX says that ch is cast to an unsigned char. A uxtb is one @ byte and takes two cycles, where an AND is four bytes but one @ cycle. and r1, #0xFF orr r1, r1, r1, lsl#8 @ Same character into all bytes orr r1, r1, r1, lsl#16 mov r5,r1 mov r6,r1 mov r7,r1 4: subs r4,r4,#16 stmia r3!,{r1,r5,r6,r7} bne 4b and r2,r2,#15 @ At this point we're still aligned and we have upto align-1 bytes left to right @ we can avoid some of the byte-at-a time now by testing for some big chunks tst r2,#8 itt ne subne r2,r2,#8 stmiane r3!,{r1,r5} 5: pop {r4,r5,r6,r7} cbz r2, 10f @ Got to do any last < alignment bytes 6: subs r2,r2,#1 strb r1,[r3],#1 bne 6b 10: bx lr @ goodbye .size __memset_arm, . 
- __memset_arm diff --git a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S index b75d4143db57..4d55306810ad 100644 --- a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S +++ b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S @@ -1,117 +1,119 @@ /* * strcmp for ARMv6-M (optimized for performance, not size) * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "asmdefs.h" + #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 .thumb_func .syntax unified .arch armv6-m .macro DoSub n, label subs r0, r0, r1 #ifdef __ARM_BIG_ENDIAN lsrs r1, r4, \n #else lsls r1, r4, \n #endif orrs r1, r0 bne \label .endm .macro Byte_Test n, label lsrs r0, r2, \n lsrs r1, r3, \n DoSub \n, \label .endm ENTRY_ALIGN (__strcmp_armv6m, 4) mov r2, r0 push {r4, r5, r6, lr} orrs r2, r1 lsls r2, r2, #30 bne 6f ldr r5, =0x01010101 lsls r6, r5, #7 1: ldmia r0!, {r2} ldmia r1!, {r3} subs r4, r2, r5 bics r4, r2 ands r4, r6 beq 3f #ifdef __ARM_BIG_ENDIAN Byte_Test #24, 4f Byte_Test #16, 4f Byte_Test #8, 4f b 7f 3: cmp r2, r3 beq 1b cmp r2, r3 #else uxtb r0, r2 uxtb r1, r3 DoSub #24, 2f uxth r0, r2 uxth r1, r3 DoSub #16, 2f lsls r0, r2, #8 lsls r1, r3, #8 lsrs r0, r0, #8 lsrs r1, r1, #8 DoSub #8, 2f lsrs r0, r2, #24 lsrs r1, r3, #24 subs r0, r0, r1 2: pop {r4, r5, r6, pc} 3: cmp r2, r3 beq 1b rev r0, r2 rev r1, r3 cmp r0, r1 #endif bls 5f movs r0, #1 4: pop {r4, r5, r6, pc} 5: movs r0, #0 mvns r0, r0 pop {r4, r5, r6, pc} 6: ldrb r2, [r0, #0] ldrb r3, [r1, #0] adds r0, #1 adds r1, #1 cmp r2, #0 beq 7f cmp r2, r3 bne 7f ldrb r2, [r0, #0] ldrb r3, [r1, #0] adds r0, #1 adds r1, #1 cmp r2, #0 beq 7f cmp r2, r3 beq 6b 7: subs r0, r2, r3 pop {r4, r5, r6, pc} END (__strcmp_armv6m) #endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */ diff --git 
a/contrib/arm-optimized-routines/string/arm/strcmp.S b/contrib/arm-optimized-routines/string/arm/strcmp.S index 51443e343058..74b3d235fb18 100644 --- a/contrib/arm-optimized-routines/string/arm/strcmp.S +++ b/contrib/arm-optimized-routines/string/arm/strcmp.S @@ -1,475 +1,486 @@ /* * strcmp for ARMv7 * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 /* Implementation of strcmp for ARMv7 when DSP instructions are available. Use ldrd to support wider loads, provided the data is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first byte in the string. If comparing completely random strings the pre-check will save time, since there is a very high probability of a mismatch in the first character: we save significant overhead if this is the common case. However, if strings are likely to be identical (eg because we're verifying a hit in a hash table), then this check is largely redundant. */ #define STRCMP_NO_PRECHECK 0 +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This version uses Thumb-2 code. 
*/ .thumb .syntax unified #ifdef __ARM_BIG_ENDIAN #define S2LO lsl #define S2LOEQ lsleq #define S2HI lsr #define MSB 0x000000ff #define LSB 0xff000000 #define BYTE0_OFFSET 24 #define BYTE1_OFFSET 16 #define BYTE2_OFFSET 8 #define BYTE3_OFFSET 0 #else /* not __ARM_BIG_ENDIAN */ #define S2LO lsr #define S2LOEQ lsreq #define S2HI lsl #define BYTE0_OFFSET 0 #define BYTE1_OFFSET 8 #define BYTE2_OFFSET 16 #define BYTE3_OFFSET 24 #define MSB 0xff000000 #define LSB 0x000000ff #endif /* not __ARM_BIG_ENDIAN */ /* Parameters and result. */ #define src1 r0 #define src2 r1 #define result r0 /* Overlaps src1. */ /* Internal variables. */ #define tmp1 r4 #define tmp2 r5 #define const_m1 r12 /* Additional internal variables for 64-bit aligned data. */ #define data1a r2 #define data1b r3 #define data2a r6 #define data2b r7 #define syndrome_a tmp1 #define syndrome_b tmp2 /* Additional internal variables for 32-bit aligned data. */ #define data1 r2 #define data2 r3 #define syndrome tmp2 /* Macro to compute and return the result value for word-aligned cases. */ .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 #ifdef __ARM_BIG_ENDIAN /* If data1 contains a zero byte, then syndrome will contain a 1 in bit 7 of that byte. Otherwise, the highest set bit in the syndrome will highlight the first different bit. It is therefore sufficient to extract the eight bits starting with the syndrome bit. */ clz tmp1, \synd lsl r1, \d2, tmp1 .if \restore_r6 ldrd r6, r7, [sp, #8] .endif .cfi_restore 6 .cfi_restore 7 lsl \d1, \d1, tmp1 .cfi_remember_state lsr result, \d1, #24 ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1, lsr #24 - bx lr + epilogue push_ip=HAVE_PAC_LEAF #else /* To use the big-endian trick we'd have to reverse all three words. that's slower than this approach. 
*/ rev \synd, \synd clz tmp1, \synd bic tmp1, tmp1, #7 lsr r1, \d2, tmp1 .cfi_remember_state .if \restore_r6 ldrd r6, r7, [sp, #8] .endif .cfi_restore 6 .cfi_restore 7 lsr \d1, \d1, tmp1 and result, \d1, #255 and r1, r1, #255 ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1 - bx lr + epilogue push_ip=HAVE_PAC_LEAF #endif .endm - .p2align 5 -L(strcmp_start_addr): -#if STRCMP_NO_PRECHECK == 0 -L(fastpath_exit): - sub r0, r2, r3 - bx lr - nop -#endif -ENTRY_ALIGN (__strcmp_arm, 0) +ENTRY(__strcmp_arm) + prologue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] cmp r2, #1 it cs cmpcs r2, r3 bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! - .cfi_def_cfa_offset 16 - .cfi_offset 4, -16 - .cfi_offset 5, -12 + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 orr tmp1, src1, src2 strd r6, r7, [sp, #8] - .cfi_offset 6, -8 - .cfi_offset 7, -4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 mvn const_m1, #0 lsl r2, tmp1, #29 cbz r2, L(loop_aligned8) L(not_aligned): eor tmp1, src1, src2 tst tmp1, #7 bne L(misaligned8) /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ and tmp1, src1, #7 bic src1, src1, #7 and tmp2, tmp1, #3 bic src2, src2, #7 lsl tmp2, tmp2, #3 /* Bytes -> bits. */ ldrd data1a, data1b, [src1], #16 tst tmp1, #4 ldrd data2a, data2b, [src2], #16 /* In thumb code we can't use MVN with a register shift, but we do have ORN. */ S2HI tmp1, const_m1, tmp2 orn data1a, data1a, tmp1 orn data2a, data2a, tmp1 beq L(start_realigned8) orn data1b, data1b, tmp1 mov data1a, const_m1 orn data2b, data2b, tmp1 mov data2a, const_m1 b L(start_realigned8) /* Unwind the inner loop by a factor of 2, giving 16 bytes per pass. */ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ .p2align 2 /* Always word aligned. 
*/ L(loop_aligned8): ldrd data1a, data1b, [src1], #16 ldrd data2a, data2b, [src2], #16 L(start_realigned8): uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 cbnz syndrome_a, L(diff_in_a) uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ eor syndrome_b, data1b, data2b sel syndrome_b, syndrome_b, const_m1 cbnz syndrome_b, L(diff_in_b) ldrd data1a, data1b, [src1, #-8] ldrd data2a, data2b, [src2, #-8] uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ eor syndrome_b, data1b, data2b sel syndrome_b, syndrome_b, const_m1 /* Can't use CBZ for backwards branch. */ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ beq L(loop_aligned8) L(diff_found): cbnz syndrome_a, L(diff_in_a) L(diff_in_b): strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 L(diff_in_a): .cfi_restore_state strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 .cfi_restore_state L(misaligned8): tst tmp1, #3 bne L(misaligned4) ands tmp1, src1, #3 bne L(mutual_align4) /* Unrolled by a factor of 2, to reduce the number of post-increment operations. */ L(loop_aligned4): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned4): uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cbnz syndrome, L(aligned4_done) ldr data1, [src1, #-4] ldr data2, [src2, #-4] uadd8 syndrome, data1, const_m1 eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cmp syndrome, #0 beq L(loop_aligned4) L(aligned4_done): strcmp_epilogue_aligned syndrome, data1, data2, 0 L(mutual_align4): .cfi_restore_state /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ lsl tmp1, tmp1, #3 /* Bytes -> bits. 
*/ bic src1, src1, #3 ldr data1, [src1], #8 bic src2, src2, #3 ldr data2, [src2], #8 /* In thumb code we can't use MVN with a register shift, but we do have ORN. */ S2HI tmp1, const_m1, tmp1 orn data1, data1, tmp1 orn data2, data2, tmp1 b L(start_realigned4) L(misaligned4): ands tmp1, src1, #3 beq L(src1_aligned) sub src2, src2, tmp1 bic src1, src1, #3 lsls tmp1, tmp1, #31 ldr data1, [src1], #4 beq L(aligned_m2) bcs L(aligned_m1) #if STRCMP_NO_PRECHECK == 1 ldrb data2, [src2, #1] uxtb tmp1, data1, ror #BYTE1_OFFSET subs tmp1, tmp1, data2 bne L(misaligned_exit) cbz data2, L(misaligned_exit) L(aligned_m2): ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 bne L(misaligned_exit) cbz data2, L(misaligned_exit) L(aligned_m1): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 bne L(misaligned_exit) add src2, src2, #4 cbnz data2, L(src1_aligned) #else /* STRCMP_NO_PRECHECK */ /* If we've done the pre-check, then we don't need to check the first byte again here. */ ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 bne L(misaligned_exit) cbz data2, L(misaligned_exit) L(aligned_m2): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 bne L(misaligned_exit) cbnz data2, L(aligned_m1) #endif L(misaligned_exit): .cfi_remember_state mov result, tmp1 ldr r4, [sp], #16 .cfi_restore 4 - bx lr + .cfi_adjust_cfa_offset -16 + epilogue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 +L(fastpath_exit): + .cfi_restore_state + .cfi_remember_state + sub r0, r2, r3 + epilogue push_ip=HAVE_PAC_LEAF + L(aligned_m1): + .cfi_restore_state + .cfi_remember_state add src2, src2, #4 #endif L(src1_aligned): .cfi_restore_state /* src1 is word aligned, but src2 has no common alignment with it. */ ldr data1, [src1], #4 lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ bic src2, src2, #3 ldr data2, [src2], #4 bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. 
*/ bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ L(overlap3): bic tmp1, data1, #MSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #8 sel syndrome, syndrome, const_m1 bne 4f cbnz syndrome, 5f ldr data2, [src2], #4 eor tmp1, tmp1, data1 cmp tmp1, data2, S2HI #24 bne 6f ldr data1, [src1], #4 b L(overlap3) 4: S2LO data2, data2, #8 b L(strcmp_tail) 5: bics syndrome, syndrome, #MSB bne L(strcmp_done_equal) /* We can only get here if the MSB of data1 contains 0, so fast-path the exit. */ ldrb result, [src2] .cfi_remember_state ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 /* R6/7 Not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 neg result, result - bx lr - + epilogue push_ip=HAVE_PAC_LEAF 6: .cfi_restore_state S2LO data1, data1, #24 and data2, data2, #LSB b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ L(overlap2): and tmp1, data1, const_m1, S2LO #16 uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #16 sel syndrome, syndrome, const_m1 bne 4f cbnz syndrome, 5f ldr data2, [src2], #4 eor tmp1, tmp1, data1 cmp tmp1, data2, S2HI #16 bne 6f ldr data1, [src1], #4 b L(overlap2) 4: S2LO data2, data2, #16 b L(strcmp_tail) 5: ands syndrome, syndrome, const_m1, S2LO #16 bne L(strcmp_done_equal) ldrh data2, [src2] S2LO data1, data1, #16 #ifdef __ARM_BIG_ENDIAN lsl data2, data2, #16 #endif b L(strcmp_tail) 6: S2LO data1, data1, #16 and data2, data2, const_m1, S2LO #16 b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ L(overlap1): and tmp1, data1, #LSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #24 sel syndrome, syndrome, const_m1 bne 4f cbnz syndrome, 5f ldr data2, [src2], #4 eor tmp1, tmp1, data1 cmp tmp1, data2, S2HI #8 bne 6f ldr data1, [src1], #4 b L(overlap1) 4: S2LO data2, data2, #24 b L(strcmp_tail) 5: tst syndrome, #LSB bne L(strcmp_done_equal) ldr data2, [src2] 6: S2LO data1, data1, #8 bic data2, data2, #MSB b L(strcmp_tail) L(strcmp_done_equal): mov result, #0 .cfi_remember_state ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 - bx lr + .cfi_adjust_cfa_offset -16 + epilogue push_ip=HAVE_PAC_LEAF L(strcmp_tail): .cfi_restore_state #ifndef __ARM_BIG_ENDIAN rev data1, data1 rev data2, data2 /* Now everything looks big-endian... */ #endif uadd8 tmp1, data1, const_m1 eor tmp1, data1, data2 sel syndrome, tmp1, const_m1 clz tmp1, syndrome lsl data1, data1, tmp1 lsl data2, data2, tmp1 lsr result, data1, #24 ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 sub result, result, data2, lsr #24 - bx lr + epilogue push_ip=HAVE_PAC_LEAF END (__strcmp_arm) #endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */ diff --git a/contrib/arm-optimized-routines/string/arm/strcpy.c b/contrib/arm-optimized-routines/string/arm/strcpy.c index 02cf94ff4be0..b5728a2534f0 100644 --- a/contrib/arm-optimized-routines/string/arm/strcpy.c +++ b/contrib/arm-optimized-routines/string/arm/strcpy.c @@ -1,133 +1,133 @@ /* * strcpy * * Copyright (c) 2008-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if defined (__thumb2__) && !defined (__thumb__) /* For GLIBC: #include #include #undef strcmp */ #ifdef __thumb2__ #define magic1(REG) "#0x01010101" #define magic2(REG) "#0x80808080" #else #define magic1(REG) #REG #define magic2(REG) #REG ", lsl #7" #endif char* __attribute__((naked)) __strcpy_arm (char* dst, const char* src) { __asm__ ( "pld [r1, #0]\n\t" "eor r2, r0, r1\n\t" "mov ip, r0\n\t" "tst r2, #3\n\t" "bne 4f\n\t" "tst r1, #3\n\t" "bne 3f\n" "5:\n\t" # ifndef __thumb2__ "str r5, [sp, #-4]!\n\t" "mov r5, #0x01\n\t" "orr r5, r5, r5, lsl #8\n\t" "orr r5, r5, r5, lsl #16\n\t" # endif "str r4, [sp, #-4]!\n\t" "tst r1, #4\n\t" "ldr r3, [r1], #4\n\t" "beq 2f\n\t" "sub r2, r3, "magic1(r5)"\n\t" "bics r2, r2, r3\n\t" "tst r2, "magic2(r5)"\n\t" "itt eq\n\t" "streq r3, [ip], #4\n\t" "ldreq r3, [r1], #4\n" "bne 1f\n\t" /* Inner loop. We now know that r1 is 64-bit aligned, so we can safely fetch up to two words. This allows us to avoid load stalls. */ ".p2align 2\n" "2:\n\t" "pld [r1, #8]\n\t" "ldr r4, [r1], #4\n\t" "sub r2, r3, "magic1(r5)"\n\t" "bics r2, r2, r3\n\t" "tst r2, "magic2(r5)"\n\t" "sub r2, r4, "magic1(r5)"\n\t" "bne 1f\n\t" "str r3, [ip], #4\n\t" "bics r2, r2, r4\n\t" "tst r2, "magic2(r5)"\n\t" "itt eq\n\t" "ldreq r3, [r1], #4\n\t" "streq r4, [ip], #4\n\t" "beq 2b\n\t" "mov r3, r4\n" "1:\n\t" # ifdef __ARMEB__ "rors r3, r3, #24\n\t" # endif "strb r3, [ip], #1\n\t" "tst r3, #0xff\n\t" # ifdef __ARMEL__ "ror r3, r3, #8\n\t" # endif "bne 1b\n\t" "ldr r4, [sp], #4\n\t" # ifndef __thumb2__ "ldr r5, [sp], #4\n\t" # endif "BX LR\n" /* Strings have the same offset from word alignment, but it's not zero. 
*/ "3:\n\t" "tst r1, #1\n\t" "beq 1f\n\t" "ldrb r2, [r1], #1\n\t" "strb r2, [ip], #1\n\t" "cmp r2, #0\n\t" "it eq\n" "BXEQ LR\n" "1:\n\t" "tst r1, #2\n\t" "beq 5b\n\t" "ldrh r2, [r1], #2\n\t" # ifdef __ARMEB__ "tst r2, #0xff00\n\t" "iteet ne\n\t" "strneh r2, [ip], #2\n\t" "lsreq r2, r2, #8\n\t" "streqb r2, [ip]\n\t" "tstne r2, #0xff\n\t" # else "tst r2, #0xff\n\t" "itet ne\n\t" "strneh r2, [ip], #2\n\t" "streqb r2, [ip]\n\t" "tstne r2, #0xff00\n\t" # endif "bne 5b\n\t" "BX LR\n" /* src and dst do not have a common word-alignement. Fall back to byte copying. */ "4:\n\t" "ldrb r2, [r1], #1\n\t" "strb r2, [ip], #1\n\t" "cmp r2, #0\n\t" "bne 4b\n\t" "BX LR"); } /* For GLIBC: libc_hidden_builtin_def (strcpy) */ #endif /* defined (__thumb2__) && !defined (__thumb__) */ diff --git a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S index 5ad30c941586..5eb8671bdc8b 100644 --- a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S +++ b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S @@ -1,124 +1,130 @@ /* * strlen - calculate the length of a string * - * Copyright (c) 2010-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2010-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 /* Assumes: ARMv6T2, AArch32 */ -#include "../asmdefs.h" +#include "asmdefs.h" #ifdef __ARMEB__ #define S2LO lsl #define S2HI lsr #else #define S2LO lsr #define S2HI lsl #endif +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This code requires Thumb. */ .thumb .syntax unified /* Parameters and result. */ #define srcin r0 #define result r0 /* Internal variables. 
*/ #define src r1 #define data1a r2 #define data1b r3 #define const_m1 r12 #define const_0 r4 #define tmp1 r4 /* Overlaps const_0 */ #define tmp2 r5 ENTRY (__strlen_armv6t2) + prologue 4 5 push_ip=HAVE_PAC_LEAF pld [srcin, #0] - strd r4, r5, [sp, #-8]! bic src, srcin, #7 mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ pld [src, #32] bne.w L(misaligned8) mov const_0, #0 mov result, #-8 L(loop_aligned): /* Bytes 0-7. */ ldrd data1a, data1b, [src] pld [src, #64] add result, result, #8 L(start_realigned): uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cbnz data1b, L(null_found) /* Bytes 8-15. */ ldrd data1a, data1b, [src, #8] uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ add result, result, #8 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cbnz data1b, L(null_found) /* Bytes 16-23. */ ldrd data1a, data1b, [src, #16] uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ add result, result, #8 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cbnz data1b, L(null_found) /* Bytes 24-31. */ ldrd data1a, data1b, [src, #24] add src, src, #32 uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ add result, result, #8 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cmp data1b, #0 beq L(loop_aligned) L(null_found): + .cfi_remember_state cmp data1a, #0 itt eq addeq result, result, #4 moveq data1a, data1b #ifndef __ARMEB__ rev data1a, data1a #endif clz data1a, data1a - ldrd r4, r5, [sp], #8 add result, result, data1a, lsr #3 /* Bits -> Bytes. 
*/ - bx lr + epilogue 4 5 push_ip=HAVE_PAC_LEAF L(misaligned8): + .cfi_restore_state ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 lsl tmp2, tmp2, #3 /* Bytes -> bits. */ tst tmp1, #4 pld [src, #64] S2HI tmp2, const_m1, tmp2 orn data1a, data1a, tmp2 itt ne ornne data1b, data1b, tmp2 movne data1a, const_m1 mov const_0, #0 b L(start_realigned) END (__strlen_armv6t2) #endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */ diff --git a/contrib/arm-optimized-routines/string/bench/memcpy.c b/contrib/arm-optimized-routines/string/bench/memcpy.c index 6bd27633e224..1468663e51cd 100644 --- a/contrib/arm-optimized-routines/string/bench/memcpy.c +++ b/contrib/arm-optimized-routines/string/bench/memcpy.c @@ -1,339 +1,339 @@ /* * memcpy benchmark. * - * Copyright (c) 2020-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE #include #include #include #include #include "stringlib.h" #include "benchlib.h" #define ITERS 5000 #define ITERS2 20000000 #define ITERS3 200000 #define NUM_TESTS 16384 #define MIN_SIZE 32768 #define MAX_SIZE (1024 * 1024) static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); #define F(x) {#x, x}, static const struct fun { const char *name; void *(*fun)(void *, const void *, size_t); } funtab[] = { #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve) # endif #elif __arm__ F(__memcpy_arm) #endif F(memcpy) #undef F {0, 0} }; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; #define SIZE_NUM 65536 #define SIZE_MASK (SIZE_NUM-1) static uint8_t size_arr[SIZE_NUM]; /* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. 
*/ static freq_data_t size_freq[] = { {32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035}, { 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721}, {120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460}, { 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303}, { 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185}, {192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96}, {104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68}, { 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47}, { 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35}, { 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22}, { 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15}, { 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11}, { 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9}, {136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8}, {273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6}, {504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3}, {512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2}, { 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2}, { 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1}, {248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1}, { 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1}, { 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1}, { 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1}, {2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1}, { 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1}, {122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1}, {984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1}, {116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1}, {100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1}, {488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1}, { 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0} }; 
#define ALIGN_NUM 1024 #define ALIGN_MASK (ALIGN_NUM-1) static uint8_t src_align_arr[ALIGN_NUM]; static uint8_t dst_align_arr[ALIGN_NUM]; /* Source alignment frequency for memcpy based on SPEC2017. */ static align_data_t src_align_freq[] = { {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0} }; static align_data_t dst_align_freq[] = { {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0} }; typedef struct { uint64_t src : 24; uint64_t dst : 24; uint64_t len : 16; } copy_t; static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); static void init_copy_distribution (void) { int i, j, freq, size, n; for (n = i = 0; (freq = size_freq[i].freq) != 0; i++) for (j = 0, size = size_freq[i].size; j < freq; j++) size_arr[n++] = size; assert (n == SIZE_NUM); for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++) for (j = 0, size = src_align_freq[i].align; j < freq; j++) src_align_arr[n++] = size - 1; assert (n == ALIGN_NUM); for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++) for (j = 0, size = dst_align_freq[i].align; j < freq; j++) dst_align_arr[n++] = size - 1; assert (n == ALIGN_NUM); } static size_t init_copies (size_t max_size) { size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. 
*/ for (int i = 0; i < NUM_TESTS; i++) { test_arr[i].dst = (rand32 (0) & (max_size - 1)); test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; test_arr[i].src = (rand32 (0) & (max_size - 1)); test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; total += test_arr[i].len; } return total; } int main (void) { init_copy_distribution (); memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); printf("Random memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; printf ("%22s ", funtab[f].name); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); } printf( "avg %.2f\n", (double)total / tsum); } size_t total = 0; uint64_t tsum = 0; printf ("%22s ", "memcpy_call"); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); } printf( "avg %.2f\n", (double)total / tsum); printf ("\nAligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 8; size <= 512; size *= 2) { uint64_t t = 
clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } printf ("%22s ", "memcpy_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) memcpy (b, a, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); printf ("\nUnaligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b + 3, a + 1, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } printf ("%22s ", "memcpy_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) memcpy (b + 3, a + 1, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); printf ("\nLarge memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("%22s ", "memcpy_call"); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) memcpy (b, a, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); printf ("\nUnaligned forwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 
31), size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("\nUnaligned backwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("\n"); return 0; } diff --git a/contrib/arm-optimized-routines/string/bench/memset.c b/contrib/arm-optimized-routines/string/bench/memset.c index 2d6196931307..990e23ba9a36 100644 --- a/contrib/arm-optimized-routines/string/bench/memset.c +++ b/contrib/arm-optimized-routines/string/bench/memset.c @@ -1,243 +1,243 @@ /* * memset benchmark. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE #include #include #include #include #include "stringlib.h" #include "benchlib.h" #define ITERS 5000 #define ITERS2 20000000 #define ITERS3 1000000 #define NUM_TESTS 16384 #define MIN_SIZE 32768 #define MAX_SIZE (1024 * 1024) static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); #define F(x) {#x, x}, static const struct fun { const char *name; void *(*fun)(void *, int, size_t); } funtab[] = { #if __aarch64__ F(__memset_aarch64) #elif __arm__ F(__memset_arm) #endif F(memset) #undef F {0, 0} }; typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; static memset_test_t test_arr[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; #define SIZE_NUM 65536 #define SIZE_MASK (SIZE_NUM-1) static uint8_t len_arr[SIZE_NUM]; /* Frequency data for memset sizes up to 4096 based on SPEC2017. 
*/ static freq_data_t memset_len_freq[] = { {40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, {292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, { 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, { 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, {4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, {288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, {1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, { 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, {1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, { 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, { 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, {164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, {1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, {144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, {280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, {304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, {1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, {808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, {640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, { 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, {504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, {352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, {568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, {2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, {472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, {648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, {4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, {736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, {728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} }; #define ALIGN_NUM 1024 #define ALIGN_MASK (ALIGN_NUM-1) static uint8_t align_arr[ALIGN_NUM]; /* 
Alignment data for memset based on SPEC2017. */ static align_data_t memset_align_freq[] = { {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} }; static void init_memset_distribution (void) { int i, j, freq, size, n; for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) for (j = 0, size = memset_len_freq[i].size; j < freq; j++) len_arr[n++] = size; assert (n == SIZE_NUM); for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) for (j = 0, size = memset_align_freq[i].align; j < freq; j++) align_arr[n++] = size - 1; assert (n == ALIGN_NUM); } static size_t init_memset (size_t max_size) { size_t total = 0; /* Create a random set of memsets with the given size and alignment distributions. */ for (int i = 0; i < NUM_TESTS; i++) { test_arr[i].offset = (rand32 (0) & (max_size - 1)); test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; total += test_arr[i].len; } return total; } int main (void) { init_memset_distribution (); memset (a, 1, sizeof (a)); printf("Random memset (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total_size = 0; uint64_t tsum = 0; printf ("%22s ", funtab[f].name); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t memset_size = init_memset (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); t = clock_get_ns () - t; total_size += memset_size; tsum += t; printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); } printf( "avg %.2f\n", (double)total_size / tsum); } size_t total_size = 0; uint64_t tsum = 0; printf ("%22s ", "memset_call"); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t memset_size = init_memset (size) * ITERS; for (int c = 0; c < 
NUM_TESTS; c++) memset (a + test_arr[c].offset, 0, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) memset (a + test_arr[c].offset, 0, test_arr[c].len); t = clock_get_ns () - t; total_size += memset_size; tsum += t; printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); } printf( "avg %.2f\n", (double)total_size / tsum); printf ("\nMedium memset (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (a, 0, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } printf ("%22s ", "memset_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) memset (a, 0, size); t = clock_get_ns () - t; printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\nLarge memset (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, 0, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } printf ("%22s ", "memset_call"); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) memset (a, 0, size); t = clock_get_ns () - t; printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n\n"); return 0; } diff --git a/contrib/arm-optimized-routines/string/bench/strlen.c b/contrib/arm-optimized-routines/string/bench/strlen.c index b7eee6e905ab..f05d0d5b89e6 100644 --- a/contrib/arm-optimized-routines/string/bench/strlen.c +++ b/contrib/arm-optimized-routines/string/bench/strlen.c @@ -1,221 +1,221 @@ /* * strlen benchmark. 
* * Copyright (c) 2020-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE #include #include #include #include #include "stringlib.h" #include "benchlib.h" #define ITERS 5000 #define ITERS2 20000000 #define ITERS3 2000000 #define NUM_TESTS 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; size_t (*fun) (const char *s); int test_mte; } funtab[] = { // clang-format off F(strlen, 0) #if __aarch64__ F(__strlen_aarch64, 0) F(__strlen_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strlen_aarch64_sve, 1) # endif #elif __arm__ # if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 F(__strlen_armv6t2, 0) # endif #endif {0, 0, 0} // clang-format on }; #undef F static uint16_t strlen_tests[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; #define SIZE_NUM 65536 #define SIZE_MASK (SIZE_NUM - 1) static uint8_t strlen_len_arr[SIZE_NUM]; /* Frequency data for strlen sizes up to 128 based on SPEC2017. 
*/ static freq_data_t strlen_len_freq[] = { { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115}, { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418}, { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79}, { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21}, { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9}, { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5}, { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2}, { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1}, { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1}, { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1}, { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1}, {107, 1}, { 0, 0} }; #define ALIGN_NUM 1024 #define ALIGN_MASK (ALIGN_NUM - 1) static uint8_t strlen_align_arr[ALIGN_NUM]; /* Alignment data for strlen based on SPEC2017. */ static align_data_t string_align_freq[] = { {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0} }; static void init_strlen_distribution (void) { int i, j, freq, size, n; for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++) for (j = 0, size = strlen_len_freq[i].size; j < freq; j++) strlen_len_arr[n++] = size; assert (n == SIZE_NUM); for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++) for (j = 0, size = string_align_freq[i].align; j < freq; j++) strlen_align_arr[n++] = size; assert (n == ALIGN_NUM); } static void init_strlen_tests (void) { uint16_t index[MAX_ALIGN]; memset (a, 'x', sizeof (a)); /* Create indices for strings at all alignments. */ for (int i = 0; i < MAX_ALIGN; i++) { index[i] = i * (MAX_STRLEN + 1); a[index[i] + MAX_STRLEN] = 0; } /* Create a random set of strlen input strings using the string length and alignment distributions. 
*/ for (int n = 0; n < NUM_TESTS; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; strlen_tests[n] = index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len; } } static volatile size_t maskv = 0; int main (void) { rand32 (0x12345678); init_strlen_distribution (); init_strlen_tests (); printf ("\nRandom strlen (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); for (int c = 0; c < NUM_TESTS; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); } printf ("\nSmall aligned strlen (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 1; size <= 64; size *= 2) { memset (a, 'x', size); a[size - 1] = 0; uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (a); t = clock_get_ns () - t; printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } printf ("\n"); } printf ("\nSmall unaligned strlen (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); int align = 9; for (int size = 1; size <= 64; size *= 2) { memset (a + align, 'x', size); a[align + size - 1] = 0; uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (a + align); t = clock_get_ns () - t; printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, size < 1024 ? 
'B' : 'K', (double)size * ITERS2 / t); } printf ("\n"); } printf ("\nMedium strlen (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { printf ("%22s ", funtab[f].name); for (int size = 128; size <= 4096; size *= 2) { memset (a, 'x', size); a[size - 1] = 0; uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a); t = clock_get_ns () - t; printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } printf ("\n"); return 0; } diff --git a/contrib/arm-optimized-routines/string/include/benchlib.h b/contrib/arm-optimized-routines/string/include/benchlib.h index 0f2ce2eb6bce..f1bbea388cd2 100644 --- a/contrib/arm-optimized-routines/string/include/benchlib.h +++ b/contrib/arm-optimized-routines/string/include/benchlib.h @@ -1,33 +1,33 @@ /* * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include /* Fast and accurate timer returning nanoseconds. */ static inline uint64_t clock_get_ns (void) { struct timespec ts; clock_gettime (CLOCK_MONOTONIC, &ts); return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; } /* Fast 32-bit random number generator. Passing a non-zero seed value resets the internal state. */ static inline uint32_t rand32 (uint32_t seed) { static uint64_t state = 0xb707be451df0bb19ULL; if (seed != 0) state = seed; uint32_t res = state >> 32; state = state * 6364136223846793005ULL + 1; return res; } diff --git a/contrib/arm-optimized-routines/string/include/stringlib.h b/contrib/arm-optimized-routines/string/include/stringlib.h index 85e630279ceb..f41a46446888 100644 --- a/contrib/arm-optimized-routines/string/include/stringlib.h +++ b/contrib/arm-optimized-routines/string/include/stringlib.h @@ -1,67 +1,67 @@ /* * Public API. * - * Copyright (c) 2019-2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include /* restrict is not needed, but kept for documenting the interface contract. */ #ifndef __restrict # define __restrict #endif #if __aarch64__ void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64 (void *, const void *, size_t); void *__memset_aarch64 (void *, int, size_t); void *__memchr_aarch64 (const void *, int, size_t); void *__memrchr_aarch64 (const void *, int, size_t); int __memcmp_aarch64 (const void *, const void *, size_t); char *__strcpy_aarch64 (char *__restrict, const char *__restrict); char *__stpcpy_aarch64 (char *__restrict, const char *__restrict); int __strcmp_aarch64 (const char *, const char *); char *__strchr_aarch64 (const char *, int); char *__strrchr_aarch64 (const char *, int); char *__strchrnul_aarch64 (const char *, int ); size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); char *__strrchr_aarch64_sve (const char *, int); char *__strchrnul_aarch64_sve (const char *, int ); int __strcmp_aarch64_sve 
(const char *, const char *); char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict); char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict); size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); # endif #elif __arm__ void *__memcpy_arm (void *__restrict, const void *__restrict, size_t); void *__memset_arm (void *, int, size_t); void *__memchr_arm (const void *, int, size_t); char *__strcpy_arm (char *__restrict, const char *__restrict); int __strcmp_arm (const char *, const char *); int __strcmp_armv6m (const char *, const char *); size_t __strlen_armv6t2 (const char *); #endif diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c index d8c02d92d626..c45fa6662a77 100644 --- a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c +++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c @@ -1,147 +1,147 @@ /* * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" static void mtag_quoteat (const char *prefix, void *p, int len, int at) { /* Print tag, untag and quote the context. 
*/ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); untag_buffer (p, len, 1); p = untag_pointer (p); quoteat (prefix, p, len, at); } #define F(x) {#x, x}, static const struct fun { const char *name; void *(*fun) (void *s, size_t n); } funtab[] = { // clang-format off #if __aarch64__ F(__mtag_tag_region) #endif {0, 0} // clang-format on }; #undef F #define A 64 #define LEN 250000 static unsigned char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *s = src + salign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || salign >= A) abort (); for (i = 0; i < len + 2 * A; i++) src[i] = '?'; for (i = 0; i < len; i++) s[i] = 'a'; src = tag_buffer (src, len + 2 * A, 1); s = src + salign; /* Use different tag. */ s = __arm_mte_increment_tag (s, 1); p = fun->fun (s, len); if (p != s) ERR ("%s(%p,..) returned %p\n", fun->name, s, p); for (i = 0; i < salign; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); mtag_quoteat ("got head", src, len + 2 * A, i); return; } } for (; i < salign + len; i++) { if (s[i - salign] != 'a') { ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); mtag_quoteat ("got body", src, len + 2 * A, i); return; } } for (; i < len + 2 * A; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); mtag_quoteat ("got tail", src, len + 2 * A, i); return; } } untag_buffer (src, len + 2 * A, 1); } int main () { if (!mte_enabled ()) return 0; sbuf = mte_mmap (LEN + 3 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int s = 0; s < A; s += 16) { int n; for (n = 0; n < 200; n += 16) { test (funtab + i, s, n); } for (; n < LEN; n *= 2) { test (funtab + i, s, n); } } printf ("%s %s\n", err_count ? 
"FAIL" : "PASS", funtab[i].name); if (err_count) r = -1; } return r; } #else int main () { return 0; } #endif diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c index 221c223a2f31..a4a7861620d1 100644 --- a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c +++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c @@ -1,147 +1,147 @@ /* * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" static void mtag_quoteat (const char *prefix, void *p, int len, int at) { /* Print tag, untag and quote the context. */ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); untag_buffer (p, len, 1); p = untag_pointer (p); quoteat (prefix, p, len, at); } #define F(x) {#x, x}, static const struct fun { const char *name; void *(*fun) (void *s, size_t n); } funtab[] = { // clang-format off #if __aarch64__ F(__mtag_tag_zero_region) #endif {0, 0} // clang-format on }; #undef F #define A 64 #define LEN 250000 static unsigned char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *s = src + salign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || salign >= A) abort (); for (i = 0; i < len + 2 * A; i++) src[i] = '?'; for (i = 0; i < len; i++) s[i] = 'a' + i % 23; src = tag_buffer (src, len + 2 * A, 1); s = src + salign; /* Use different tag. */ s = __arm_mte_increment_tag (s, 1); p = fun->fun (s, len); if (p != s) ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); for (i = 0; i < salign; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); mtag_quoteat ("got head", src, len + 2 * A, i); return; } } for (; i < salign + len; i++) { if (s[i - salign] != 0) { ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); mtag_quoteat ("got body", src, len + 2 * A, i); return; } } for (; i < len + 2 * A; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); mtag_quoteat ("got tail", src, len + 2 * A, i); return; } } untag_buffer (src, len + 2 * A, 1); } int main () { if (!mte_enabled ()) return 0; sbuf = mte_mmap (LEN + 3 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int s = 0; s < A; s += 16) { int n; for (n = 0; n < 200; n += 16) { test (funtab + i, s, n); } for (; n < LEN; n *= 2) { test (funtab + i, s, n); } } printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); if (err_count) r = -1; } return r; } #else int main () { return 0; } #endif diff --git a/contrib/arm-optimized-routines/string/test/memchr.c b/contrib/arm-optimized-routines/string/test/memchr.c index 0ff77f5710bf..c6a94481c0ad 100644 --- a/contrib/arm-optimized-routines/string/test/memchr.c +++ b/contrib/arm-optimized-routines/string/test/memchr.c @@ -1,110 +1,110 @@ /* * memchr test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (const void *s, int c, size_t n); int test_mte; } funtab[] = { // clang-format off F(memchr, 0) #if __aarch64__ F(__memchr_aarch64, 0) F(__memchr_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__memchr_aarch64_sve, 1) # endif #elif __arm__ F(__memchr_arm, 0) #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int align, size_t seekpos, size_t len, size_t maxlen) { char *src = alignup (sbuf); char *s = src + align; char *f = seekpos < maxlen ? s + seekpos : NULL; int seekchar = 1; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || seekpos > LEN || align > ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = seekchar; for (int i = 0; i <= ALIGN; i++) s[len + i] = seekchar; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); s[seekpos] = seekchar; s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar; int mte_len = seekpos != -1 ? seekpos + 1 : maxlen; s = tag_buffer (s, mte_len, fun->test_mte); p = fun->fun (s, seekchar, maxlen); untag_buffer (s, mte_len, fun->test_mte); p = untag_pointer (p); if (p != f) { ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s, seekchar, maxlen, p, f); quote ("input", s, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) { for (int sp = 0; sp < LEN; sp++) test (funtab + i, a, sp, n, n); test (funtab + i, a, n, n, SIZE_MAX - a); } char *pass = funtab[i].test_mte && mte_enabled () ? 
"MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memcmp.c b/contrib/arm-optimized-routines/string/test/memcmp.c index 7a7cf9cff35a..f9236b83a60d 100644 --- a/contrib/arm-optimized-routines/string/test/memcmp.c +++ b/contrib/arm-optimized-routines/string/test/memcmp.c @@ -1,125 +1,125 @@ /* * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; int (*fun) (const void *s1, const void *s2, size_t n); int test_mte; } funtab[] = { // clang-format off F(memcmp, 0) #if __aarch64__ F(__memcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__memcmp_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *s1buf; static unsigned char *s2buf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int s1align, int s2align, int len, int diffpos, int delta) { unsigned char *src1 = alignup (s1buf); unsigned char *src2 = alignup (s2buf); unsigned char *s1 = src1 + s1align; unsigned char *s2 = src2 + s2align; int r; if (err_count >= ERR_LIMIT) return; if (len > LEN || s1align >= A || s2align >= A) abort (); if (diffpos >= len) abort (); if ((diffpos < 0) != (delta == 0)) abort (); for (int i = 0; i < len + A; i++) src1[i] = src2[i] = '?'; for (int i = 0; i < len; i++) s1[i] = s2[i] = 'a' + i % 23; if (delta) s1[diffpos] += delta; s1 = tag_buffer (s1, len, fun->test_mte); s2 = tag_buffer (s2, len, fun->test_mte); r = fun->fun (s1, s2, len); untag_buffer (s1, len, fun->test_mte); untag_buffer (s2, len, fun->test_mte); if ((delta == 0 && r != 0) || (delta > 
0 && r <= 0) || (delta < 0 && r >= 0)) { ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name, s1align, s2align, len, r); quoteat ("src1", src1, len + A, diffpos); quoteat ("src2", src2, len + A, diffpos); } } int main () { s1buf = mte_mmap (LEN + 2 * A); s2buf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; test (funtab + i, d, s, 0, -1, 0); test (funtab + i, d, s, 1, -1, 0); test (funtab + i, d, s, 1, 0, -1); test (funtab + i, d, s, 1, 0, 1); for (n = 2; n < 100; n++) { test (funtab + i, d, s, n, -1, 0); test (funtab + i, d, s, n, 0, -1); test (funtab + i, d, s, n, n - 1, -1); test (funtab + i, d, s, n, n / 2, 1); } for (; n < LEN; n *= 2) { test (funtab + i, d, s, n, -1, 0); test (funtab + i, d, s, n, n / 2, -1); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memcpy.c b/contrib/arm-optimized-routines/string/test/memcpy.c index 21b35b990b9b..fa15a95b2bda 100644 --- a/contrib/arm-optimized-routines/string/test/memcpy.c +++ b/contrib/arm-optimized-routines/string/test/memcpy.c @@ -1,123 +1,123 @@ /* * memcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (void *, const void *, size_t); int test_mte; } funtab[] = { // clang-format off F(memcpy, 0) #if __aarch64__ F(__memcpy_aarch64, 1) # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve, 1) # endif #elif __arm__ F(__memcpy_arm, 0) #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *dbuf; static unsigned char *sbuf; static unsigned char wbuf[LEN + 2 * A]; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int dalign, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *dst = alignup (dbuf); unsigned char *want = wbuf; unsigned char *s = src + salign; unsigned char *d = dst + dalign; unsigned char *w = want + dalign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= A || salign >= A) abort (); for (i = 0; i < len + A; i++) { src[i] = '?'; want[i] = dst[i] = '*'; } for (i = 0; i < len; i++) s[i] = w[i] = 'a' + i % 23; s = tag_buffer (s, len, fun->test_mte); d = tag_buffer (d, len, fun->test_mte); p = fun->fun (d, s, len); untag_buffer (s, len, fun->test_mte); untag_buffer (d, len, fun->test_mte); if (p != d) ERR ("%s(%p,..) 
returned %p\n", fun->name, d, p); for (i = 0; i < len + A; i++) { if (dst[i] != want[i]) { ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + A, i); quoteat ("want", want, len + A, i); break; } } } int main () { dbuf = mte_mmap (LEN + 2 * A); sbuf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; for (n = 0; n < 100; n++) test (funtab + i, d, s, n); for (; n < LEN; n *= 2) test (funtab + i, d, s, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memmove.c b/contrib/arm-optimized-routines/string/test/memmove.c index 12a70574c7c5..5d509c03affa 100644 --- a/contrib/arm-optimized-routines/string/test/memmove.c +++ b/contrib/arm-optimized-routines/string/test/memmove.c @@ -1,167 +1,167 @@ /* * memmove test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (void *, const void *, size_t); int test_mte; } funtab[] = { // clang-format off F(memmove, 0) #if __aarch64__ F(__memmove_aarch64, 1) # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif # if __ARM_FEATURE_SVE F(__memmove_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *dbuf; static unsigned char *sbuf; static unsigned char wbuf[LEN + 2 * A]; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int dalign, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *dst = alignup (dbuf); unsigned char *want = wbuf; unsigned char *s = src + salign; unsigned char *d = dst + dalign; unsigned char *w = want + dalign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= A || salign >= A) abort (); for (i = 0; i < len + A; i++) { src[i] = '?'; want[i] = dst[i] = '*'; } for (i = 0; i < len; i++) s[i] = w[i] = 'a' + i % 23; p = fun->fun (d, s, len); if (p != d) ERR ("%s(%p,..) 
returned %p\n", fun->name, d, p); for (i = 0; i < len + A; i++) { if (dst[i] != want[i]) { ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + A, i); quoteat ("want", want, len + A, i); break; } } } static void test_overlap (const struct fun *fun, int dalign, int salign, int len) { unsigned char *src = alignup (sbuf); unsigned char *dst = src; unsigned char *want = wbuf; unsigned char *s = src + salign; unsigned char *d = dst + dalign; unsigned char *w = wbuf + dalign; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= A || salign >= A) abort (); for (int i = 0; i < len + A; i++) src[i] = want[i] = '?'; for (int i = 0; i < len; i++) s[i] = want[salign + i] = 'a' + i % 23; for (int i = 0; i < len; i++) w[i] = s[i]; s = tag_buffer (s, len, fun->test_mte); d = tag_buffer (d, len, fun->test_mte); p = fun->fun (d, s, len); untag_buffer (s, len, fun->test_mte); untag_buffer (d, len, fun->test_mte); if (p != d) ERR ("%s(%p,..) returned %p\n", fun->name, d, p); for (int i = 0; i < len + A; i++) { if (dst[i] != want[i]) { ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + A, i); quoteat ("want", want, len + A, i); break; } } } int main () { dbuf = mte_mmap (LEN + 2 * A); sbuf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; for (n = 0; n < 100; n++) { test (funtab + i, d, s, n); test_overlap (funtab + i, d, s, n); } for (; n < LEN; n *= 2) { test (funtab + i, d, s, n); test_overlap (funtab + i, d, s, n); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? 
"FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memrchr.c b/contrib/arm-optimized-routines/string/test/memrchr.c index adf96f049cc9..4171a56daefd 100644 --- a/contrib/arm-optimized-routines/string/test/memrchr.c +++ b/contrib/arm-optimized-routines/string/test/memrchr.c @@ -1,106 +1,106 @@ /* * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (const void *s, int c, size_t n); int test_mte; } funtab[] = { // clang-format off F(memrchr, 0) #if __aarch64__ F(__memrchr_aarch64, 1) #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN) & -ALIGN); } static void test (const struct fun *fun, int align, size_t seekpos, size_t len, size_t maxlen) { char *src = alignup (sbuf); char *s = src + align; char *f = seekpos < maxlen ? s + seekpos : NULL; int seekchar = 1; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || seekpos > LEN || align > ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = seekchar; for (int i = 0; i <= ALIGN; i++) s[len + i] = seekchar; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); s[seekpos] = seekchar; s[((len ^ align) & 1) && seekpos < maxlen ? 
seekpos - 1 : len] = seekchar; s = tag_buffer (s, maxlen, fun->test_mte); p = fun->fun (s, seekchar, maxlen); untag_buffer (s, maxlen, fun->test_mte); p = untag_pointer (p); if (p != f) { ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s, seekchar, maxlen, p, f); quote ("input", s, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) { for (int sp = 0; sp < LEN; sp++) test (funtab + i, a, sp, n, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/memset.c b/contrib/arm-optimized-routines/string/test/memset.c index f1721442dbaf..5543f44bb026 100644 --- a/contrib/arm-optimized-routines/string/test/memset.c +++ b/contrib/arm-optimized-routines/string/test/memset.c @@ -1,129 +1,129 @@ /* * memset test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; void *(*fun) (void *s, int c, size_t n); int test_mte; } funtab[] = { // clang-format off F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) #elif __arm__ F(__memset_arm, 0) #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static unsigned char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int salign, int c, int len) { unsigned char *src = alignup (sbuf); unsigned char *s = src + salign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || salign >= A) abort (); for (i = 0; i < len + A; i++) src[i] = '?'; for (i = 0; i < len; i++) s[i] = 'a' + i % 23; s = tag_buffer (s, len, fun->test_mte); p = fun->fun (s, c, len); untag_buffer (s, len, fun->test_mte); if (p != s) ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); for (i = 0; i < salign; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); quoteat ("got", src, len + A, i); return; } } for (; i < salign + len; i++) { if (src[i] != (unsigned char) c) { ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); quoteat ("got", src, len + A, i); return; } } for (; i < len + A; i++) { if (src[i] != '?') { ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); quoteat ("got", src, len + A, i); return; } } } int main () { sbuf = mte_mmap (LEN + 2 * A); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int s = 0; s < A; s++) { int n; for (n = 0; n < 100; n++) { test (funtab + i, s, 0, n); test (funtab + i, s, 0x25, n); test (funtab + i, s, 0xaa25, n); } for (; n < LEN; n *= 2) { test (funtab + i, s, 0, n); test (funtab + i, s, 0x25, n); test (funtab + i, s, 0xaa25, n); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/mte.h b/contrib/arm-optimized-routines/string/test/mte.h index e67cbd9d2d40..40b0ecf6c194 100644 --- a/contrib/arm-optimized-routines/string/test/mte.h +++ b/contrib/arm-optimized-routines/string/test/mte.h @@ -1,142 +1,142 @@ /* * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef __TEST_MTE_H #define __TEST_MTE_H #include #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST #include #include #include // These depend on a not yet merged kernel ABI. 
#define PR_SET_TAGGED_ADDR_CTRL 55 #define PR_TAGGED_ADDR_ENABLE (1UL << 0) #define PR_MTE_TCF_SHIFT 1 #define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) #define PR_MTE_TAG_SHIFT 3 #define PROT_MTE 0x20 #define MTE_GRANULE_SIZE 16 int mte_enabled () { static int enabled = -1; if (enabled == -1) { int res = prctl (PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC | (0xfffe << PR_MTE_TAG_SHIFT), 0, 0, 0); enabled = (res == 0); } return enabled; } static void * mte_mmap (size_t size) { if (mte_enabled ()) { return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); } else { return malloc (size); } } void * alignup_mte (void *p) { return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1) & ~(MTE_GRANULE_SIZE - 1)); } void * aligndown_mte (void *p) { return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1)); } void * untag_pointer (void *p) { return (void *) ((unsigned long long) p & (~0ULL >> 8)); } void tag_buffer_helper (void *p, int len) { char *ptr = p; char *end = alignup_mte (ptr + len); ptr = aligndown_mte (p); for (; ptr < end; ptr += MTE_GRANULE_SIZE) { __arm_mte_set_tag (ptr); } } void * tag_buffer (void *p, int len, int test_mte) { if (test_mte && mte_enabled ()) { p = __arm_mte_increment_tag (p, 1); tag_buffer_helper (p, len); } return p; } void * untag_buffer (void *p, int len, int test_mte) { p = untag_pointer (p); if (test_mte && mte_enabled ()) { tag_buffer_helper (p, len); } return p; } #else // __ARM_FEATURE_MEMORY_TAGGING int mte_enabled () { return 0; } static void * mte_mmap (size_t size) { return malloc (size); } void * tag_buffer (void *p, int len, int test_mte) { (void) len; (void) test_mte; return p; } void * untag_buffer (void *p, int len, int test_mte) { (void) len; (void) test_mte; return p; } void * untag_pointer (void *p) { return p; } #endif // __ARM_FEATURE_MEMORY_TAGGING #endif diff --git a/contrib/arm-optimized-routines/string/test/stpcpy.c 
b/contrib/arm-optimized-routines/string/test/stpcpy.c index 1b61245bf8df..0300892a1f3c 100644 --- a/contrib/arm-optimized-routines/string/test/stpcpy.c +++ b/contrib/arm-optimized-routines/string/test/stpcpy.c @@ -1,124 +1,124 @@ /* * stpcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; char *(*fun) (char *dest, const char *src); int test_mte; } funtab[] = { // clang-format off F(stpcpy, 0) #if __aarch64__ F(__stpcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *dbuf; static char *sbuf; static char wbuf[LEN + 3 * ALIGN]; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int dalign, int salign, int len) { char *src = alignup (sbuf); char *dst = alignup (dbuf); char *want = wbuf; char *s = src + salign; char *d = dst + dalign; char *w = want + dalign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= ALIGN || salign >= ALIGN) abort (); for (i = 0; i < len + ALIGN; i++) { src[i] = '?'; want[i] = dst[i] = '*'; } for (int i = 0; src + i < s; i++) src[i] = 0; for (int i = 1; i <= ALIGN; i++) s[len + i] = (len + salign) & 1 ? 1 : 0; for (i = 0; i < len; i++) s[i] = w[i] = 'a' + (i & 31); s[len] = w[len] = '\0'; s = tag_buffer (s, len + 1, fun->test_mte); d = tag_buffer (d, len + 1, fun->test_mte); p = fun->fun (d, s); untag_buffer (s, len + 1, fun->test_mte); untag_buffer (d, len + 1, fun->test_mte); if (p != d + len) ERR ("%s (%p,..) 
returned %p expected %p\n", fun->name, d, p, d + len); for (i = 0; i < len + ALIGN; i++) { if (dst[i] != want[i]) { ERR ("%s (align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + ALIGN, i); quoteat ("want", want, len + ALIGN, i); break; } } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); dbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < ALIGN; d++) for (int s = 0; s < ALIGN; s++) for (int n = 0; n < LEN; n++) test (funtab + i, d, s, n); char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strchr.c b/contrib/arm-optimized-routines/string/test/strchr.c index f3ae982ef0ad..66180acfb57c 100644 --- a/contrib/arm-optimized-routines/string/test/strchr.c +++ b/contrib/arm-optimized-routines/string/test/strchr.c @@ -1,121 +1,121 @@ /* * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; char *(*fun) (const char *s, int c); int test_mte; } funtab[] = { // clang-format off F(strchr, 0) #if __aarch64__ F(__strchr_aarch64, 0) F(__strchr_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strchr_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int align, int seekpos, int len) { char *src = alignup (sbuf); char *s = src + align; char *f = seekpos != -1 ? 
s + seekpos : 0; int seekchar = 0x1; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || seekpos >= len || align >= ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = (i + len) & 1 ? seekchar : 0; for (int i = 1; i <= ALIGN; i++) s[len + i] = (i + len) & 1 ? seekchar : 0; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); if (seekpos != -1) s[seekpos] = seekchar; if (seekpos != -1 && (len + align) & 1) s[seekpos + 1] = seekchar; s[len] = '\0'; s = tag_buffer (s, len + 1, fun->test_mte); p = fun->fun (s, seekchar); untag_buffer (s, len + 1, fun->test_mte); p = untag_pointer (p); if (p != f) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", fun->name, s, seekchar, len, p, f, seekpos); quote ("input", s, len); } s = tag_buffer (s, len + 1, fun->test_mte); p = fun->fun (s, 0); untag_buffer (s, len + 1, fun->test_mte); if (p != s + len) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", fun->name, s, 0, len, p, f, len); quote ("input", s, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) { for (int sp = 0; sp < n; sp++) test (funtab + i, a, sp, n); test (funtab + i, a, -1, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strchrnul.c b/contrib/arm-optimized-routines/string/test/strchrnul.c index 6c30ab2123f1..aad0bf59da66 100644 --- a/contrib/arm-optimized-routines/string/test/strchrnul.c +++ b/contrib/arm-optimized-routines/string/test/strchrnul.c @@ -1,126 +1,126 @@ /* * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; char *(*fun) (const char *s, int c); int test_mte; } funtab[] = { // clang-format off F(strchrnul, 0) #if __aarch64__ F(__strchrnul_aarch64, 0) F(__strchrnul_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strchrnul_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int align, int seekpos, int len) { char *src = alignup (sbuf); char *s = src + align; char *f = seekpos != -1 ? s + seekpos : s + len; int seekchar = 0x1; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || seekpos >= len || align >= ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = (i + len) & 1 ? seekchar : 0; for (int i = 1; i <= ALIGN; i++) s[len + i] = (i + len) & 1 ? seekchar : 0; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); if (seekpos != -1) s[seekpos] = seekchar; if (seekpos != -1 && (len + align) & 1) s[seekpos + 1] = seekchar; s[len] = '\0'; int mte_len = seekpos != -1 ? 
seekpos + 1 : len + 1; s = tag_buffer (s, mte_len, fun->test_mte); p = fun->fun (s, seekchar); untag_buffer (s, mte_len, fun->test_mte); p = untag_pointer (p); if (p != f) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", fun->name, s, seekchar, len, p, f, seekpos); quote ("input", s, len); } s = tag_buffer (s, len + 1, fun->test_mte); p = fun->fun (s, 0); untag_buffer (s, len + 1, fun->test_mte); if (p != s + len) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", fun->name, s, 0, len, p, f, len); quote ("input", s, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) { for (int sp = 0; sp < n; sp++) test (funtab + i, a, sp, n); test (funtab + i, a, -1, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strcmp.c b/contrib/arm-optimized-routines/string/test/strcmp.c index 0262397dec88..4aa95f4f2f1d 100644 --- a/contrib/arm-optimized-routines/string/test/strcmp.c +++ b/contrib/arm-optimized-routines/string/test/strcmp.c @@ -1,131 +1,131 @@ /* * strcmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; int (*fun) (const char *s1, const char *s2); int test_mte; } funtab[] = { // clang-format off F(strcmp, 0) #if __aarch64__ F(__strcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif #elif __arm__ # if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 F(__strcmp_arm, 0) # elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 F(__strcmp_armv6m, 0) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static char *s1buf; static char *s2buf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int s1align, int s2align, int len, int diffpos, int delta) { char *src1 = alignup (s1buf); char *src2 = alignup (s2buf); char *s1 = src1 + s1align; char *s2 = src2 + s2align; int r; if (err_count >= ERR_LIMIT) return; if (len > LEN || s1align >= A || s2align >= A) abort (); if (diffpos >= len) abort (); if ((diffpos < 0) != (delta == 0)) abort (); for (int i = 0; i < len + A; i++) src1[i] = src2[i] = '?'; for (int i = 0; i < len; i++) s1[i] = s2[i] = 'a' + i % 23; if (delta) s1[diffpos] += delta; s1[len] = s2[len] = '\0'; s1 = tag_buffer (s1, len + 1, fun->test_mte); s2 = tag_buffer (s2, len + 1, fun->test_mte); r = fun->fun (s1, s2); untag_buffer (s1, len + 1, fun->test_mte); untag_buffer (s2, len + 1, fun->test_mte); if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) { ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name, s1align, s2align, len, r); quoteat ("src1", src1, len + A, diffpos); quoteat ("src2", src2, len + A, diffpos); } } int main () { s1buf = mte_mmap (LEN + 2 * A + 1); s2buf = mte_mmap (LEN + 2 * A + 1); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for 
(int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; test (funtab + i, d, s, 0, -1, 0); test (funtab + i, d, s, 1, -1, 0); test (funtab + i, d, s, 1, 0, 1); test (funtab + i, d, s, 1, 0, -1); for (n = 2; n < 100; n++) { test (funtab + i, d, s, n, -1, 0); test (funtab + i, d, s, n, n - 1, -1); test (funtab + i, d, s, n, n / 2, 1); } for (; n < LEN; n *= 2) { test (funtab + i, d, s, n, -1, 0); test (funtab + i, d, s, n, n / 2, -1); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strcpy.c b/contrib/arm-optimized-routines/string/test/strcpy.c index 6de3bed590ef..af297f90396a 100644 --- a/contrib/arm-optimized-routines/string/test/strcpy.c +++ b/contrib/arm-optimized-routines/string/test/strcpy.c @@ -1,122 +1,122 @@ /* * strcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; char *(*fun) (char *dest, const char *src); int test_mte; } funtab[] = { // clang-format off F(strcpy, 0) #if __aarch64__ F(__strcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif #elif __arm__ && defined (__thumb2__) && !defined (__thumb__) F(__strcpy_arm, 0) #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *dbuf; static char *sbuf; static char wbuf[LEN + 3 * ALIGN]; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int dalign, int salign, int len) { char *src = alignup (sbuf); char *dst = alignup (dbuf); char *want = wbuf; char *s = src + salign; char *d = dst + dalign; char *w = want + dalign; void *p; int i; if (err_count >= ERR_LIMIT) return; if (len > LEN || dalign >= ALIGN || salign >= ALIGN) abort (); for (i = 0; i < len + ALIGN; i++) { src[i] = '?'; want[i] = dst[i] = '*'; } for (int i = 0; src + i < s; i++) src[i] = 0; for (int i = 1; i <= ALIGN; i++) s[len + i] = (len + salign) & 1 ? 1 : 0; for (i = 0; i < len; i++) s[i] = w[i] = 'a' + (i & 31); s[len] = w[len] = '\0'; s = tag_buffer (s, len + 1, fun->test_mte); d = tag_buffer (d, len + 1, fun->test_mte); p = fun->fun (d, s); untag_buffer (s, len + 1, fun->test_mte); untag_buffer (d, len + 1, fun->test_mte); if (p != d) ERR ("%s (%p,..) 
returned %p\n", fun->name, d, p); for (i = 0; i < len + ALIGN; i++) { if (dst[i] != want[i]) { ERR ("%s (align %d, align %d, %d) failed\n", fun->name, dalign, salign, len); quoteat ("got", dst, len + ALIGN, i); quoteat ("want", want, len + ALIGN, i); break; } } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); dbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < ALIGN; d++) for (int s = 0; s < ALIGN; s++) for (int n = 0; n < LEN; n++) test (funtab + i, d, s, n); char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/stringtest.h b/contrib/arm-optimized-routines/string/test/stringtest.h index fe855fc21736..6bb7e1fdfeca 100644 --- a/contrib/arm-optimized-routines/string/test/stringtest.h +++ b/contrib/arm-optimized-routines/string/test/stringtest.h @@ -1,55 +1,55 @@ /* * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include /* Accounting errors for a test case. */ static int err_count; #define ERR_LIMIT 10 #define ERR(...) (err_count++, printf (__VA_ARGS__)) static inline void quotechar (unsigned char c) { if (isprint (c)) putchar (c); else printf ("\\x%02x", c); } /* quoted print around at or the entire string if at < 0. 
*/ static void quoteat (const char *prefix, const void *p, int len, int at) { static const int CTXLEN = 15; int i; const char *pre = "\""; const char *post = "\""; const char *s = p; if (at > CTXLEN) { s += at - CTXLEN; len -= at - CTXLEN; pre = "...\""; } if (at >= 0 && len > 2 * CTXLEN + 1) { len = 2 * CTXLEN + 1; post = "\"..."; } printf ("%4s: %s", prefix, pre); for (i = 0; i < len; i++) quotechar (s[i]); printf ("%s\n", post); } static inline void quote (const char *prefix, const void *p, int len) { quoteat (prefix, p, len, -1); } diff --git a/contrib/arm-optimized-routines/string/test/strlen.c b/contrib/arm-optimized-routines/string/test/strlen.c index 6278380f26df..47ef3dcf0ef0 100644 --- a/contrib/arm-optimized-routines/string/test/strlen.c +++ b/contrib/arm-optimized-routines/string/test/strlen.c @@ -1,103 +1,102 @@ /* * strlen test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include -#include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; size_t (*fun) (const char *s); int test_mte; } funtab[] = { // clang-format off F(strlen, 0) #if __aarch64__ F(__strlen_aarch64, 0) F(__strlen_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strlen_aarch64_sve, 1) # endif #elif __arm__ # if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 F(__strlen_armv6t2, 0) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int align, int len) { char *src = alignup (sbuf); char *s = src + align; size_t r; if (err_count >= ERR_LIMIT) return; if (len > LEN || align >= ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = 0; for 
(int i = 1; i <= ALIGN; i++) s[len + i] = (len + align) & 1 ? 1 : 0; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); s[len] = '\0'; s = tag_buffer (s, len + 1, fun->test_mte); r = fun->fun (s); untag_buffer (s, len + 1, fun->test_mte); if (r != len) { ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len); quote ("input", src, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) test (funtab + i, a, n); char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strncmp.c b/contrib/arm-optimized-routines/string/test/strncmp.c index f8c2167f8f1e..4bbab6f93450 100644 --- a/contrib/arm-optimized-routines/string/test/strncmp.c +++ b/contrib/arm-optimized-routines/string/test/strncmp.c @@ -1,138 +1,138 @@ /* * strncmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; int (*fun) (const char *, const char *, size_t); int test_mte; } funtab[] = { // clang-format off F(strncmp, 0) #if __aarch64__ F(__strncmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define A 32 #define LEN 250000 static char *s1buf; static char *s2buf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + A - 1) & -A); } static void test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len, int delta) { char *src1 = alignup (s1buf); char *src2 = alignup (s2buf); char *s1 = src1 + s1align; char *s2 = src2 + s2align; int r; if (err_count >= ERR_LIMIT) return; if (len > LEN || s1align >= A || s2align >= A) abort (); if (diffpos >= len) abort (); if ((diffpos < 0) != (delta == 0)) abort (); for (int i = 0; i < len + A; i++) src1[i] = src2[i] = '?'; for (int i = 0; i < len; i++) s1[i] = s2[i] = 'a' + i % 23; if (delta) s1[diffpos] += delta; s1[len] = s2[len] = '\0'; size_t mte_len = maxlen < len + 1 ? 
maxlen : len + 1; s1 = tag_buffer (s1, mte_len, fun->test_mte); s2 = tag_buffer (s2, mte_len, fun->test_mte); r = fun->fun (s1, s2, maxlen); untag_buffer (s1, mte_len, fun->test_mte); untag_buffer (s2, mte_len, fun->test_mte); if (diffpos >= maxlen) { diffpos = -1; delta = 0; } if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) { ERR ( "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n", fun->name, s1align, s2align, maxlen, len, diffpos, r); quoteat ("src1", src1, len + A, diffpos); quoteat ("src2", src2, len + A, diffpos); } } int main () { s1buf = mte_mmap (LEN + 2 * A + 1); s2buf = mte_mmap (LEN + 2 * A + 1); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int d = 0; d < A; d++) for (int s = 0; s < A; s++) { int n; test (funtab + i, d, s, 0, -1, 0, 0); test (funtab + i, d, s, 1, -1, 0, 0); test (funtab + i, d, s, 0, -1, 1, 0); test (funtab + i, d, s, 1, -1, 1, 0); test (funtab + i, d, s, 2, -1, 1, 0); test (funtab + i, d, s, 1, 0, 1, 1); test (funtab + i, d, s, 1, 0, 1, -1); for (n = 2; n < 100; n++) { test (funtab + i, d, s, n, -1, n, 0); test (funtab + i, d, s, n, n / 2, n, 1); test (funtab + i, d, s, n / 2, -1, n, 0); test (funtab + i, d, s, n / 2, n / 2, n, -1); } for (; n < LEN; n *= 2) { test (funtab + i, d, s, n, -1, n, 0); test (funtab + i, d, s, n, n / 2, n, -1); test (funtab + i, d, s, n / 2, -1, n, 0); test (funtab + i, d, s, n / 2, n / 2, n, 1); } } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strnlen.c b/contrib/arm-optimized-routines/string/test/strnlen.c index 0dea00eaf8e3..a800fd1993cd 100644 --- a/contrib/arm-optimized-routines/string/test/strnlen.c +++ b/contrib/arm-optimized-routines/string/test/strnlen.c @@ -1,109 +1,109 @@ /* * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; size_t (*fun) (const char *s, size_t m); int test_mte; } funtab[] = { // clang-format off F(strnlen, 0) #if __aarch64__ F(__strnlen_aarch64, 1) # if __ARM_FEATURE_SVE F(__strnlen_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int align, size_t maxlen, size_t len) { char *src = alignup (sbuf); char *s = src + align; size_t r; size_t e = maxlen < len ? maxlen : len; if (err_count >= ERR_LIMIT) return; if (len > LEN || align >= ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = 0; for (int i = 1; i <= ALIGN; i++) s[len + i] = (len + align) & 1 ? 1 : 0; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); s[len] = 0; if ((len + align) & 1) s[e + 1] = 0; size_t mte_len = maxlen < len + 1 ? maxlen : len + 1; s = tag_buffer (s, mte_len, fun->test_mte); r = fun->fun (s, maxlen); untag_buffer (s, mte_len, fun->test_mte); if (r != e) { ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n", fun->name, s, maxlen, len, r, e); quote ("input", s, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) { for (int maxlen = 0; maxlen < LEN; maxlen++) test (funtab + i, a, maxlen, n); test (funtab + i, a, SIZE_MAX - a, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? 
"FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/test/strrchr.c b/contrib/arm-optimized-routines/string/test/strrchr.c index fedbdc52fcc1..580ca497f8a4 100644 --- a/contrib/arm-optimized-routines/string/test/strrchr.c +++ b/contrib/arm-optimized-routines/string/test/strrchr.c @@ -1,121 +1,121 @@ /* * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include #include #include "mte.h" #include "stringlib.h" #include "stringtest.h" #define F(x, mte) {#x, x, mte}, static const struct fun { const char *name; char *(*fun) (const char *s, int c); int test_mte; } funtab[] = { // clang-format off F(strrchr, 0) #if __aarch64__ F(__strrchr_aarch64, 0) F(__strrchr_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strrchr_aarch64_sve, 1) # endif #endif {0, 0, 0} // clang-format on }; #undef F #define ALIGN 32 #define LEN 512 static char *sbuf; static void * alignup (void *p) { return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); } static void test (const struct fun *fun, int align, int seekpos, int len) { char *src = alignup (sbuf); char *s = src + align; char *f = seekpos != -1 ? s + seekpos : 0; int seekchar = 0x1; void *p; if (err_count >= ERR_LIMIT) return; if (len > LEN || seekpos >= len || align >= ALIGN) abort (); for (int i = 0; src + i < s; i++) src[i] = (i + len) & 1 ? seekchar : 0; for (int i = 1; i <= ALIGN; i++) s[len + i] = (i + len) & 1 ? 
seekchar : 0; for (int i = 0; i < len; i++) s[i] = 'a' + (i & 31); if (seekpos != -1) s[seekpos / 2] = s[seekpos] = seekchar; if (seekpos > 0 && (len + align) & 1) s[seekpos - 1] = seekchar; s[len] = '\0'; s = tag_buffer (s, len + 1, fun->test_mte); p = fun->fun (s, seekchar); untag_buffer (s, len + 1, fun->test_mte); p = untag_pointer (p); if (p != f) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", fun->name, s, seekchar, len, p, f, seekpos); quote ("input", s, len); } s = tag_buffer (s, len + 1, fun->test_mte); p = fun->fun (s, 0); untag_buffer (s, len + 1, fun->test_mte); if (p != s + len) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", fun->name, s, 0, len, p, s + len, len); quote ("input", s, len); } } int main (void) { sbuf = mte_mmap (LEN + 3 * ALIGN); int r = 0; for (int i = 0; funtab[i].name; i++) { err_count = 0; for (int a = 0; a < ALIGN; a++) for (int n = 0; n < LEN; n++) { for (int sp = 0; sp < n; sp++) test (funtab + i, a, sp, n); test (funtab + i, a, -1, n); } char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); if (err_count) r = -1; } return r; } diff --git a/contrib/arm-optimized-routines/string/x86_64/check-arch.S b/contrib/arm-optimized-routines/string/x86_64/check-arch.S index 26ade0a0c7db..5afcf7b7ee54 100644 --- a/contrib/arm-optimized-routines/string/x86_64/check-arch.S +++ b/contrib/arm-optimized-routines/string/x86_64/check-arch.S @@ -1,10 +1,10 @@ /* * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__x86_64__ # error ARCH setting does not match the compiler. #endif