Index: contrib/cortex-strings/.gitignore =================================================================== --- contrib/cortex-strings/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -*.a -*.o -*.la -*.lo -*.png -*.pyc -.deps -.dirstamp -.libs -try-* -cache.txt Index: contrib/cortex-strings/Makefile.am =================================================================== --- contrib/cortex-strings/Makefile.am +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (c) 2011, Linaro Limited -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the Linaro nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# - -# Top level Makefile for cortex-strings - -# Used to record the compiler version in the executables -COMPILER = $(shell $(CC) --version 2>&1 | head -n1) - -# The main library -lib_LTLIBRARIES = \ - libcortex-strings.la - -## Test suite -check_PROGRAMS = \ - tests/test-memchr \ - tests/test-memcmp \ - tests/test-memcpy \ - tests/test-memmove \ - tests/test-memset \ - tests/test-strchr \ - tests/test-strcmp \ - tests/test-strcpy \ - tests/test-strlen \ - tests/test-strncmp \ - tests/test-strnlen - -# Options for the tests -tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS) -tests_ldadd = libcortex-strings.la -tests_test_memchr_LDADD = $(tests_ldadd) -tests_test_memchr_CFLAGS = $(tests_cflags) -tests_test_memcmp_LDADD = $(tests_ldadd) -tests_test_memcmp_CFLAGS = $(tests_cflags) -tests_test_memcpy_LDADD = $(tests_ldadd) -tests_test_memcpy_CFLAGS = $(tests_cflags) -tests_test_memmove_LDADD = $(tests_ldadd) -tests_test_memmove_CFLAGS = $(tests_cflags) -tests_test_memset_LDADD = $(tests_ldadd) -tests_test_memset_CFLAGS = $(tests_cflags) -tests_test_strchr_LDADD = $(tests_ldadd) -tests_test_strchr_CFLAGS = $(tests_cflags) -tests_test_strcmp_LDADD = $(tests_ldadd) -tests_test_strcmp_CFLAGS = $(tests_cflags) -tests_test_strcpy_LDADD = $(tests_ldadd) -tests_test_strcpy_CFLAGS = $(tests_cflags) -tests_test_strlen_LDADD = $(tests_ldadd) -tests_test_strlen_CFLAGS = $(tests_cflags) -tests_test_strncmp_LDADD = $(tests_ldadd) -tests_test_strncmp_CFLAGS = $(tests_cflags) - -TESTS = $(check_PROGRAMS) - -## Benchmarks -noinst_PROGRAMS = \ - dhry \ - dhry-native \ - try-none \ - try-this \ - try-plain \ - try-newlib-c \ - try-bionic-c \ - try-glibc-c - -# Good 'ol Dhrystone -dhry_SOURCES = \ - benchmarks/dhry/dhry_1.c \ - benchmarks/dhry/dhry_2.c \ - benchmarks/dhry/dhry.h - -dhry_CFLAGS = -Dcompiler="\"$(COMPILER)\"" -Doptions="\"$(CFLAGS)\"" -dhry_LDADD = libcortex-strings.la - -dhry_native_SOURCES = $(dhry_SOURCES) -dhry_native_CFLAGS = $(dhry_CFLAGS) - -# Benchmark harness -noinst_LIBRARIES = \ - libmulti.a \ - libbionic-c.a \ - libglibc-c.a \ - libnewlib-c.a \ - libplain.a - -libmulti_a_SOURCES = \ - benchmarks/multi/harness.c - -libmulti_a_CFLAGS = -DVERSION=\"$(VERSION)\" $(AM_CFLAGS) - -## Other architecture independant implementaions -libbionic_c_a_SOURCES = \ - reference/bionic-c/bcopy.c \ - reference/bionic-c/memchr.c \ - reference/bionic-c/memcmp.c \ - reference/bionic-c/memcpy.c \ - reference/bionic-c/memset.c \ - reference/bionic-c/strchr.c \ - reference/bionic-c/strcmp.c \ - reference/bionic-c/strcpy.c \ - reference/bionic-c/strlen.c - -libglibc_c_a_SOURCES = \ - reference/glibc-c/memchr.c \ - reference/glibc-c/memcmp.c \ - reference/glibc-c/memcpy.c \ - reference/glibc-c/memset.c \ - reference/glibc-c/strchr.c \ - reference/glibc-c/strcmp.c \ - reference/glibc-c/strcpy.c \ - reference/glibc-c/strlen.c \ - reference/glibc-c/wordcopy.c \ - reference/glibc-c/memcopy.h \ - reference/glibc-c/pagecopy.h - -libnewlib_c_a_SOURCES = \ - reference/newlib-c/memchr.c \ - reference/newlib-c/memcmp.c \ - reference/newlib-c/memcpy.c \ - reference/newlib-c/memset.c \ - reference/newlib-c/strchr.c \ - reference/newlib-c/strcmp.c \ - reference/newlib-c/strcpy.c \ - reference/newlib-c/strlen.c \ - reference/newlib-c/shim.h - -libplain_a_SOURCES = \ - reference/plain/memset.c \ - reference/plain/memcpy.c \ - reference/plain/strcmp.c \ - reference/plain/strcpy.c - -try_none_SOURCES = -try_none_LDADD = libmulti.a -lrt -try_this_SOURCES = -try_this_LDADD = libmulti.a libcortex-strings.la -lrt -try_bionic_c_SOURCES = -try_bionic_c_LDADD = libmulti.a libbionic-c.a -lrt -try_glibc_c_SOURCES = -try_glibc_c_LDADD = libmulti.a libglibc-c.a -lrt -try_newlib_c_SOURCES = -try_newlib_c_LDADD = libmulti.a libnewlib-c.a -lrt -try_plain_SOURCES = -try_plain_LDADD = libmulti.a libplain.a -lrt - -# Architecture specific - -if HOST_AARCH32 - -if WITH_NEON -# Pull in the NEON specific files -neon_bionic_a9_sources = \ - reference/bionic-a9/memcpy.S \ - reference/bionic-a9/memset.S -neon_bionic_a15_sources = \ - reference/bionic-a15/memcpy.S \ - reference/bionic-a15/memset.S -fpu_flags = -mfpu=neon -else -if WITH_VFP -fpu_flags = -mfpu=vfp -else -fpu_flags = -msoft-float -endif -endif - -# Benchmarks and example programs -noinst_PROGRAMS += \ - try-bionic-a9 \ - try-bionic-a15 \ - try-csl \ - try-glibc \ - try-newlib \ - try-newlib-xscale - -# Libraries used in the benchmarks and examples -noinst_LIBRARIES += \ - libbionic-a9.a \ - libbionic-a15.a \ - libcsl.a \ - libglibc.a \ - libnewlib.a \ - libnewlib-xscale.a - -# Main library -libcortex_strings_la_SOURCES = \ - src/thumb-2/strcpy.c \ - src/arm/memchr.S \ - src/arm/strchr.S \ - src/thumb-2/strlen.S \ - src/arm/memset.S \ - src/arm/memcpy.S \ - src/arm/strcmp.S - -# Libraries containing the difference reference versions -libbionic_a9_a_SOURCES = \ - $(neon_bionic_a9_sources) \ - reference/bionic-a9/memcmp.S \ - reference/bionic-a9/strcmp.S \ - reference/bionic-a9/strcpy.S \ - reference/bionic-a9/strlen.c - -libbionic_a9_a_CFLAGS = -Wa,-mimplicit-it=thumb - -libbionic_a15_a_SOURCES = \ - $(neon_bionic_a15_sources) \ - reference/bionic-a15/memcmp.S \ - reference/bionic-a15/strcmp.S \ - reference/bionic-a15/strcpy.S \ - reference/bionic-a15/strlen.c - -libbionic_a15_a_CFLAGS = -Wa,-mimplicit-it=thumb - -libcsl_a_SOURCES = \ - reference/csl/memcpy.c \ - reference/csl/memset.c \ - reference/csl/arm_asm.h - -libglibc_a_SOURCES = \ - reference/glibc/memcpy.S \ - reference/glibc/memset.S \ - reference/glibc/strchr.S \ - reference/glibc/strlen.S - -libnewlib_a_SOURCES = \ - reference/newlib/memcpy.S \ - reference/newlib/strcmp.S \ - reference/newlib/strcpy.c \ - reference/newlib/strlen.c \ - reference/newlib/arm_asm.h \ - reference/newlib/shim.h - -libnewlib_xscale_a_SOURCES = \ - reference/newlib-xscale/memchr.c \ - reference/newlib-xscale/memcpy.c \ - reference/newlib-xscale/memset.c \ - reference/newlib-xscale/strchr.c \ - reference/newlib-xscale/strcmp.c \ - reference/newlib-xscale/strcpy.c \ - reference/newlib-xscale/strlen.c \ - reference/newlib-xscale/xscale.h - -# Flags for the benchmark helpers -try_bionic_a9_SOURCES = -try_bionic_a9_LDADD = libmulti.a libbionic-a9.a -lrt -try_bionic_a15_SOURCES = -try_bionic_a15_LDADD = libmulti.a libbionic-a15.a -lrt -try_csl_SOURCES = -try_csl_LDADD = libmulti.a libcsl.a -lrt -try_glibc_SOURCES = -try_glibc_LDADD = libmulti.a libglibc.a -lrt -try_newlib_SOURCES = -try_newlib_LDADD = libmulti.a libnewlib.a -lrt -try_newlib_xscale_SOURCES = -try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt - -AM_CPPFLAGS = $(fpu_flags) -AM_LDFLAGS = $(fpu_flags) - -endif - -# aarch64 specific -if HOST_AARCH64 - -libcortex_strings_la_SOURCES = \ - src/aarch64/memchr.S \ - src/aarch64/memcmp.S \ - src/aarch64/memcpy.S \ - src/aarch64/memmove.S \ - src/aarch64/memset.S \ - src/aarch64/strchr.S \ - src/aarch64/strchrnul.S \ - src/aarch64/strcmp.S \ - src/aarch64/strcpy.S \ - src/aarch64/strlen.S \ - src/aarch64/strncmp.S \ - src/aarch64/strnlen.S - -endif - -libcortex_strings_la_LDFLAGS = -version-info 1:0:0 - -AM_CFLAGS = \ - -std=gnu99 -Wall \ - -fno-builtin -fno-stack-protector -U_FORTIFY_SOURCE \ - $(AM_CPPFLAGS) - -if WITH_SUBMACHINE -AM_CFLAGS += \ - -mtune=$(submachine) -endif - -EXTRA_DIST = \ - tests/hp-timing.h \ - tests/test-string.h \ - tests/test-skeleton.c \ - scripts/add-license.sh \ - scripts/bench.py \ - scripts/fixup.py \ - scripts/libplot.py \ - scripts/plot-align.py \ - scripts/plot.py \ - scripts/plot-sizes.py \ - scripts/plot-top.py \ - scripts/trim.sh \ - autogen.sh Index: contrib/cortex-strings/README =================================================================== --- contrib/cortex-strings/README +++ /dev/null @@ -1,111 +0,0 @@ -= Cortex-A String Routines = - -This package contains optimised string routines including memcpy(), memset(), -strcpy(), strlen() for the ARM Cortex-A series of cores. - -Various implementations of these routines are provided, including generic -implementations for ARMv7-A cores with/without Neon, Thumb2 implementations -and generic implementations for cores supporting AArch64. - -== Getting started == -First configure and then install libcortex-strings.so. To make other -applications use this library, either add -lcortex-strings to the link -command or use LD_PRELOAD to load the library into existing applications. - -Our intent is to get these routines into the common C libraries such -as GLIBC, Bionic, and Newlib. Your system may already include them! - -== Contents == - * src/ contains the routines themselves - * tests/ contains the unit tests - * reference/ contains reference copies of other ARM-focused - implementations gathered from around the Internet - * benchmarks/ contains various benchmarks, tools, and scripts used to - check and report on the different implementations. - -The src directory contains different variants organised by the -implementation they run on and optional features used. For example: - * src/thumb-2 contains generic non-NEON routines for AArch32 (with Thumb-2). - * src/arm contains tuned routines for Cortex-A class processors. - * src/aarch64 contains generic routines for AArch64. - * src/thumb contains generic routines for armv6-M (with Thumb). - -== Reference versions == -reference/ contains versions collected from various popular Open -Source libraries. These have been modified for use in benchmarking. -Please refer to the individual files for any licensing terms. - -The routines were collected from the following releases: - * EGLIBC 2.13 - * Newlib 1.19.0 - * Bionic android-2.3.5_r1 - -== Licensing == -All Linaro-authored routines are under the modified BSD license: - -Copyright (c) 2011, Linaro Limited -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -All ARM-authored routines are under the modified BSD license: - -Copyright (c) 2014 ARM Ltd -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -All third party routines are under a GPL compatible license. - -== Notes and Limitations == -Some of the implementations have been collected from other -projects and have a variety of licenses and copyright holders. - -== Style == -Assembly code attempts to follow the GLIBC coding convetions. They -are: - * Copyright headers in C style comment blocks - * Instructions indented with one tab - * Operands indented with one tab - * Text is wrapped at 70 characters - * End of line comments are fine Index: contrib/cortex-strings/autogen.sh =================================================================== --- contrib/cortex-strings/autogen.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/sh -# -# autogen.sh glue for hplip -# -# HPLIP used to have five or so different autotools trees. Upstream -# has reduced it to two. Still, this script is capable of cleaning -# just about any possible mess of autoconf files. -# -# BE CAREFUL with trees that are not completely automake-generated, -# this script deletes all Makefile.in files it can find. -# -# Requires: automake 1.9, autoconf 2.57+ -# Conflicts: autoconf 2.13 -set -e - -# Refresh GNU autotools toolchain. -echo Cleaning autotools files... -find -type d -name autom4te.cache -print0 | xargs -0 rm -rf \; -find -type f \( -name missing -o -name install-sh -o -name mkinstalldirs \ - -o -name depcomp -o -name ltmain.sh -o -name configure \ - -o -name config.sub -o -name config.guess \ - -o -name Makefile.in \) -print0 | xargs -0 rm -f - -echo Running autoreconf... -autoreconf --force --install - -# For the Debian package build -test -d debian && { - # link these in Debian builds - rm -f config.sub config.guess - ln -s /usr/share/misc/config.sub . - ln -s /usr/share/misc/config.guess . - - # refresh list of executable scripts, to avoid possible breakage if - # upstream tarball does not include the file or if it is mispackaged - # for whatever reason. - [ "$1" = "updateexec" ] && { - echo Generating list of executable files... - rm -f debian/executable.files - find -type f -perm +111 ! -name '.*' -fprint debian/executable.files - } - - # Remove any files in upstream tarball that we don't have in the Debian - # package (because diff cannot remove files) - version=`dpkg-parsechangelog | awk '/Version:/ { print $2 }' | sed -e 's/-[^-]\+$//'` - source=`dpkg-parsechangelog | awk '/Source:/ { print $2 }' | tr -d ' '` - if test -r ../${source}_${version}.orig.tar.gz ; then - echo Generating list of files that should be removed... - rm -f debian/deletable.files - touch debian/deletable.files - [ -e debian/tmp ] && rm -rf debian/tmp - mkdir debian/tmp - ( cd debian/tmp ; tar -zxf ../../../${source}_${version}.orig.tar.gz ) - find debian/tmp/ -type f ! -name '.*' -print0 | xargs -0 -ri echo '{}' | \ - while read -r i ; do - if test -e "${i}" ; then - filename=$(echo "${i}" | sed -e 's#.*debian/tmp/[^/]\+/##') - test -e "${filename}" || echo "${filename}" >>debian/deletable.files - fi - done - rm -fr debian/tmp - else - echo Emptying list of files that should be deleted... - rm -f debian/deletable.files - touch debian/deletable.files - fi -} - -exit 0 Index: contrib/cortex-strings/benchmarks/dhry/dhry.h =================================================================== --- contrib/cortex-strings/benchmarks/dhry/dhry.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - ************************************************************************** - * DHRYSTONE 2.1 BENCHMARK PC VERSION - ************************************************************************** - * - * "DHRYSTONE" Benchmark Program - * ----------------------------- - * - * Version: C, Version 2.1 - * - * File: dhry.h (part 1 of 3) - * - * Date: May 25, 1988 - * - * Author: Reinhold P. Weicker - * Siemens AG, AUT E 51 - * Postfach 3220 - * 8520 Erlangen - * Germany (West) - * Phone: [+49]-9131-7-20330 - * (8-17 Central European Time) - * Usenet: ..!mcsun!unido!estevax!weicker - * - * Original Version (in Ada) published in - * "Communications of the ACM" vol. 27., no. 10 (Oct. 1984), - * pp. 1013 - 1030, together with the statistics - * on which the distribution of statements etc. is based. - * - * In this C version, the following C library functions are used: - * - strcpy, strcmp (inside the measurement loop) - * - printf, scanf (outside the measurement loop) - * In addition, Berkeley UNIX system calls "times ()" or "time ()" - * are used for execution time measurement. For measurements - * on other systems, these calls have to be changed. - * - * Collection of Results: - * Reinhold Weicker (address see above) and - * - * Rick Richardson - * PC Research. Inc. - * 94 Apple Orchard Drive - * Tinton Falls, NJ 07724 - * Phone: (201) 389-8963 (9-17 EST) - * Usenet: ...!uunet!pcrat!rick - * - * Please send results to Rick Richardson and/or Reinhold Weicker. - * Complete information should be given on hardware and software used. - * Hardware information includes: Machine type, CPU, type and size - * of caches; for microprocessors: clock frequency, memory speed - * (number of wait states). - * Software information includes: Compiler (and runtime library) - * manufacturer and version, compilation switches, OS version. - * The Operating System version may give an indication about the - * compiler; Dhrystone itself performs no OS calls in the measurement - * loop. - * - * The complete output generated by the program should be mailed - * such that at least some checks for correctness can be made. - * - ************************************************************************** - * - * This version has changes made by Roy Longbottom to conform to a common - * format for a series of standard benchmarks for PCs: - * - * Running time greater than 5 seconds due to inaccuracy of the PC clock. - * - * Automatic adjustment of run time, no manually inserted parameters. - * - * Initial display of calibration times to confirm linearity. - * - * Display of results within one screen (or at a slow speed as the test - * progresses) so that it can be seen to have run successfully. - * - * Facilities to type in details of system used etc. - * - * All results and details appended to a results file. - * - * - * Roy Longbottom - * 101323.2241@compuserve.com - * - ************************************************************************** - * - * For details of history, changes, other defines, benchmark construction - * statistics see official versions from ftp.nosc.mil/pub/aburto where - * the latest table of results (dhry.tbl) are available. See also - * netlib@ornl.gov - * - ************************************************************************** - * - * Defines: The following "Defines" are possible: - * -DREG=register (default: Not defined) - * As an approximation to what an average C programmer - * might do, the "register" storage class is applied - * (if enabled by -DREG=register) - * - for local variables, if they are used (dynamically) - * five or more times - * - for parameters if they are used (dynamically) - * six or more times - * Note that an optimal "register" strategy is - * compiler-dependent, and that "register" declarations - * do not necessarily lead to faster execution. - * -DNOSTRUCTASSIGN (default: Not defined) - * Define if the C compiler does not support - * assignment of structures. - * -DNOENUMS (default: Not defined) - * Define if the C compiler does not support - * enumeration types. - *************************************************************************** - * - * Compilation model and measurement (IMPORTANT): - * - * This C version of Dhrystone consists of three files: - * - dhry.h (this file, containing global definitions and comments) - * - dhry_1.c (containing the code corresponding to Ada package Pack_1) - * - dhry_2.c (containing the code corresponding to Ada package Pack_2) - * - * The following "ground rules" apply for measurements: - * - Separate compilation - * - No procedure merging - * - Otherwise, compiler optimizations are allowed but should be indicated - * - Default results are those without register declarations - * See the companion paper "Rationale for Dhrystone Version 2" for a more - * detailed discussion of these ground rules. - * - * For 16-Bit processors (e.g. 80186, 80286), times for all compilation - * models ("small", "medium", "large" etc.) should be given if possible, - * together with a definition of these models for the compiler system used. - * - ************************************************************************** - * Examples of Pentium Results - * - * Dhrystone Benchmark Version 2.1 (Language: C) - * - * Month run 4/1996 - * PC model Escom - * CPU Pentium - * Clock MHz 100 - * Cache 256K - * Options Neptune chipset - * OS/DOS Windows 95 - * Compiler Watcom C/ C++ 10.5 Win386 - * OptLevel -otexan -zp8 -fp5 -5r - * Run by Roy Longbottom - * From UK - * Mail 101323.2241@compuserve.com - * - * Final values (* implementation-dependent): - * - * Int_Glob: O.K. 5 - * Bool_Glob: O.K. 1 - * Ch_1_Glob: O.K. A - * Ch_2_Glob: O.K. B - * Arr_1_Glob[8]: O.K. 7 - * Arr_2_Glob8/7: O.K. 1600010 - * Ptr_Glob-> - * Ptr_Comp: * 98008 - * Discr: O.K. 0 - * Enum_Comp: O.K. 2 - * Int_Comp: O.K. 17 - * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING - * Next_Ptr_Glob-> - * Ptr_Comp: * 98008 same as above - * Discr: O.K. 0 - * Enum_Comp: O.K. 1 - * Int_Comp: O.K. 18 - * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING - * Int_1_Loc: O.K. 5 - * Int_2_Loc: O.K. 13 - * Int_3_Loc: O.K. 7 - * Enum_Loc: O.K. 1 - * Str_1_Loc: O.K. DHRYSTONE PROGRAM, 1'ST STRING - * Str_2_Loc: O.K. DHRYSTONE PROGRAM, 2'ND STRING - * - * Register option Selected. - * - * Microseconds 1 loop: 4.53 - * Dhrystones / second: 220690 - * VAX MIPS rating: 125.61 - * - * - * Dhrystone Benchmark Version 2.1 (Language: C) - * - * Month run 4/1996 - * PC model Escom - * CPU Pentium - * Clock MHz 100 - * Cache 256K - * Options Neptune chipset - * OS/DOS Windows 95 - * Compiler Watcom C/ C++ 10.5 Win386 - * OptLevel No optimisation - * Run by Roy Longbottom - * From UK - * Mail 101323.2241@compuserve.com - * - * Final values (* implementation-dependent): - * - * Int_Glob: O.K. 5 - * Bool_Glob: O.K. 1 - * Ch_1_Glob: O.K. A - * Ch_2_Glob: O.K. B - * Arr_1_Glob[8]: O.K. 7 - * Arr_2_Glob8/7: O.K. 320010 - * Ptr_Glob-> - * Ptr_Comp: * 98004 - * Discr: O.K. 0 - * Enum_Comp: O.K. 2 - * Int_Comp: O.K. 17 - * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING - * Next_Ptr_Glob-> - * Ptr_Comp: * 98004 same as above - * Discr: O.K. 0 - * Enum_Comp: O.K. 1 - * Int_Comp: O.K. 18 - * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING - * Int_1_Loc: O.K. 5 - * Int_2_Loc: O.K. 13 - * Int_3_Loc: O.K. 7 - * Enum_Loc: O.K. 1 - * Str_1_Loc: O.K. DHRYSTONE PROGRAM, 1'ST STRING - * Str_2_Loc: O.K. DHRYSTONE PROGRAM, 2'ND STRING - * - * Register option Not selected. - * - * Microseconds 1 loop: 20.06 - * Dhrystones / second: 49844 - * VAX MIPS rating: 28.37 - * - ************************************************************************** - */ - -/* Compiler and system dependent definitions: */ - -#ifndef TIME -#define TIMES -#endif - /* Use times(2) time function unless */ - /* explicitly defined otherwise */ - -#ifdef TIMES -/* #include - #include */ - /* for "times" */ -#endif - -#define Mic_secs_Per_Second 1000000.0 - /* Berkeley UNIX C returns process times in seconds/HZ */ - -#ifdef NOSTRUCTASSIGN -#define structassign(d, s) memcpy(&(d), &(s), sizeof(d)) -#else -#define structassign(d, s) d = s -#endif - -#ifdef NOENUM -#define Ident_1 0 -#define Ident_2 1 -#define Ident_3 2 -#define Ident_4 3 -#define Ident_5 4 - typedef int Enumeration; -#else - typedef enum {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5} - Enumeration; -#endif - /* for boolean and enumeration types in Ada, Pascal */ - -/* General definitions: */ - -#include -#include - - /* for strcpy, strcmp */ - -#define Null 0 - /* Value of a Null pointer */ -#define true 1 -#define false 0 - -typedef int One_Thirty; -typedef int One_Fifty; -typedef char Capital_Letter; -typedef int Boolean; -typedef char Str_30 [31]; -typedef int Arr_1_Dim [50]; -typedef int Arr_2_Dim [50] [50]; - -typedef struct record - { - struct record *Ptr_Comp; - Enumeration Discr; - union { - struct { - Enumeration Enum_Comp; - int Int_Comp; - char Str_Comp [31]; - } var_1; - struct { - Enumeration E_Comp_2; - char Str_2_Comp [31]; - } var_2; - struct { - char Ch_1_Comp; - char Ch_2_Comp; - } var_3; - } variant; - } Rec_Type, *Rec_Pointer; - - - Index: contrib/cortex-strings/benchmarks/dhry/dhry_1.c =================================================================== --- contrib/cortex-strings/benchmarks/dhry/dhry_1.c +++ /dev/null @@ -1,778 +0,0 @@ -/* - ************************************************************************* - * - * "DHRYSTONE" Benchmark Program - * ----------------------------- - * - * Version: C, Version 2.1 - * - * File: dhry_1.c (part 2 of 3) - * - * Date: May 25, 1988 - * - * Author: Reinhold P. Weicker - * - ************************************************************************* - */ - - #include - #include - #include - #include "dhry.h" - /*COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER*/ - - #ifdef COW - #define compiler "Watcom C/C++ 10.5 Win386" - #define options " -otexan -zp8 -5r -ms" - #endif - #ifdef CNW - #define compiler "Watcom C/C++ 10.5 Win386" - #define options " No optimisation" - #endif - #ifdef COD - #define compiler "Watcom C/C++ 10.5 Dos4GW" - #define options " -otexan -zp8 -5r -ms" - #endif - #ifdef CND - #define compiler "Watcom C/C++ 10.5 Dos4GW" - #define options " No optimisation" - #endif - #ifdef CONT - #define compiler "Watcom C/C++ 10.5 Win32NT" - #define options " -otexan -zp8 -5r -ms" - #endif - #ifdef CNNT - #define compiler "Watcom C/C++ 10.5 Win32NT" - #define options " No optimisation" - #endif - #ifdef COO2 - #define compiler "Watcom C/C++ 10.5 OS/2-32" - #define options " -otexan -zp8 -5r -ms" - #endif - #ifdef CNO2 - #define compiler "Watcom C/C++ 10.5 OS/2-32" - #define options " No optimisation" - #endif - - -/* Global Variables: */ - -Rec_Pointer Ptr_Glob, - Next_Ptr_Glob; -int Int_Glob; - Boolean Bool_Glob; - char Ch_1_Glob, - Ch_2_Glob; - int Arr_1_Glob [50]; - int Arr_2_Glob [50] [50]; - int getinput = 1; - - - char Reg_Define[100] = "Register option Selected."; - - Enumeration Func_1 (Capital_Letter Ch_1_Par_Val, - Capital_Letter Ch_2_Par_Val); - /* - forward declaration necessary since Enumeration may not simply be int - */ - - #ifndef ROPT - #define REG - /* REG becomes defined as empty */ - /* i.e. no register variables */ - #else - #define REG register - #endif - - void Proc_1 (REG Rec_Pointer Ptr_Val_Par); - void Proc_2 (One_Fifty *Int_Par_Ref); - void Proc_3 (Rec_Pointer *Ptr_Ref_Par); - void Proc_4 (); - void Proc_5 (); - void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par); - void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, - One_Fifty *Int_Par_Ref); - void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, - int Int_1_Par_Val, int Int_2_Par_Val); - - Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref); - - - /* variables for time measurement: */ - - #define Too_Small_Time 2 - /* Measurements should last at least 2 seconds */ - - double Begin_Time, - End_Time, - User_Time; - - double Microseconds, - Dhrystones_Per_Second, - Vax_Mips; - - /* end of variables for time measurement */ - - - void main (int argc, char *argv[]) - /*****/ - - /* main program, corresponds to procedures */ - /* Main and Proc_0 in the Ada version */ - { - double dtime(); - - One_Fifty Int_1_Loc; - REG One_Fifty Int_2_Loc; - One_Fifty Int_3_Loc; - REG char Ch_Index; - Enumeration Enum_Loc; - Str_30 Str_1_Loc; - Str_30 Str_2_Loc; - REG int Run_Index; - REG int Number_Of_Runs; - int endit, count = 10; - FILE *Ap; - char general[9][80] = {" "}; - - /* Initializations */ - if (argc > 1) - { - switch (argv[1][0]) - { - case 'N': - getinput = 0; - break; - case 'n': - getinput = 0; - break; - } - } - - if ((Ap = fopen("Dhry.txt","a+")) == NULL) - { - printf("Can not open Dhry.txt\n\n"); - printf("Press any key\n"); - exit(1); - } - -/*********************************************************************** - * Change for compiler and optimisation used * - ***********************************************************************/ - - Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type)); - Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type)); - - Ptr_Glob->Ptr_Comp = Next_Ptr_Glob; - Ptr_Glob->Discr = Ident_1; - Ptr_Glob->variant.var_1.Enum_Comp = Ident_3; - Ptr_Glob->variant.var_1.Int_Comp = 40; - strcpy (Ptr_Glob->variant.var_1.Str_Comp, - "DHRYSTONE PROGRAM, SOME STRING"); - strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); - - Arr_2_Glob [8][7] = 10; - /* Was missing in published program. Without this statement, */ - /* Arr_2_Glob [8][7] would have an undefined value. */ - /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */ - /* overflow may occur for this array element. */ - - printf ("\n"); - printf ("Dhrystone Benchmark, Version 2.1 (Language: C or C++)\n"); - printf ("\n"); - - if (getinput == 0) - { - printf ("No run time input data\n\n"); - } - else - { - printf ("With run time input data\n\n"); - } - - printf ("Compiler %s\n", compiler); - printf ("Optimisation %s\n", options); - #ifdef ROPT - printf ("Register option selected\n\n"); - #else - printf ("Register option not selected\n\n"); - strcpy(Reg_Define, "Register option Not selected."); - #endif - - /* - if (Reg) - { - printf ("Program compiled with 'register' attribute\n"); - printf ("\n"); - } - else - { - printf ("Program compiled without 'register' attribute\n"); - printf ("\n"); - } - - printf ("Please give the number of runs through the benchmark: "); - { - int n; - scanf ("%d", &n); - Number_Of_Runs = n; - } - printf ("\n"); - printf ("Execution starts, %d runs through Dhrystone\n", - Number_Of_Runs); - */ - - Number_Of_Runs = 5000; - - do - { - - Number_Of_Runs = Number_Of_Runs * 2; - count = count - 1; - Arr_2_Glob [8][7] = 10; - - /***************/ - /* Start timer */ - /***************/ - - Begin_Time = dtime(); - - for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index) - { - - Proc_5(); - Proc_4(); - /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */ - Int_1_Loc = 2; - Int_2_Loc = 3; - strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); - Enum_Loc = Ident_2; - Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc); - /* Bool_Glob == 1 */ - while (Int_1_Loc < Int_2_Loc) /* loop body executed once */ - { - Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc; - /* Int_3_Loc == 7 */ - Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc); - /* Int_3_Loc == 7 */ - Int_1_Loc += 1; - } /* while */ - /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ - Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc); - /* Int_Glob == 5 */ - Proc_1 (Ptr_Glob); - for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index) - /* loop body executed twice */ - { - if (Enum_Loc == Func_1 (Ch_Index, 'C')) - /* then, not executed */ - { - Proc_6 (Ident_1, &Enum_Loc); - strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING"); - Int_2_Loc = Run_Index; - Int_Glob = Run_Index; - } - } - /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ - Int_2_Loc = Int_2_Loc * Int_1_Loc; - Int_1_Loc = Int_2_Loc / Int_3_Loc; - Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc; - /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */ - Proc_2 (&Int_1_Loc); - /* Int_1_Loc == 5 */ - - } /* loop "for Run_Index" */ - - /**************/ - /* Stop timer */ - /**************/ - - End_Time = dtime(); - User_Time = End_Time - Begin_Time; - - printf ("%12.0f runs %6.2f seconds \n",(double) Number_Of_Runs, User_Time); - if (User_Time > 5) - { - count = 0; - } - else - { - if (User_Time < 0.1) - { - Number_Of_Runs = Number_Of_Runs * 5; - } - } - } /* calibrate/run do while */ - while (count >0); - - printf ("\n"); - printf ("Final values (* implementation-dependent):\n"); - printf ("\n"); - printf ("Int_Glob: "); - if (Int_Glob == 5) printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Int_Glob); - - printf ("Bool_Glob: "); - if (Bool_Glob == 1) printf ("O.K. "); - else printf ("WRONG "); - printf ("%d\n", Bool_Glob); - - printf ("Ch_1_Glob: "); - if (Ch_1_Glob == 'A') printf ("O.K. "); - else printf ("WRONG "); - printf ("%c ", Ch_1_Glob); - - printf ("Ch_2_Glob: "); - if (Ch_2_Glob == 'B') printf ("O.K. "); - else printf ("WRONG "); - printf ("%c\n", Ch_2_Glob); - - printf ("Arr_1_Glob[8]: "); - if (Arr_1_Glob[8] == 7) printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Arr_1_Glob[8]); - - printf ("Arr_2_Glob8/7: "); - if (Arr_2_Glob[8][7] == Number_Of_Runs + 10) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%10d\n", Arr_2_Glob[8][7]); - - printf ("Ptr_Glob-> "); - printf (" Ptr_Comp: * %d\n", (int) Ptr_Glob->Ptr_Comp); - - printf (" Discr: "); - if (Ptr_Glob->Discr == 0) printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Ptr_Glob->Discr); - - printf ("Enum_Comp: "); - if (Ptr_Glob->variant.var_1.Enum_Comp == 2) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d\n", Ptr_Glob->variant.var_1.Enum_Comp); - - printf (" Int_Comp: "); - if (Ptr_Glob->variant.var_1.Int_Comp == 17) printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Ptr_Glob->variant.var_1.Int_Comp); - - printf ("Str_Comp: "); - if (strcmp(Ptr_Glob->variant.var_1.Str_Comp, - "DHRYSTONE PROGRAM, SOME STRING") == 0) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%s\n", Ptr_Glob->variant.var_1.Str_Comp); - - printf ("Next_Ptr_Glob-> "); - printf (" Ptr_Comp: * %d", (int) Next_Ptr_Glob->Ptr_Comp); - printf (" same as above\n"); - - printf (" Discr: "); - if (Next_Ptr_Glob->Discr == 0) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Next_Ptr_Glob->Discr); - - printf ("Enum_Comp: "); - if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp); - - printf (" Int_Comp: "); - if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Next_Ptr_Glob->variant.var_1.Int_Comp); - - printf ("Str_Comp: "); - if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp, - "DHRYSTONE PROGRAM, SOME STRING") == 0) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp); - - printf ("Int_1_Loc: "); - if (Int_1_Loc == 5) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Int_1_Loc); - - printf ("Int_2_Loc: "); - if (Int_2_Loc == 13) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d\n", Int_2_Loc); - - printf ("Int_3_Loc: "); - if (Int_3_Loc == 7) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d ", Int_3_Loc); - - printf ("Enum_Loc: "); - if (Enum_Loc == 1) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%d\n", Enum_Loc); - - printf ("Str_1_Loc: "); - if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%s\n", Str_1_Loc); - - printf ("Str_2_Loc: "); - if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0) - printf ("O.K. "); - else printf ("WRONG "); - printf ("%s\n", Str_2_Loc); - - printf ("\n"); - - - if (User_Time < Too_Small_Time) - { - printf ("Measured time too small to obtain meaningful results\n"); - printf ("Please increase number of runs\n"); - printf ("\n"); - } - else - { - Microseconds = User_Time * Mic_secs_Per_Second - / (double) Number_Of_Runs; - Dhrystones_Per_Second = (double) Number_Of_Runs / User_Time; - Vax_Mips = Dhrystones_Per_Second / 1757.0; - - printf ("Microseconds for one run through Dhrystone: "); - printf ("%12.2lf \n", Microseconds); - printf ("Dhrystones per Second: "); - printf ("%10.0lf \n", Dhrystones_Per_Second); - printf ("VAX MIPS rating = "); - printf ("%12.2lf \n",Vax_Mips); - printf ("\n"); - -/************************************************************************ - * Type details of hardware, software etc. * - ************************************************************************/ - - if (getinput == 1) - { - printf ("Enter the following which will be added with results to file DHRY.TXT\n"); - printf ("When submitting a number of results you need only provide details once\n"); - printf ("but a cross reference such as an abbreviated CPU type would be useful.\n"); - printf ("You can kill (exit or close) the program now and no data will be added.\n\n"); - - printf ("PC Supplier/model ? "); - gets(general[1]); - - printf ("CPU chip ? "); - gets(general[2]); - - printf ("Clock MHz ? "); - gets(general[3]); - - printf ("Cache size ? "); - gets(general[4]); - - printf ("Chipset & H/W options ? "); - gets(general[5]); - - printf ("OS/DOS version ? "); - gets(general[6]); - - printf ("Your name ? "); - gets(general[7]); - - printf ("Company/Location ? "); - gets(general[8]); - - printf ("E-mail address ? "); - gets(general[0]); - } -/************************************************************************ - * Add results to output file Dhry.txt * - ************************************************************************/ - fprintf (Ap, "-------------------- -----------------------------------" - "\n"); - fprintf (Ap, "Dhrystone Benchmark Version 2.1 (Language: C++)\n\n"); - fprintf (Ap, "PC model %s\n", general[1]); - fprintf (Ap, "CPU %s\n", general[2]); - fprintf (Ap, "Clock MHz %s\n", general[3]); - fprintf (Ap, "Cache %s\n", general[4]); - fprintf (Ap, "Options %s\n", general[5]); - fprintf (Ap, "OS/DOS %s\n", general[6]); - fprintf (Ap, "Compiler %s\n", compiler); - fprintf (Ap, "OptLevel %s\n", options); - fprintf (Ap, "Run by %s\n", general[7]); - fprintf (Ap, "From %s\n", general[8]); - fprintf (Ap, "Mail %s\n\n", general[0]); - - fprintf (Ap, "Final values (* implementation-dependent):\n"); - fprintf (Ap, "\n"); - fprintf (Ap, "Int_Glob: "); - if (Int_Glob == 5) fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Int_Glob); - - fprintf (Ap, "Bool_Glob: "); - if (Bool_Glob == 1) fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Bool_Glob); - - fprintf (Ap, "Ch_1_Glob: "); - if (Ch_1_Glob == 'A') fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%c\n", Ch_1_Glob); - - fprintf (Ap, "Ch_2_Glob: "); - if (Ch_2_Glob == 'B') fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%c\n", Ch_2_Glob); - - fprintf (Ap, "Arr_1_Glob[8]: "); - if (Arr_1_Glob[8] == 7) fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Arr_1_Glob[8]); - - fprintf (Ap, "Arr_2_Glob8/7: "); - if (Arr_2_Glob[8][7] == Number_Of_Runs + 10) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%10d\n", Arr_2_Glob[8][7]); - - fprintf (Ap, "Ptr_Glob-> \n"); - fprintf (Ap, " Ptr_Comp: * %d\n", (int) Ptr_Glob->Ptr_Comp); - - fprintf (Ap, " Discr: "); - if (Ptr_Glob->Discr == 0) fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Ptr_Glob->Discr); - - fprintf (Ap, " Enum_Comp: "); - if (Ptr_Glob->variant.var_1.Enum_Comp == 2) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Enum_Comp); - - fprintf (Ap, " Int_Comp: "); - if (Ptr_Glob->variant.var_1.Int_Comp == 17) fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Int_Comp); - - fprintf (Ap, " Str_Comp: "); - if (strcmp(Ptr_Glob->variant.var_1.Str_Comp, - "DHRYSTONE PROGRAM, SOME STRING") == 0) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%s\n", Ptr_Glob->variant.var_1.Str_Comp); - - fprintf (Ap, "Next_Ptr_Glob-> \n"); - fprintf (Ap, " Ptr_Comp: * %d", (int) Next_Ptr_Glob->Ptr_Comp); - fprintf (Ap, " same as above\n"); - - fprintf (Ap, " Discr: "); - if (Next_Ptr_Glob->Discr == 0) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Next_Ptr_Glob->Discr); - - fprintf (Ap, " Enum_Comp: "); - if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp); - - fprintf (Ap, " Int_Comp: "); - if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Int_Comp); - - fprintf (Ap, " Str_Comp: "); - if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp, - "DHRYSTONE PROGRAM, SOME STRING") == 0) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp); - - fprintf (Ap, "Int_1_Loc: "); - if (Int_1_Loc == 5) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Int_1_Loc); - - fprintf (Ap, "Int_2_Loc: "); - if (Int_2_Loc == 13) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Int_2_Loc); - - fprintf (Ap, "Int_3_Loc: "); - if (Int_3_Loc == 7) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Int_3_Loc); - - fprintf (Ap, "Enum_Loc: "); - if (Enum_Loc == 1) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%d\n", Enum_Loc); - - fprintf (Ap, "Str_1_Loc: "); - if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%s\n", Str_1_Loc); - - fprintf (Ap, "Str_2_Loc: "); - if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0) - fprintf (Ap, "O.K. "); - else fprintf (Ap, "WRONG "); - fprintf (Ap, "%s\n", Str_2_Loc); - - - fprintf (Ap, "\n"); - fprintf(Ap,"%s\n",Reg_Define); - fprintf (Ap, "\n"); - fprintf(Ap,"Microseconds 1 loop: %12.2lf\n",Microseconds); - fprintf(Ap,"Dhrystones / second: %10.0lf\n",Dhrystones_Per_Second); - fprintf(Ap,"VAX MIPS rating: %12.2lf\n\n",Vax_Mips); - fclose(Ap); - } - - printf ("\n"); - printf ("A new results file will have been created in the same directory as the\n"); - printf (".EXE files if one did not already exist. If you made a mistake on input, \n"); - printf ("you can use a text editor to correct it, delete the results or copy \n"); - printf ("them to a different file name. If you intend to run multiple tests you\n"); - printf ("you may wish to rename DHRY.TXT with a more informative title.\n\n"); - printf ("Please submit feedback and results files as a posting in Section 12\n"); - printf ("or to Roy_Longbottom@compuserve.com\n\n"); - - if (getinput == 1) - { - printf("Press any key to exit\n"); - printf ("\nIf this is displayed you must close the window in the normal way\n"); - } - } - - - void Proc_1 (REG Rec_Pointer Ptr_Val_Par) - /******************/ - - /* executed once */ - { - REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp; - /* == Ptr_Glob_Next */ - /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */ - /* corresponds to "rename" in Ada, "with" in Pascal */ - - structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob); - Ptr_Val_Par->variant.var_1.Int_Comp = 5; - Next_Record->variant.var_1.Int_Comp - = Ptr_Val_Par->variant.var_1.Int_Comp; - Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp; - Proc_3 (&Next_Record->Ptr_Comp); - /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp - == Ptr_Glob->Ptr_Comp */ - if (Next_Record->Discr == Ident_1) - /* then, executed */ - { - Next_Record->variant.var_1.Int_Comp = 6; - Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp, - &Next_Record->variant.var_1.Enum_Comp); - Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp; - Proc_7 (Next_Record->variant.var_1.Int_Comp, 10, - &Next_Record->variant.var_1.Int_Comp); - } - else /* not executed */ - structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp); - } /* Proc_1 */ - - - void Proc_2 (One_Fifty *Int_Par_Ref) - /******************/ - /* executed once */ - /* *Int_Par_Ref == 1, becomes 4 */ - - { - One_Fifty Int_Loc; - Enumeration Enum_Loc; - - Int_Loc = *Int_Par_Ref + 10; - do /* executed once */ - if (Ch_1_Glob == 'A') - /* then, executed */ - { - Int_Loc -= 1; - *Int_Par_Ref = Int_Loc - Int_Glob; - Enum_Loc = Ident_1; - } /* if */ - while (Enum_Loc != Ident_1); /* true */ - } /* Proc_2 */ - - - void Proc_3 (Rec_Pointer *Ptr_Ref_Par) - /******************/ - /* executed once */ - /* Ptr_Ref_Par becomes Ptr_Glob */ - - { - if (Ptr_Glob != Null) - /* then, executed */ - *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp; - Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp); - } /* Proc_3 */ - - -void Proc_4 () /* without parameters */ - /*******/ - /* executed once */ - { - Boolean Bool_Loc; - - Bool_Loc = Ch_1_Glob == 'A'; - Bool_Glob = Bool_Loc | Bool_Glob; - Ch_2_Glob = 'B'; - } /* Proc_4 */ - - - void Proc_5 () /* without parameters */ - /*******/ - /* executed once */ - { - Ch_1_Glob = 'A'; - Bool_Glob = false; - } /* Proc_5 */ - - - /* Procedure for the assignment of structures, */ - /* if the C compiler doesn't support this feature */ - #ifdef NOSTRUCTASSIGN - memcpy (d, s, l) - register char *d; - register char *s; - register int l; - { - while (l--) *d++ = *s++; - } - #endif - - -double dtime() -{ - - /* #include */ - - #define HZ CLOCKS_PER_SEC - clock_t tnow; - - double q; - tnow = clock(); - q = (double)tnow / (double)HZ; - return q; -} Index: contrib/cortex-strings/benchmarks/dhry/dhry_2.c =================================================================== --- contrib/cortex-strings/benchmarks/dhry/dhry_2.c +++ /dev/null @@ -1,186 +0,0 @@ - /* - ************************************************************************* - * - * "DHRYSTONE" Benchmark Program - * ----------------------------- - * - * Version: C, Version 2.1 - * - * File: dhry_2.c (part 3 of 3) - * - * Date: May 25, 1988 - * - * Author: Reinhold P. Weicker - * - ************************************************************************* - */ - - #include "dhry.h" - - #ifndef REG - #define REG - /* REG becomes defined as empty */ - /* i.e. no register variables */ - #else - #define REG register - #endif - - extern int Int_Glob; - extern char Ch_1_Glob; - - Boolean Func_3 (Enumeration Enum_Par_Val); - - void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par) - /*********************************/ - /* executed once */ - /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */ - - { - *Enum_Ref_Par = Enum_Val_Par; - if (! Func_3 (Enum_Val_Par)) - /* then, not executed */ - *Enum_Ref_Par = Ident_4; - switch (Enum_Val_Par) - { - case Ident_1: - *Enum_Ref_Par = Ident_1; - break; - case Ident_2: - if (Int_Glob > 100) - /* then */ - *Enum_Ref_Par = Ident_1; - else *Enum_Ref_Par = Ident_4; - break; - case Ident_3: /* executed */ - *Enum_Ref_Par = Ident_2; - break; - case Ident_4: break; - case Ident_5: - *Enum_Ref_Par = Ident_3; - break; - } /* switch */ - } /* Proc_6 */ - - - void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, - One_Fifty *Int_Par_Ref) - /**********************************************/ - /* executed three times */ - /* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */ - /* Int_Par_Ref becomes 7 */ - /* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */ - /* Int_Par_Ref becomes 17 */ - /* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */ - /* Int_Par_Ref becomes 18 */ - - { - One_Fifty Int_Loc; - - Int_Loc = Int_1_Par_Val + 2; - *Int_Par_Ref = Int_2_Par_Val + Int_Loc; - } /* Proc_7 */ - - - void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, - int Int_1_Par_Val, int Int_2_Par_Val) - /*********************************************************************/ - /* executed once */ - /* Int_Par_Val_1 == 3 */ - /* Int_Par_Val_2 == 7 */ - - { - REG One_Fifty Int_Index; - REG One_Fifty Int_Loc; - - Int_Loc = Int_1_Par_Val + 5; - Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val; - Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc]; - Arr_1_Par_Ref [Int_Loc+30] = Int_Loc; - for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index) - Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc; - Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1; - Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc]; - Int_Glob = 5; - } /* Proc_8 */ - - - Enumeration Func_1 (Capital_Letter Ch_1_Par_Val, - Capital_Letter Ch_2_Par_Val) - /*************************************************/ - /* executed three times */ - /* first call: Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R' */ - /* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */ - /* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */ - - { - Capital_Letter Ch_1_Loc; - Capital_Letter Ch_2_Loc; - - Ch_1_Loc = Ch_1_Par_Val; - Ch_2_Loc = Ch_1_Loc; - if (Ch_2_Loc != Ch_2_Par_Val) - /* then, executed */ - return (Ident_1); - else /* not executed */ - { - Ch_1_Glob = Ch_1_Loc; - return (Ident_2); - } - } /* Func_1 */ - - - Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref) - /*************************************************/ - /* executed once */ - /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */ - /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */ - - { - REG One_Thirty Int_Loc; - Capital_Letter Ch_Loc; - - Int_Loc = 2; - while (Int_Loc <= 2) /* loop body executed once */ - if (Func_1 (Str_1_Par_Ref[Int_Loc], - Str_2_Par_Ref[Int_Loc+1]) == Ident_1) - /* then, executed */ - { - Ch_Loc = 'A'; - Int_Loc += 1; - } /* if, while */ - if (Ch_Loc >= 'W' && Ch_Loc < 'Z') - /* then, not executed */ - Int_Loc = 7; - if (Ch_Loc == 'R') - /* then, not executed */ - return (true); - else /* executed */ - { - if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0) - /* then, not executed */ - { - Int_Loc += 7; - Int_Glob = Int_Loc; - return (true); - } - else /* executed */ - return (false); - } /* if Ch_Loc */ - } /* Func_2 */ - - - Boolean Func_3 (Enumeration Enum_Par_Val) - /***************************/ - /* executed once */ - /* Enum_Par_Val == Ident_3 */ - - { - Enumeration Enum_Loc; - - Enum_Loc = Enum_Par_Val; - if (Enum_Loc == Ident_3) - /* then, executed */ - return (true); - else /* not executed */ - return (false); - } /* Func_3 */ Index: contrib/cortex-strings/benchmarks/multi/harness.c =================================================================== --- contrib/cortex-strings/benchmarks/multi/harness.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Copyright (c) 2011, Linaro Limited - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the Linaro nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/** A simple harness that times how long a string function takes to - * run. - */ - -/* PENDING: Add EPL */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_ELEMS(_x) (sizeof(_x) / sizeof((_x)[0])) - -#ifndef VERSION -#define VERSION "(unknown version)" -#endif - -/** Make sure a function is called by using the return value */ -#define SPOIL(_x) volatile long x = (long)(_x); (void)x - -/** Type of functions that can be tested */ -typedef void (*stub_t)(void *dest, void *src, size_t n); - -/** Meta data about one test */ -struct test -{ - /** Test name */ - const char *name; - /** Function to test */ - stub_t stub; -}; - -/** Flush the cache by reading a chunk of memory */ -static void empty(volatile char *against) -{ - /* We know that there's a 16 k cache with 64 byte lines giving - a total of 256 lines. Read randomly from 256*5 places should - flush everything */ - int offset = (1024 - 256)*1024; - - for (int i = offset; i < offset + 16*1024*3; i += 64) - { - against[i]; - } -} - -/** Stub that does nothing. Used for calibrating */ -static void xbounce(void *dest, void *src, size_t n) -{ - SPOIL(0); -} - -/** Stub that calls memcpy */ -static void xmemcpy(void *dest, void *src, size_t n) -{ - SPOIL(memcpy(dest, src, n)); -} - -/** Stub that calls memset */ -static void xmemset(void *dest, void *src, size_t n) -{ - SPOIL(memset(dest, 0, n)); -} - -/** Stub that calls memcmp */ -static void xmemcmp(void *dest, void *src, size_t n) -{ - SPOIL(memcmp(dest, src, n)); -} - -/** Stub that calls strcpy */ -static void xstrcpy(void *dest, void *src, size_t n) -{ - SPOIL(strcpy(dest, src)); -} - -/** Stub that calls strlen */ -static void xstrlen(void *dest, void *src, size_t n) -{ - SPOIL(strlen(dest)); -} - -/** Stub that calls strcmp */ -static void xstrcmp(void *dest, void *src, size_t n) -{ - SPOIL(strcmp(dest, src)); -} - -/** Stub that calls strchr */ -static void xstrchr(void *dest, void *src, size_t n) -{ - /* Put the character at the end of the string and before the null */ - ((char *)src)[n-1] = 32; - SPOIL(strchr(src, 32)); -} - -/** Stub that calls memchr */ -static void xmemchr(void *dest, void *src, size_t n) -{ - /* Put the character at the end of the block */ - ((char *)src)[n-1] = 32; - SPOIL(memchr(src, 32, n)); -} - -/** All functions that can be tested */ -static const struct test tests[] = - { - { "bounce", xbounce }, - { "memchr", xmemchr }, - { "memcpy", xmemcpy }, - { "memset", xmemset }, - { "memcmp", xmemcmp }, - { "strchr", xstrchr }, - { "strcmp", xstrcmp }, - { "strcpy", xstrcpy }, - { "strlen", xstrlen }, - { NULL } - }; - -/** Show basic usage */ -static void usage(const char* name) -{ - printf("%s %s: run a string related benchmark.\n" - "usage: %s [-c block-size] [-l loop-count] [-a alignment|src_alignment:dst_alignment] [-f] [-t test-name] [-r run-id]\n" - , name, VERSION, name); - - printf("Tests:"); - - for (const struct test *ptest = tests; ptest->name != NULL; ptest++) - { - printf(" %s", ptest->name); - } - - printf("\n"); - - exit(-1); -} - -/** Find the test by name */ -static const struct test *find_test(const char *name) -{ - if (name == NULL) - { - return tests + 0; - } - else - { - for (const struct test *p = tests; p->name != NULL; p++) - { - if (strcmp(p->name, name) == 0) - { - return p; - } - } - } - - return NULL; -} - -#define MIN_BUFFER_SIZE 1024*1024 -#define MAX_ALIGNMENT 256 - -/** Take a pointer and ensure that the lower bits == alignment */ -static char *realign(char *p, int alignment) -{ - uintptr_t pp = (uintptr_t)p; - pp = (pp + (MAX_ALIGNMENT - 1)) & ~(MAX_ALIGNMENT - 1); - pp += alignment; - - return (char *)pp; -} - -static int parse_int_arg(const char *arg, const char *exe_name) -{ - long int ret; - - errno = 0; - ret = strtol(arg, NULL, 0); - - if (errno) - { - usage(exe_name); - } - - return (int)ret; -} - -static void parse_alignment_arg(const char *arg, const char *exe_name, - int *src_alignment, int *dst_alignment) -{ - long int ret; - char *endptr; - - errno = 0; - ret = strtol(arg, &endptr, 0); - - if (errno) - { - usage(exe_name); - } - - *src_alignment = (int)ret; - - if (ret > 256 || ret < 1) - { - printf("Alignment should be in the range [1, 256].\n"); - usage(exe_name); - } - - if (ret == 256) - ret = 0; - - if (endptr && *endptr == ':') - { - errno = 0; - ret = strtol(endptr + 1, NULL, 0); - - if (errno) - { - usage(exe_name); - } - - if (ret > 256 || ret < 1) - { - printf("Alignment should be in the range [1, 256].\n"); - usage(exe_name); - } - - if (ret == 256) - ret = 0; - } - - *dst_alignment = (int)ret; -} - -/** Setup and run a test */ -int main(int argc, char **argv) -{ - /* Size of src and dest buffers */ - size_t buffer_size = MIN_BUFFER_SIZE; - - /* Number of bytes per call */ - int count = 31; - /* Number of times to run */ - int loops = 10000000; - /* True to flush the cache each time */ - int flush = 0; - /* Name of the test */ - const char *name = NULL; - /* Alignment of buffers */ - int src_alignment = 8; - int dst_alignment = 8; - /* Name of the run */ - const char *run_id = "0"; - - int opt; - - while ((opt = getopt(argc, argv, "c:l:ft:r:hva:")) > 0) - { - switch (opt) - { - case 'c': - count = parse_int_arg(optarg, argv[0]); - break; - case 'l': - loops = parse_int_arg(optarg, argv[0]); - break; - case 'a': - parse_alignment_arg(optarg, argv[0], &src_alignment, &dst_alignment); - break; - case 'f': - flush = 1; - break; - case 't': - name = strdup(optarg); - break; - case 'r': - run_id = strdup(optarg); - break; - case 'h': - usage(argv[0]); - break; - default: - usage(argv[0]); - break; - } - } - - /* Find the test by name */ - const struct test *ptest = find_test(name); - - if (ptest == NULL) - { - usage(argv[0]); - } - - if (count + MAX_ALIGNMENT * 2 > MIN_BUFFER_SIZE) - { - buffer_size = count + MAX_ALIGNMENT * 2; - } - - /* Buffers to read and write from */ - char *src = malloc(buffer_size); - char *dest = malloc(buffer_size); - - assert(src != NULL && dest != NULL); - - src = realign(src, src_alignment); - dest = realign(dest, dst_alignment); - - /* Fill the buffer with non-zero, reproducable random data */ - srandom(1539); - - for (int i = 0; i < buffer_size; i++) - { - src[i] = (char)random() | 1; - dest[i] = src[i]; - } - - /* Make sure the buffers are null terminated for any string tests */ - src[count] = 0; - dest[count] = 0; - - struct timespec start, end; - int err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); - assert(err == 0); - - /* Preload */ - stub_t stub = ptest->stub; - - /* Run two variants to reduce the cost of testing for the flush */ - if (flush == 0) - { - for (int i = 0; i < loops; i++) - { - (*stub)(dest, src, count); - } - } - else - { - for (int i = 0; i < loops; i++) - { - (*stub)(dest, src, count); - empty(dest); - } - } - - err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); - assert(err == 0); - - /* Drop any leading path and pull the variant name out of the executable */ - char *variant = strrchr(argv[0], '/'); - - if (variant == NULL) - { - variant = argv[0]; - } - - variant = strstr(variant, "try-"); - assert(variant != NULL); - - double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) * 1e-9; - /* Estimate the bounce time. Measured on a Panda. */ - double bounced = 0.448730 * loops / 50000000; - - /* Dump both machine and human readable versions */ - printf("%s:%s:%u:%u:%d:%d:%s:%.6f: took %.6f s for %u calls to %s of %u bytes. ~%.3f MB/s corrected.\n", - variant + 4, ptest->name, - count, loops, src_alignment, dst_alignment, run_id, - elapsed, - elapsed, loops, ptest->name, count, - (double)loops*count/(elapsed - bounced)/(1024*1024)); - - return 0; -} Index: contrib/cortex-strings/configure.ac =================================================================== --- contrib/cortex-strings/configure.ac +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2011-2012, Linaro Limited -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the Linaro nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -AC_INIT(cortex-strings, 1.1-2012.06~dev) -AM_INIT_AUTOMAKE(foreign subdir-objects color-tests dist-bzip2) -AC_CONFIG_HEADERS([config.h]) -AC_CONFIG_FILES(Makefile) -AC_CANONICAL_HOST -AM_PROG_AS -AC_PROG_CC -AC_PROG_LIBTOOL - -default_submachine= - -case $host in -aarch64*-*-*) - arch=aarch64 - ;; -arm*-*-*) - arch=aarch32 - default_submachine=cortex-a9 - ;; -x86_64-*-*-*) - arch=generic - ;; -*) - AC_MSG_ERROR([unknown architecture $host]) - ;; -esac - -AM_CONDITIONAL([HOST_AARCH32], [test x$arch = xaarch32]) -AM_CONDITIONAL([HOST_AARCH64], [test x$arch = xaarch64]) -AM_CONDITIONAL([HOST_GENERIC], [test x$arch = xgeneric]) - -AC_ARG_WITH([cpu], - AS_HELP_STRING([--with-cpu=CPU], - [select code for CPU variant @<:@default=cortex-a9@:>@]]), - [dnl - case "$withval" in - yes|'') AC_MSG_ERROR([--with-cpu requires an argument]) ;; - no) ;; - *) submachine="$withval" ;; - esac -], -[submachine=$default_submachine]) - -AC_SUBST(submachine) -AM_CONDITIONAL([WITH_SUBMACHINE], [test x$submachine != x]) - -AC_ARG_WITH([neon], - AC_HELP_STRING([--with-neon], - [include NEON specific routines @<:@default=yes@:>@]), - [with_neon=$withval], - [with_neon=yes]) -AC_SUBST(with_neon) -AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes) - -AC_ARG_WITH([vfp], - AC_HELP_STRING([--with-vfp], - [include VFP specific routines @<:@default=yes@:>@]), - [with_vfp=$withval], - [with_vfp=yes]) -AC_SUBST(with_vfp) -AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes) - -AC_OUTPUT Index: contrib/cortex-strings/scripts/add-license.sh =================================================================== --- contrib/cortex-strings/scripts/add-license.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# -# Add the modified BSD license to a file -# - -f=`mktemp -d` -trap "rm -rf $f" EXIT - -year=`date +%Y` -cat > $f/original < $f/c -sed -r 's/(.*)/ * \1/' $f/original | sed -r 's/ +$//' >> $f/c -echo " */" >> $f/c -echo >> $f/c - -# ...and shell style -sed -r 's/(.*)/# \1/' $f/original | sed -r 's/ +$//' >> $f/shell -echo '#' >> $f/shell -echo >> $f/shell - -for name in $@; do - if grep -q Copyright $name; then - echo $name already has some type of copyright - continue - fi - - case $name in - # These files don't have an explicit license - *autogen.sh*) - continue;; - *reference/newlib/*) - continue;; - *reference/newlib-xscale/*) - continue;; - */dhry/*) - continue;; - - *.c) - src=$f/c - ;; - *.sh|*.am|*.ac) - src=$f/shell - ;; - *) - echo Unrecognied extension on $name - continue - esac - - cat $src $name > $f/next - mv $f/next $name - echo Updated $name -done Index: contrib/cortex-strings/scripts/bench.py =================================================================== --- contrib/cortex-strings/scripts/bench.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python - -"""Simple harness that benchmarks different variants of the routines, -caches the results, and emits all of the records at the end. - -Results are generated for different values of: - * Source - * Routine - * Length - * Alignment -""" - -import argparse -import subprocess -import math -import sys - -# Prefix to the executables -build = '../build/try-' - -ALL = 'memchr memcmp memcpy memset strchr strcmp strcpy strlen' - -HAS = { - 'this': 'bounce memchr memcpy memset strchr strcmp strcpy strlen', - 'bionic-a9': 'memcmp memcpy memset strcmp strcpy strlen', - 'bionic-a15': 'memcmp memcpy memset strcmp strcpy strlen', - 'bionic-c': ALL, - 'csl': 'memcpy memset', - 'glibc': 'memcpy memset strchr strlen', - 'glibc-c': ALL, - 'newlib': 'memcpy strcmp strcpy strlen', - 'newlib-c': ALL, - 'newlib-xscale': 'memchr memcpy memset strchr strcmp strcpy strlen', - 'plain': 'memset memcpy strcmp strcpy', -} - -BOUNCE_ALIGNMENTS = ['1'] -SINGLE_BUFFER_ALIGNMENTS = ['1', '2', '4', '8', '16', '32'] -DUAL_BUFFER_ALIGNMENTS = ['1:32', '2:32', '4:32', '8:32', '16:32', '32:32'] - -ALIGNMENTS = { - 'bounce': BOUNCE_ALIGNMENTS, - 'memchr': SINGLE_BUFFER_ALIGNMENTS, - 'memset': SINGLE_BUFFER_ALIGNMENTS, - 'strchr': SINGLE_BUFFER_ALIGNMENTS, - 'strlen': SINGLE_BUFFER_ALIGNMENTS, - 'memcmp': DUAL_BUFFER_ALIGNMENTS, - 'memcpy': DUAL_BUFFER_ALIGNMENTS, - 'strcmp': DUAL_BUFFER_ALIGNMENTS, - 'strcpy': DUAL_BUFFER_ALIGNMENTS, -} - -VARIANTS = sorted(HAS.keys()) -FUNCTIONS = sorted(ALIGNMENTS.keys()) - -NUM_RUNS = 5 - -def run(cache, variant, function, bytes, loops, alignment, run_id, quiet=False): - """Perform a single run, exercising the cache as appropriate.""" - key = ':'.join('%s' % x for x in (variant, function, bytes, loops, alignment, run_id)) - - if key in cache: - got = cache[key] - else: - xbuild = build - cmd = '%(xbuild)s%(variant)s -t %(function)s -c %(bytes)s -l %(loops)s -a %(alignment)s -r %(run_id)s' % locals() - - try: - got = subprocess.check_output(cmd.split()).strip() - except OSError, ex: - assert False, 'Error %s while running %s' % (ex, cmd) - - parts = got.split(':') - took = float(parts[7]) - - cache[key] = got - - if not quiet: - print got - sys.stdout.flush() - - return took - -def run_many(cache, variants, bytes, all_functions): - # We want the data to come out in a useful order. So fix an - # alignment and function, and do all sizes for a variant first - bytes = sorted(bytes) - mid = bytes[int(len(bytes)/1.5)] - - if not all_functions: - # Use the ordering in 'this' as the default - all_functions = HAS['this'].split() - - # Find all other functions - for functions in HAS.values(): - for function in functions.split(): - if function not in all_functions: - all_functions.append(function) - - for function in all_functions: - for alignment in ALIGNMENTS[function]: - for variant in variants: - if function not in HAS[variant].split(): - continue - - # Run a tracer through and see how long it takes and - # adjust the number of loops based on that. Not great - # for memchr() and similar which are O(n), but it will - # do - f = 50000000 - want = 5.0 - - loops = int(f / math.sqrt(max(1, mid))) - took = run(cache, variant, function, mid, loops, alignment, 0, - quiet=True) - # Keep it reasonable for silly routines like bounce - factor = min(20, max(0.05, want/took)) - f = f * factor - - # Round f to a few significant figures - scale = 10**int(math.log10(f) - 1) - f = scale*int(f/scale) - - for b in sorted(bytes): - # Figure out the number of loops to give a roughly consistent run - loops = int(f / math.sqrt(max(1, b))) - for run_id in range(0, NUM_RUNS): - run(cache, variant, function, b, loops, alignment, - run_id) - -def run_top(cache): - parser = argparse.ArgumentParser() - parser.add_argument("-v", "--variants", nargs="+", help="library variant to run (run all if not specified)", default = VARIANTS, choices = VARIANTS) - parser.add_argument("-f", "--functions", nargs="+", help="function to run (run all if not specified)", default = FUNCTIONS, choices = FUNCTIONS) - parser.add_argument("-l", "--limit", type=int, help="upper limit to test to (in bytes)", default = 512*1024) - args = parser.parse_args() - - # Test all powers of 2 - step1 = 2.0 - # Test intermediate powers of 1.4 - step2 = 1.4 - - bytes = [] - - for step in [step1, step2]: - if step: - # Figure out how many steps get us up to the top - steps = int(round(math.log(args.limit) / math.log(step))) - bytes.extend([int(step**x) for x in range(0, steps+1)]) - - run_many(cache, args.variants, bytes, args.functions) - -def main(): - cachename = 'cache.txt' - - cache = {} - - try: - with open(cachename) as f: - for line in f: - line = line.strip() - parts = line.split(':') - cache[':'.join(parts[:7])] = line - except: - pass - - try: - run_top(cache) - finally: - with open(cachename, 'w') as f: - for line in sorted(cache.values()): - print >> f, line - -if __name__ == '__main__': - main() Index: contrib/cortex-strings/scripts/fixup.py =================================================================== --- contrib/cortex-strings/scripts/fixup.py +++ /dev/null @@ -1,27 +0,0 @@ -"""Simple script that enables target specific blocks based on the first argument. - -Matches comment blocks like this: - -/* For Foo: abc -def -*/ - -and de-comments them giving: -abc -def -""" -import re -import sys - -def main(): - key = sys.argv[1] - expr = re.compile(r'/\* For %s:\s([^*]+)\*/' % key, re.M) - - for arg in sys.argv[2:]: - with open(arg) as f: - body = f.read() - with open(arg, 'w') as f: - f.write(expr.sub(r'\1', body)) - -if __name__ == '__main__': - main() Index: contrib/cortex-strings/scripts/libplot.py =================================================================== --- contrib/cortex-strings/scripts/libplot.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Shared routines for the plotters.""" - -import fileinput -import collections - -Record = collections.namedtuple('Record', 'variant function bytes loops src_alignment dst_alignment run_id elapsed rest') - - -def make_colours(): - return iter('m b g r c y k pink orange brown grey'.split()) - -def parse_value(v): - """Turn text into a primitive""" - try: - if '.' in v: - return float(v) - else: - return int(v) - except ValueError: - return v - -def create_column_tuple(record, names): - cols = [getattr(record, name) for name in names] - return tuple(cols) - -def unique(records, name, prefer=''): - """Return the unique values of a column in the records""" - if type(name) == tuple: - values = list(set(create_column_tuple(x, name) for x in records)) - else: - values = list(set(getattr(x, name) for x in records)) - - if not values: - return values - elif type(values[0]) == str: - return sorted(values, key=lambda x: '%-06d|%s' % (-prefer.find(x), x)) - else: - return sorted(values) - -def alignments_equal(alignments): - for alignment in alignments: - if alignment[0] != alignment[1]: - return False - return True - -def parse_row(line): - return Record(*[parse_value(y) for y in line.split(':')]) - -def parse(): - """Parse a record file into named tuples, correcting for loop - overhead along the way. - """ - records = [parse_row(x) for x in fileinput.input()] - - # Pull out any bounce values - costs = {} - - for record in [x for x in records if x.function=='bounce']: - costs[(record.bytes, record.loops)] = record.elapsed - - # Fix up all of the records for cost - out = [] - - for record in records: - if record.function == 'bounce': - continue - - cost = costs.get((record.bytes, record.loops), None) - - if not cost: - out.append(record) - else: - # Unfortunately you can't update a namedtuple... - values = list(record) - values[-2] -= cost - out.append(Record(*values)) - - return out Index: contrib/cortex-strings/scripts/plot-align.py =================================================================== --- contrib/cortex-strings/scripts/plot-align.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -"""Plot the performance of different variants of one routine versus alignment. -""" - -import libplot - -import pylab - - -def plot(records, bytes, function): - records = [x for x in records if x.bytes==bytes and x.function==function] - - variants = libplot.unique(records, 'variant', prefer='this') - alignments = libplot.unique(records, ('src_alignment', 'dst_alignment')) - - X = pylab.arange(len(alignments)) - width = 1.0/(len(variants)+1) - - colours = libplot.make_colours() - - pylab.figure(1).set_size_inches((16, 12)) - pylab.clf() - - for i, variant in enumerate(variants): - heights = [] - - for alignment in alignments: - matches = [x for x in records if x.variant==variant and x.src_alignment==alignment[0] and x.dst_alignment==alignment[1]] - - if matches: - vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for - match in matches] - mean = sum(vals)/len(vals) - heights.append(mean) - else: - heights.append(0) - - pylab.bar(X+i*width, heights, width, color=colours.next(), label=variant) - - - axes = pylab.axes() - if libplot.alignments_equal(alignments): - alignment_labels = ["%s" % x[0] for x in alignments] - else: - alignment_labels = ["%s:%s" % (x[0], x[1]) for x in alignments] - axes.set_xticklabels(alignment_labels) - axes.set_xticks(X + 0.5) - - pylab.title('Performance of different variants of %(function)s for %(bytes)d byte blocks' % locals()) - pylab.xlabel('Alignment') - pylab.ylabel('Rate (MB/s)') - pylab.legend(loc='lower right', ncol=3) - pylab.grid() - pylab.savefig('alignment-%(function)s-%(bytes)d.png' % locals(), dpi=72) - -def main(): - records = libplot.parse() - - for function in libplot.unique(records, 'function'): - for bytes in libplot.unique(records, 'bytes'): - plot(records, bytes, function) - - pylab.show() - -if __name__ == '__main__': - main() Index: contrib/cortex-strings/scripts/plot-sizes.py =================================================================== --- contrib/cortex-strings/scripts/plot-sizes.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python - -"""Plot the performance for different block sizes of one function across -variants. -""" - -import libplot - -import pylab -import pdb -import math - -def pretty_kb(v): - if v < 1024: - return '%d' % v - else: - if v % 1024 == 0: - return '%d k' % (v//1024) - else: - return '%.1f k' % (v/1024) - -def plot(records, function, alignment=None, scale=1): - variants = libplot.unique(records, 'variant', prefer='this') - records = [x for x in records if x.function==function] - - if alignment != None: - records = [x for x in records if x.src_alignment==alignment[0] and - x.dst_alignment==alignment[1]] - - alignments = libplot.unique(records, ('src_alignment', 'dst_alignment')) - if len(alignments) != 1: - return False - if libplot.alignments_equal(alignments): - aalignment = alignments[0][0] - else: - aalignment = "%s:%s" % (alignments[0][0], alignments[0][1]) - - bytes = libplot.unique(records, 'bytes')[0] - - colours = libplot.make_colours() - all_x = [] - - pylab.figure(1).set_size_inches((6.4*scale, 4.8*scale)) - pylab.clf() - - if 'str' in function: - # The harness fills out to 16k. Anything past that is an - # early match - top = 16384 - else: - top = 2**31 - - for variant in variants: - matches = [x for x in records if x.variant==variant and x.bytes <= top] - matches.sort(key=lambda x: x.bytes) - - X = sorted(list(set([x.bytes for x in matches]))) - Y = [] - Yerr = [] - for xbytes in X: - vals = [x.bytes*x.loops/x.elapsed/(1024*1024) for x in matches if x.bytes == xbytes] - if len(vals) > 1: - mean = sum(vals)/len(vals) - Y.append(mean) - if len(Yerr) == 0: - Yerr = [[], []] - err1 = max(vals) - mean - assert err1 >= 0 - err2 = min(vals) - mean - assert err2 <= 0 - Yerr[0].append(abs(err2)) - Yerr[1].append(err1) - else: - Y.append(vals[0]) - - all_x.extend(X) - colour = colours.next() - - if X: - pylab.plot(X, Y, c=colour) - if len(Yerr) > 0: - pylab.errorbar(X, Y, yerr=Yerr, c=colour, label=variant, fmt='o') - else: - pylab.scatter(X, Y, c=colour, label=variant, edgecolors='none') - - pylab.legend(loc='upper left', ncol=3, prop={'size': 'small'}) - pylab.grid() - pylab.title('%(function)s of %(aalignment)s byte aligned blocks' % locals()) - pylab.xlabel('Size (B)') - pylab.ylabel('Rate (MB/s)') - - # Figure out how high the range goes - top = max(all_x) - - power = int(round(math.log(max(all_x)) / math.log(2))) - - pylab.semilogx() - - pylab.axes().set_xticks([2**x for x in range(0, power+1)]) - pylab.axes().set_xticklabels([pretty_kb(2**x) for x in range(0, power+1)]) - pylab.xlim(0, top) - pylab.ylim(0, pylab.ylim()[1]) - return True - -def main(): - records = libplot.parse() - - functions = libplot.unique(records, 'function') - alignments = libplot.unique(records, ('src_alignment', 'dst_alignment')) - - for function in functions: - for alignment in alignments: - for scale in [1, 2.5]: - if plot(records, function, alignment, scale): - pylab.savefig('sizes-%s-%02d-%02d-%.1f.png' % (function, alignment[0], alignment[1], scale), dpi=72) - - pylab.show() - -if __name__ == '__main__': - main() Index: contrib/cortex-strings/scripts/plot-top.py =================================================================== --- contrib/cortex-strings/scripts/plot-top.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python - -"""Plot the performance of different variants of the string routines -for one size. -""" - -import libplot - -import pylab - - -def plot(records, bytes): - records = [x for x in records if x.bytes==bytes] - - variants = libplot.unique(records, 'variant', prefer='this') - functions = libplot.unique(records, 'function') - - X = pylab.arange(len(functions)) - width = 1.0/(len(variants)+1) - - colours = libplot.make_colours() - - pylab.figure(1).set_size_inches((16, 12)) - pylab.clf() - - for i, variant in enumerate(variants): - heights = [] - - for function in functions: - matches = [x for x in records if x.variant==variant and x.function==function and x.src_alignment==8] - - if matches: - vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for - match in matches] - mean = sum(vals)/len(vals) - heights.append(mean) - else: - heights.append(0) - - pylab.bar(X+i*width, heights, width, color=colours.next(), label=variant) - - axes = pylab.axes() - axes.set_xticklabels(functions) - axes.set_xticks(X + 0.5) - - pylab.title('Performance of different variants for %d byte blocks' % bytes) - pylab.ylabel('Rate (MB/s)') - pylab.legend(loc='upper left', ncol=3) - pylab.grid() - pylab.savefig('top-%06d.png' % bytes, dpi=72) - -def main(): - records = libplot.parse() - - for bytes in libplot.unique(records, 'bytes'): - plot(records, bytes) - - pylab.show() - -if __name__ == '__main__': - main() Index: contrib/cortex-strings/scripts/plot.py =================================================================== --- contrib/cortex-strings/scripts/plot.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Plot the results for each test. Spits out a set of images into the -current directory. -""" - -import libplot - -import fileinput -import collections -import pprint - -import pylab - -Record = collections.namedtuple('Record', 'variant test size loops src_alignment dst_alignment run_id rawtime comment time bytes rate') - -def unique(rows, name): - """Takes a list of values, pulls out the named field, and returns - a list of the unique values of this field. - """ - return sorted(set(getattr(x, name) for x in rows)) - -def to_float(v): - """Convert a string into a better type. - - >>> to_float('foo') - 'foo' - >>> to_float('1.23') - 1.23 - >>> to_float('45') - 45 - """ - try: - if '.' in v: - return float(v) - else: - return int(v) - except: - return v - -def parse(): - # Split the input up - rows = [x.strip().split(':') for x in fileinput.input()] - # Automatically turn numbers into the base type - rows = [[to_float(y) for y in x] for x in rows] - - # Scan once to calculate the overhead - r = [Record(*(x + [0, 0, 0])) for x in rows] - bounces = pylab.array([(x.loops, x.rawtime) for x in r if x.test == 'bounce']) - fit = pylab.polyfit(bounces[:,0], bounces[:,1], 1) - - records = [] - - for row in rows: - # Make a dummy record so we can use the names - r1 = Record(*(row + [0, 0, 0])) - - bytes = r1.size * r1.loops - # Calculate the bounce time - delta = pylab.polyval(fit, [r1.loops]) - time = r1.rawtime - delta - rate = bytes / time - - records.append(Record(*(row + [time, bytes, rate]))) - - return records - -def plot(records, field, scale, ylabel): - variants = unique(records, 'variant') - tests = unique(records, 'test') - - colours = libplot.make_colours() - - # A little hack. We want the 'all' record to be drawn last so - # that it's obvious on the graph. Assume that no tests come - # before it alphabetically - variants.reverse() - - for test in tests: - for variant in variants: - v = [x for x in records if x.test==test and x.variant==variant] - v.sort(key=lambda x: x.size) - V = pylab.array([(x.size, getattr(x, field)) for x in v]) - - # Ensure our results appear - order = 1 if variant == 'this' else 0 - - try: - # A little hack. We want the 'all' to be obvious on - # the graph - if variant == 'all': - pylab.scatter(V[:,0], V[:,1]/scale, label=variant) - pylab.plot(V[:,0], V[:,1]/scale) - else: - pylab.plot(V[:,0], V[:,1]/scale, label=variant, - zorder=order, c = colours.next()) - - except Exception, ex: - # michaelh1 likes to run this script while the test is - # still running which can lead to bad data - print ex, 'on %s of %s' % (variant, test) - - pylab.legend(loc='lower right', ncol=2, prop={'size': 'small'}) - pylab.xlabel('Block size (B)') - pylab.ylabel(ylabel) - pylab.title('%s %s' % (test, field)) - pylab.grid() - - pylab.savefig('%s-%s.png' % (test, field), dpi=100) - pylab.semilogx(basex=2) - pylab.savefig('%s-%s-semilog.png' % (test, field), dpi=100) - pylab.clf() - -def test(): - import doctest - doctest.testmod() - -def main(): - records = parse() - - plot(records, 'rate', 1024**2, 'Rate (MB/s)') - plot(records, 'time', 1, 'Total time (s)') - -if __name__ == '__main__': - main() Index: contrib/cortex-strings/scripts/trim.sh =================================================================== --- contrib/cortex-strings/scripts/trim.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# -# Trims the whitespace from around any given images -# - -for i in $@; do - convert $i -bordercolor white -border 1x1 -trim +repage -alpha off +dither -colors 32 PNG8:next-$i - mv next-$i $i -done Index: contrib/cortex-strings/src/aarch64/memchr.S =================================================================== --- contrib/cortex-strings/src/aarch64/memchr.S +++ /dev/null @@ -1,172 +0,0 @@ -/* - * memchr - find a character in a memory zone - * - * Copyright (c) 2014, ARM Limited - * All rights Reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the company nor the names of its contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Neon Available. - */ - -/* Arguments and results. */ -#define srcin x0 -#define chrin w1 -#define cntin x2 - -#define result x0 - -#define src x3 -#define tmp x4 -#define wtmp2 w5 -#define synd x6 -#define soff x9 -#define cntrem x10 - -#define vrepchr v0 -#define vdata1 v1 -#define vdata2 v2 -#define vhas_chr1 v3 -#define vhas_chr2 v4 -#define vrepmask v5 -#define vend v6 - -/* - * Core algorithm: - * - * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits - * per byte. For each tuple, bit 0 is set if the relevant byte matched the - * requested character and bit 1 is not used (faster than using a 32bit - * syndrome). Since the bits in the syndrome reflect exactly the order in which - * things occur in the original string, counting trailing zeros allows to - * identify exactly which byte has matched. - */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memchr - /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length - /* - * Magic constant 0x40100401 allows us to identify which lane matches - * the requested byte. - */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 - dup vrepchr.16b, chrin - /* Work with aligned 32-byte chunks */ - bic src, srcin, #31 - dup vrepmask.4s, wtmp2 - ands soff, srcin, #31 - and cntrem, cntin, #31 - b.eq .Lloop - - /* - * Input string is not 32-byte aligned. We calculate the syndrome - * value for the aligned 32 bytes block containing the first bytes - * and mask the irrelevant part. - */ - - ld1 {vdata1.16b, vdata2.16b}, [src], #32 - sub tmp, soff, #32 - adds cntin, cntin, tmp - cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b - addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ - addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov synd, vend.d[0] - /* Clear the soff*2 lower bits */ - lsl tmp, soff, #1 - lsr synd, synd, tmp - lsl synd, synd, tmp - /* The first block can also be the last */ - b.ls .Lmasklast - /* Have we found something already? */ - cbnz synd, .Ltail - -.Lloop: - ld1 {vdata1.16b, vdata2.16b}, [src], #32 - subs cntin, cntin, #32 - cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* If we're out of data we finish regardless of the result */ - b.ls .Lend - /* Use a fast check for the termination condition */ - orr vend.16b, vhas_chr1.16b, vhas_chr2.16b - addp vend.2d, vend.2d, vend.2d - mov synd, vend.d[0] - /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop - -.Lend: - /* Termination condition found, let's calculate the syndrome value */ - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b - addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ - addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov synd, vend.d[0] - /* Only do the clear for the last possible block */ - b.hi .Ltail - -.Lmasklast: - /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ - add tmp, cntrem, soff - and tmp, tmp, #31 - sub tmp, tmp, #32 - neg tmp, tmp, lsl #1 - lsl synd, synd, tmp - lsr synd, synd, tmp - -.Ltail: - /* Count the trailing zeros using bit reversing */ - rbit synd, synd - /* Compensate the last post-increment */ - sub src, src, #32 - /* Check that we have found a character */ - cmp synd, #0 - /* And count the leading zeros */ - clz synd, synd - /* Compute the potential result */ - add result, src, synd, lsr #1 - /* Select result or NULL */ - csel result, xzr, result, eq - ret - -.Lzero_length: - mov result, #0 - ret - - .size memchr, . - memchr Index: contrib/cortex-strings/src/aarch64/memcmp.S =================================================================== --- contrib/cortex-strings/src/aarch64/memcmp.S +++ /dev/null @@ -1,162 +0,0 @@ -/* memcmp - compare memory - - Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 - -def_fn memcmp p2align=6 - cbz limit, .Lret0 - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, .Lnot_limit - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lnot_limit - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -.Lnot_limit: - -#ifndef __AARCH64EB__ - rev diff, diff - rev data1, data1 - rev data2, data2 -#endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret - -.Lmutual_align: - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b .Lstart_realigned - -.Lret0: - mov result, #0 - ret - - .p2align 6 -.Lmisaligned8: - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - ret - .size memcmp, . - memcmp Index: contrib/cortex-strings/src/aarch64/memcpy.S =================================================================== --- contrib/cortex-strings/src/aarch64/memcpy.S +++ /dev/null @@ -1,225 +0,0 @@ -/* Copyright (c) 2012, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses. - * - */ - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l src -#define E_h count -#define F_l dst -#define F_h srcend -#define tmp1 x9 - -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - Small and medium copies read all data before writing, allowing any - kind of overlap, and memmove tailcalls memcpy for these cases as - well as non-overlapping copies. -*/ - -def_fn memcpy p2align=6 - prfm PLDL1KEEP, [src] - add srcend, src, count - add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) - cmp count, 96 - b.hi L(copy_long) - - /* Medium copies: 17..96 bytes. */ - sub tmp1, count, 1 - ldp A_l, A_h, [src] - tbnz tmp1, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 - /* Small copies: 0..16 bytes. */ -L(copy16): - cmp count, 8 - b.lo 1f - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] - str A_h, [dstend, -8] - ret - .p2align 4 -1: - tbz count, 2, 1f - ldr A_lw, [src] - ldr A_hw, [srcend, -4] - str A_lw, [dstin] - str A_hw, [dstend, -4] - ret - - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret - - .p2align 4 - /* Copy 64..96 bytes. Copy 64 bytes from the start and - 32 bytes from the end. */ -L(copy96): - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [src, 32] - ldp D_l, D_h, [src, 48] - ldp E_l, E_h, [srcend, -32] - ldp F_l, F_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin, 32] - stp D_l, D_h, [dstin, 48] - stp E_l, E_h, [dstend, -32] - stp F_l, F_h, [dstend, -16] - ret - - /* Align DST to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - .p2align 4 -L(copy_long): - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldp D_l, D_h, [src] - sub src, src, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_l, A_h, [src, 16] - stp D_l, D_h, [dstin] - ldp B_l, B_h, [src, 32] - ldp C_l, C_h, [src, 48] - ldp D_l, D_h, [src, 64]! - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls 2f -1: - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [src, 32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [src, 48] - stp D_l, D_h, [dst, 64]! - ldp D_l, D_h, [src, 64]! - subs count, count, 64 - b.hi 1b - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ -2: - ldp E_l, E_h, [srcend, -64] - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [srcend, -48] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [srcend, -16] - stp D_l, D_h, [dst, 64] - stp E_l, E_h, [dstend, -64] - stp A_l, A_h, [dstend, -48] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] - ret - - .size memcpy, . - memcpy Index: contrib/cortex-strings/src/aarch64/memmove.S =================================================================== --- contrib/cortex-strings/src/aarch64/memmove.S +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses - */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define srcend x3 -#define dstend x4 -#define tmp1 x5 -#define A_l x6 -#define A_h x7 -#define B_l x8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l count -#define E_h tmp1 - -/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps. - Larger backwards copies are also handled by memcpy. The only remaining - case is forward large copies. The destination is aligned, and an - unrolled loop processes 64 bytes per iteration. -*/ - -def_fn memmove, 6 - sub tmp1, dstin, src - cmp count, 96 - ccmp tmp1, count, 2, hi - b.hs memcpy - - cbz tmp1, 3f - add dstend, dstin, count - add srcend, src, count - - /* Align dstend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - and tmp1, dstend, 15 - ldp D_l, D_h, [srcend, -16] - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldp A_l, A_h, [srcend, -16] - stp D_l, D_h, [dstend, -16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -48] - ldp D_l, D_h, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls 2f - nop -1: - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [srcend, -16] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [srcend, -48] - stp D_l, D_h, [dstend, -64]! - ldp D_l, D_h, [srcend, -64]! - subs count, count, 64 - b.hi 1b - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ -2: - ldp E_l, E_h, [src, 48] - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [src, 32] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [src, 16] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [src] - stp D_l, D_h, [dstend, -64] - stp E_l, E_h, [dstin, 48] - stp A_l, A_h, [dstin, 32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin] -3: ret - - .size memmove, . - memmove Index: contrib/cortex-strings/src/aarch64/memset.S =================================================================== --- contrib/cortex-strings/src/aarch64/memset.S +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2012, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses - * - */ - - -#define dstin x0 -#define val x1 -#define valw w1 -#define count x2 -#define dst x3 -#define dstend x4 -#define tmp1 x5 -#define tmp1w w5 -#define tmp2 x6 -#define tmp2w w6 -#define zva_len x7 -#define zva_lenw w7 - -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memset p2align=6 - - dup v0.16B, valw - add dstend, dstin, count - - cmp count, 96 - b.hi L(set_long) - cmp count, 16 - b.hs L(set_medium) - mov val, v0.D[0] - - /* Set 0..15 bytes. */ - tbz count, 3, 1f - str val, [dstin] - str val, [dstend, -8] - ret - nop -1: tbz count, 2, 2f - str valw, [dstin] - str valw, [dstend, -4] - ret -2: cbz count, 3f - strb valw, [dstin] - tbz count, 1, 3f - strh valw, [dstend, -2] -3: ret - - /* Set 17..96 bytes. */ -L(set_medium): - str q0, [dstin] - tbnz count, 6, L(set96) - str q0, [dstend, -16] - tbz count, 5, 1f - str q0, [dstin, 16] - str q0, [dstend, -32] -1: ret - - .p2align 4 - /* Set 64..96 bytes. Write 64 bytes from the start and - 32 bytes from the end. */ -L(set96): - str q0, [dstin, 16] - stp q0, q0, [dstin, 32] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 - nop -L(set_long): - and valw, valw, 255 - bic dst, dstin, 15 - str q0, [dstin] - cmp count, 256 - ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - add dst, dst, 16 - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] -L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(try_zva): - mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) - and tmp1w, tmp1w, 15 - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) - - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. - */ -L(zva_64): - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst - add dst, dst, 64 - subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(zva_128): - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) - - str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - -L(zva_other): - mov tmp2w, 4 - lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) - - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - b L(tail64) - - .size memset, . - memset Index: contrib/cortex-strings/src/aarch64/strchr.S =================================================================== --- contrib/cortex-strings/src/aarch64/strchr.S +++ /dev/null @@ -1,165 +0,0 @@ -/* - strchr - find a character in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Neon Available. - */ - -/* Arguments and results. */ -#define srcin x0 -#define chrin w1 - -#define result x0 - -#define src x2 -#define tmp1 x3 -#define wtmp2 w4 -#define tmp3 x5 - -#define vrepchr v0 -#define vdata1 v1 -#define vdata2 v2 -#define vhas_nul1 v3 -#define vhas_nul2 v4 -#define vhas_chr1 v5 -#define vhas_chr2 v6 -#define vrepmask_0 v7 -#define vrepmask_c v16 -#define vend1 v17 -#define vend2 v18 - -/* Core algorithm. - - For each 32-byte hunk we calculate a 64-bit syndrome value, with - two bits per byte (LSB is always in bits 0 and 1, for both big - and little-endian systems). For each tuple, bit 0 is set iff - the relevant byte matched the requested character; bit 1 is set - iff the relevant byte matched the NUL end of string (we trigger - off bit0 for the special case of looking for NUL). Since the bits - in the syndrome reflect exactly the order in which things occur - in the original string a count_trailing_zeros() operation will - identify exactly which byte is causing the termination, and why. */ - -/* Locals and temporaries. */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - - .macro def_alias f a - .weak \a - .set \a,\f - .endm - -def_fn strchr -def_alias strchr index - /* Magic constant 0x40100401 to allow us to identify which lane - matches the requested byte. Magic constant 0x80200802 used - similarly for NUL termination. */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 - dup vrepchr.16b, chrin - bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ - dup vrepmask_c.4s, wtmp2 - ands tmp1, srcin, #31 - add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop - - /* Input string is not 32-byte aligned. Rather than forcing - the padding bytes to a safe value, we calculate the syndrome - for all the bytes, but then mask off those bits of the - syndrome that are related to the padding. */ - ld1 {vdata1.16b, vdata2.16b}, [src], #32 - neg tmp1, tmp1 - cmeq vhas_nul1.16b, vdata1.16b, #0 - cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 - cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - lsl tmp1, tmp1, #1 - addp vend1.16b, vend1.16b, vend2.16b // 256->128 - mov tmp3, #~0 - addp vend1.16b, vend1.16b, vend2.16b // 128->64 - lsr tmp1, tmp3, tmp1 - - mov tmp3, vend1.d[0] - bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail - -.Lloop: - ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 - cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 - cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vend1.16b, vend2.16b - addp vend1.2d, vend1.2d, vend1.2d - mov tmp1, vend1.d[0] - cbz tmp1, .Lloop - - /* Termination condition found. Now need to establish exactly why - we terminated. */ - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - addp vend1.16b, vend1.16b, vend2.16b // 256->128 - addp vend1.16b, vend1.16b, vend2.16b // 128->64 - - mov tmp1, vend1.d[0] -.Ltail: - /* Count the trailing zeros, by bit reversing... */ - rbit tmp1, tmp1 - /* Re-bias source. */ - sub src, src, #32 - clz tmp1, tmp1 /* And counting the leading zeros. */ - /* Tmp1 is even if the target charager was found first. Otherwise - we've found the end of string and we weren't looking for NUL. */ - tst tmp1, #1 - add result, src, tmp1, lsr #1 - csel result, result, xzr, eq - ret - - .size strchr, . - strchr Index: contrib/cortex-strings/src/aarch64/strchrnul.S =================================================================== --- contrib/cortex-strings/src/aarch64/strchrnul.S +++ /dev/null @@ -1,144 +0,0 @@ -/* - strchrnul - find a character or nul in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Neon Available. - */ - -/* Arguments and results. */ -#define srcin x0 -#define chrin w1 - -#define result x0 - -#define src x2 -#define tmp1 x3 -#define wtmp2 w4 -#define tmp3 x5 - -#define vrepchr v0 -#define vdata1 v1 -#define vdata2 v2 -#define vhas_nul1 v3 -#define vhas_nul2 v4 -#define vhas_chr1 v5 -#define vhas_chr2 v6 -#define vrepmask v7 -#define vend1 v16 - -/* Core algorithm. - - For each 32-byte hunk we calculate a 64-bit syndrome value, with - two bits per byte (LSB is always in bits 0 and 1, for both big - and little-endian systems). For each tuple, bit 0 is set iff - the relevant byte matched the requested character or nul. Since the - bits in the syndrome reflect exactly the order in which things occur - in the original string a count_trailing_zeros() operation will - identify exactly which byte is causing the termination. */ - -/* Locals and temporaries. */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strchrnul - /* Magic constant 0x40100401 to allow us to identify which lane - matches the termination condition. */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 - dup vrepchr.16b, chrin - bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ - dup vrepmask.4s, wtmp2 - ands tmp1, srcin, #31 - b.eq .Lloop - - /* Input string is not 32-byte aligned. Rather than forcing - the padding bytes to a safe value, we calculate the syndrome - for all the bytes, but then mask off those bits of the - syndrome that are related to the padding. */ - ld1 {vdata1.16b, vdata2.16b}, [src], #32 - neg tmp1, tmp1 - cmeq vhas_nul1.16b, vdata1.16b, #0 - cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 - cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b - lsl tmp1, tmp1, #1 - addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - mov tmp3, #~0 - addp vend1.16b, vend1.16b, vend1.16b // 128->64 - lsr tmp1, tmp3, tmp1 - - mov tmp3, vend1.d[0] - bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail - -.Lloop: - ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 - cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 - cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b - addp vend1.2d, vend1.2d, vend1.2d - mov tmp1, vend1.d[0] - cbz tmp1, .Lloop - - /* Termination condition found. Now need to establish exactly why - we terminated. */ - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b - addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vend1.16b, vend1.16b, vend1.16b // 128->64 - - mov tmp1, vend1.d[0] -.Ltail: - /* Count the trailing zeros, by bit reversing... */ - rbit tmp1, tmp1 - /* Re-bias source. */ - sub src, src, #32 - clz tmp1, tmp1 /* ... and counting the leading zeros. */ - /* tmp1 is twice the offset into the fragment. */ - add result, src, tmp1, lsr #1 - ret - - .size strchrnul, . - strchrnul Index: contrib/cortex-strings/src/aarch64/strcmp.S =================================================================== --- contrib/cortex-strings/src/aarch64/strcmp.S +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright (c) 2012, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define result x0 - -/* Internal variables. */ -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 - - /* Start of performance-critical section -- one 64B cache line. */ -def_fn strcmp p2align=6 - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - orr syndrome, diff, has_nul - cbz syndrome, .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ - -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -.Lmutual_align: - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b .Lstart_realigned - -.Lmisaligned8: - /* We can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lmisaligned8 - sub result, data1, data2 - ret Index: contrib/cortex-strings/src/aarch64/strcpy.S =================================================================== --- contrib/cortex-strings/src/aarch64/strcpy.S +++ /dev/null @@ -1,336 +0,0 @@ -/* - strcpy/stpcpy - copy a string returning pointer to start/end. - - Copyright (c) 2013, 2014, 2015 ARM Ltd. - All Rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. - */ - -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. - - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ -#define dstin x0 -#define srcin x1 - -/* Locals and temporaries. */ -#define src x2 -#define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 - -#ifdef BUILD_STPCPY -#define STRCPY stpcpy -#else -#define STRCPY strcpy -#endif - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 -#else -#define MIN_PAGE_P2 12 -#endif - -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) - -def_fn STRCPY p2align=6 - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect strcpy to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. */ - b.gt .Lpage_cross - -.Lpage_cross_ok: - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bics has_nul2, tmp3, tmp4 - b.eq .Lbulk_entry - - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -.Lfp_gt8: - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] - str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif - ret - -.Lfp_le8: - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt .Lfp_lt4 -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret -.Lfp_lt4: - cbz pos, .Lfp_lt2 - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -.Lfp_lt2: - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -.Lbulk_entry: - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b .Lentry_no_page_cross - - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -.Lmain_loop: - stp data1, data2, [dst], #16 -.Lentry_no_page_cross: - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lmain_loop - - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif - ret - -.Lpage_cross: - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lpage_cross_ok - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. */ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lfp_le8 - bic has_nul2, tmp3, tmp4 - b .Lfp_gt8 - - .size STRCPY, . - STRCPY Index: contrib/cortex-strings/src/aarch64/strlen.S =================================================================== --- contrib/cortex-strings/src/aarch64/strlen.S +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2013-2015, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. - */ - -/* To test the page crossing code path more thoroughly, compile with - -DTEST_PAGE_CROSS - this will force all calls through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ -#define srcin x0 -#define len x0 - -/* Locals and temporaries. */ -#define src x1 -#define data1 x2 -#define data2 x3 -#define has_nul1 x4 -#define has_nul2 x5 -#define tmp1 x4 -#define tmp2 x5 -#define tmp3 x6 -#define tmp4 x7 -#define zeroones x8 - -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. A faster check - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives - false hits for characters 129..255. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - -#ifdef TEST_PAGE_CROSS -# define MIN_PAGE_SIZE 15 -#else -# define MIN_PAGE_SIZE 4096 -#endif - - /* Since strings are short on average, we check the first 16 bytes - of the string for a NUL character. In order to do an unaligned ldp - safely we have to do a page cross check first. If there is a NUL - byte we calculate the length from the 2 8-byte words using - conditional select to reduce branch mispredictions (it is unlikely - strlen will be repeatedly called on strings with the same length). - - If the string is longer than 16 bytes, we align src so don't need - further page cross checks, and process 32 bytes per iteration - using the fast NUL check. If we encounter non-ASCII characters, - fallback to a second loop using the full NUL check. - - If the page cross check fails, we read 16 bytes from an aligned - address, remove any characters before the string, and continue - in the main loop using aligned loads. Since strings crossing a - page in the first 16 bytes are rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. - - AArch64 systems have a minimum page size of 4k. We don't bother - checking for larger page sizes - the cost of setting up the correct - page size is just not worth the extra gain from a small reduction in - the cases taking the slow path. Note that we only care about - whether the first fetch, which may be misaligned, crosses a page - boundary. */ - -def_fn strlen p2align=6 - and tmp1, srcin, MIN_PAGE_SIZE - 1 - mov zeroones, REP8_01 - cmp tmp1, MIN_PAGE_SIZE - 16 - b.gt L(page_cross) - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. - Since we expect strings to be small and early-exit, - byte-swap the data now so has_null1/2 will be correct. */ - rev data1, data1 - rev data2, data2 -#endif - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(main_loop_entry) - - /* Enter with C = has_nul1 == 0. */ - csel has_nul1, has_nul1, has_nul2, cc - mov len, 8 - rev has_nul1, has_nul1 - clz tmp1, has_nul1 - csel len, xzr, len, cc - add len, len, tmp1, lsr 3 - ret - - /* The inner loop processes 32 bytes per iteration and uses the fast - NUL check. If we encounter non-ASCII characters, use a second - loop with the accurate NUL check. */ - .p2align 4 -L(main_loop_entry): - bic src, srcin, 15 - sub src, src, 16 -L(main_loop): - ldp data1, data2, [src, 32]! -.Lpage_cross_entry: - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - bne 1f - ldp data1, data2, [src, 16] - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - beq L(main_loop) - add src, src, 16 -1: - /* The fast check failed, so do the slower, accurate NUL check. */ - orr tmp2, data1, REP8_7f - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - - /* Enter with C = has_nul1 == 0. */ -L(tail): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, cc - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, cc -#endif - sub len, src, srcin - rev has_nul1, has_nul1 - add tmp2, len, 8 - clz tmp1, has_nul1 - csel len, len, tmp2, cc - add len, len, tmp1, lsr 3 - ret - -L(nonascii_loop): - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - bne L(tail) - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - b L(tail) - - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede - srcin to 0x7f, so we ignore any NUL bytes before the string. - Then continue in the aligned loop. */ -L(page_cross): - bic src, srcin, 15 - ldp data1, data2, [src] - lsl tmp1, srcin, 3 - mov tmp4, -1 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr tmp1, tmp1, REP8_80 - orn data1, data1, tmp1 - orn tmp2, data2, tmp1 - tst srcin, 8 - csel data1, data1, tmp4, eq - csel data2, data2, tmp2, eq - b L(page_cross_entry) - - .size strlen, . - strlen Index: contrib/cortex-strings/src/aarch64/strncmp.S =================================================================== --- contrib/cortex-strings/src/aarch64/strncmp.S +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 - - .text - .p2align 6 - .rep 7 - nop /* Pad so that the loop below fits a cache line. */ - .endr -def_fn strncmp - cbz limit, .Lret0 - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lnot_limit - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -.Lnot_limit: - orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -.Lmutual_align: - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned - -.Lret0: - mov result, #0 - ret - - .p2align 6 -.Lmisaligned8: - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - ret - .size strncmp, . - strncmp Index: contrib/cortex-strings/src/aarch64/strnlen.S =================================================================== --- contrib/cortex-strings/src/aarch64/strnlen.S +++ /dev/null @@ -1,181 +0,0 @@ -/* strnlen - calculate the length of a string with limit. - - Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -/* Arguments and results. */ -#define srcin x0 -#define len x0 -#define limit x1 - -/* Locals and temporaries. */ -#define src x2 -#define data1 x3 -#define data2 x4 -#define data2a x5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define pos x13 -#define limit_wd x14 - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - .text - .p2align 6 -.Lstart: - /* Pre-pad to ensure critical loop begins an icache line. */ - .rep 7 - nop - .endr - /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: - mov len, limit - ret - -def_fn strnlen - cbz limit, .Lhit_limit - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ - - /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - subs limit_wd, limit_wd, #1 - orr tmp1, has_nul1, has_nul2 - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop - /* End of critical section -- keep to one 64Byte cache line. */ - - orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ - - /* We know there's a null in the final Qword. The easiest thing - to do now is work out the length of the string and return - MIN (len, limit). */ - - sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -#ifdef __AARCH64EB__ - mov data2, data1 -#endif - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 -#endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ - cmp len, limit - csel len, len, limit, ls /* Return the lower value. */ - ret - -.Lmisaligned: - /* Deal with a partial first word. - We're doing two things in parallel here; - 1) Calculate the number of words (but avoiding overflow if - limit is near ULONG_MAX) - to do this we need to work out - limit + tmp1 - 1 as a 65-bit value before shifting it; - 2) Load and mask the initial data words - we force the bytes - before the ones we are interested in to 0xff - this ensures - early bytes will not hit any zero detection. */ - sub limit_wd, limit, #1 - neg tmp4, tmp1 - cmp tmp1, #8 - - and tmp3, limit_wd, #15 - lsr limit_wd, limit_wd, #4 - mov tmp2, #~0 - - ldp data1, data2, [src], #16 - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ - add tmp3, tmp3, tmp1 - -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit_wd, tmp3, lsr #4 - - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned - .size strnlen, . - .Lstart /* Include pre-padding in size. */ Index: contrib/cortex-strings/src/arm/memchr.S =================================================================== --- contrib/cortex-strings/src/arm/memchr.S +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2010-2011, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Written by Dave Gilbert - - This memchr routine is optimised on a Cortex-A9 and should work on - all ARMv7 processors. It has a fast past for short sizes, and has - an optimised path for large data sets; the worst case is finding the - match early in a large data set. - - */ - -@ 2011-02-07 david.gilbert@linaro.org -@ Extracted from local git a5b438d861 -@ 2011-07-14 david.gilbert@linaro.org -@ Import endianness fix from local git ea786f1b -@ 2011-12-07 david.gilbert@linaro.org -@ Removed unneeded cbz from align loop - - .syntax unified - .arch armv7-a - -@ this lets us check a flag in a 00/ff byte easily in either endianness -#ifdef __ARMEB__ -#define CHARTSTMASK(c) 1<<(31-(c*8)) -#else -#define CHARTSTMASK(c) 1<<(c*8) -#endif - .text - .thumb - -@ --------------------------------------------------------------------------- - .thumb_func - .align 2 - .p2align 4,,15 - .global memchr - .type memchr,%function -memchr: - @ r0 = start of memory to scan - @ r1 = character to look for - @ r2 = length - @ returns r0 = pointer to character or NULL if not found - and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char - - cmp r2,#16 @ If it's short don't bother with anything clever - blt 20f - - tst r0, #7 @ If it's already aligned skip the next bit - beq 10f - - @ Work up to an aligned point -5: - ldrb r3, [r0],#1 - subs r2, r2, #1 - cmp r3, r1 - beq 50f @ If it matches exit found - tst r0, #7 - bne 5b @ If not aligned yet then do next byte - -10: - @ At this point, we are aligned, we know we have at least 8 bytes to work with - push {r4,r5,r6,r7} - orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes - orr r1, r1, r1, lsl #16 - bic r4, r2, #7 @ Number of double words to work with - mvns r7, #0 @ all F's - movs r3, #0 - -15: - ldmia r0!,{r5,r6} - subs r4, r4, #8 - eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target - eor r6,r6, r1 - uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 - sel r5, r3, r7 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION - uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 - sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION - cbnz r6, 60f - bne 15b @ (Flags from the subs above) If not run out of bytes then go around again - - pop {r4,r5,r6,r7} - and r1,r1,#0xff @ Get r1 back to a single character from the expansion above - and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done - -20: - cbz r2, 40f @ 0 length or hit the end already then not found - -21: @ Post aligned section, or just a short call - ldrb r3,[r0],#1 - subs r2,r2,#1 - eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub - cbz r3, 50f - bne 21b @ on r2 flags - -40: - movs r0,#0 @ not found - bx lr - -50: - subs r0,r0,#1 @ found - bx lr - -60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was - @ r0 points to the start of the double word after the one that was tested - @ r5 has the 00/ff pattern for the first word, r6 has the chained value - cmp r5, #0 - itte eq - moveq r5, r6 @ the end is in the 2nd word - subeq r0,r0,#3 @ Points to 2nd byte of 2nd word - subne r0,r0,#7 @ or 2nd byte of 1st word - - @ r0 currently points to the 3rd byte of the word containing the hit - tst r5, # CHARTSTMASK(0) @ 1st character - bne 61f - adds r0,r0,#1 - tst r5, # CHARTSTMASK(1) @ 2nd character - ittt eq - addeq r0,r0,#1 - tsteq r5, # (3<<15) @ 2nd & 3rd character - @ If not the 3rd must be the last one - addeq r0,r0,#1 - -61: - pop {r4,r5,r6,r7} - subs r0,r0,#1 - bx lr Index: contrib/cortex-strings/src/arm/memcpy.S =================================================================== --- contrib/cortex-strings/src/arm/memcpy.S +++ /dev/null @@ -1,617 +0,0 @@ -/* Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - This memcpy routine is optimised for Cortex-A15 cores and takes advantage - of VFP or NEON when built with the appropriate flags. - - Assumptions: - - ARMv6 (ARMv7-a if using Neon) - ARM state - Unaligned accesses - - */ - - .syntax unified - /* This implementation requires ARM state. */ - .arm - -#ifdef __ARM_NEON__ - - .fpu neon - .arch armv7-a -# define FRAME_SIZE 4 -# define USE_VFP -# define USE_NEON - -#elif !defined (__SOFTFP__) - - .arch armv6 - .fpu vfpv2 -# define FRAME_SIZE 32 -# define USE_VFP - -#else - .arch armv6 -# define FRAME_SIZE 32 - -#endif - -/* Old versions of GAS incorrectly implement the NEON align semantics. */ -#ifdef BROKEN_ASM_NEON_ALIGN -#define ALIGN(addr, align) addr,:align -#else -#define ALIGN(addr, align) addr:align -#endif - -#define PC_OFFSET 8 /* PC pipeline compensation. */ -#define INSN_SIZE 4 - -/* Call parameters. */ -#define dstin r0 -#define src r1 -#define count r2 - -/* Locals. */ -#define tmp1 r3 -#define dst ip -#define tmp2 r10 - -#ifndef USE_NEON -/* For bulk copies using GP registers. */ -#define A_l r2 /* Call-clobbered. */ -#define A_h r3 /* Call-clobbered. */ -#define B_l r4 -#define B_h r5 -#define C_l r6 -#define C_h r7 -#define D_l r8 -#define D_h r9 -#endif - -/* Number of lines ahead to pre-fetch data. If you change this the code - below will need adjustment to compensate. */ - -#define prefetch_lines 5 - -#ifdef USE_VFP - .macro cpy_line_vfp vreg, base - vstr \vreg, [dst, #\base] - vldr \vreg, [src, #\base] - vstr d0, [dst, #\base + 8] - vldr d0, [src, #\base + 8] - vstr d1, [dst, #\base + 16] - vldr d1, [src, #\base + 16] - vstr d2, [dst, #\base + 24] - vldr d2, [src, #\base + 24] - vstr \vreg, [dst, #\base + 32] - vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] - vstr d0, [dst, #\base + 40] - vldr d0, [src, #\base + 40] - vstr d1, [dst, #\base + 48] - vldr d1, [src, #\base + 48] - vstr d2, [dst, #\base + 56] - vldr d2, [src, #\base + 56] - .endm - - .macro cpy_tail_vfp vreg, base - vstr \vreg, [dst, #\base] - vldr \vreg, [src, #\base] - vstr d0, [dst, #\base + 8] - vldr d0, [src, #\base + 8] - vstr d1, [dst, #\base + 16] - vldr d1, [src, #\base + 16] - vstr d2, [dst, #\base + 24] - vldr d2, [src, #\base + 24] - vstr \vreg, [dst, #\base + 32] - vstr d0, [dst, #\base + 40] - vldr d0, [src, #\base + 40] - vstr d1, [dst, #\base + 48] - vldr d1, [src, #\base + 48] - vstr d2, [dst, #\base + 56] - vldr d2, [src, #\base + 56] - .endm -#endif - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memcpy p2align=6 - - mov dst, dstin /* Preserve dstin, we need to return it. */ - cmp count, #64 - bge .Lcpy_not_short - /* Deal with small copies quickly by dropping straight into the - exit block. */ - -.Ltail63unaligned: -#ifdef USE_NEON - and tmp1, count, #0x38 - rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) - add pc, pc, tmp1 - vld1.8 {d0}, [src]! /* 14 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 12 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 10 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 8 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 6 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 4 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 2 words to go. */ - vst1.8 {d0}, [dst]! - - tst count, #4 - ldrne tmp1, [src], #4 - strne tmp1, [dst], #4 -#else - /* Copy up to 15 full words of data. May not be aligned. */ - /* Cannot use VFP for unaligned data. */ - and tmp1, count, #0x3c - add dst, dst, tmp1 - add src, src, tmp1 - rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) - /* Jump directly into the sequence below at the correct offset. */ - add pc, pc, tmp1, lsl #1 - - ldr tmp1, [src, #-60] /* 15 words to go. */ - str tmp1, [dst, #-60] - - ldr tmp1, [src, #-56] /* 14 words to go. */ - str tmp1, [dst, #-56] - ldr tmp1, [src, #-52] - str tmp1, [dst, #-52] - - ldr tmp1, [src, #-48] /* 12 words to go. */ - str tmp1, [dst, #-48] - ldr tmp1, [src, #-44] - str tmp1, [dst, #-44] - - ldr tmp1, [src, #-40] /* 10 words to go. */ - str tmp1, [dst, #-40] - ldr tmp1, [src, #-36] - str tmp1, [dst, #-36] - - ldr tmp1, [src, #-32] /* 8 words to go. */ - str tmp1, [dst, #-32] - ldr tmp1, [src, #-28] - str tmp1, [dst, #-28] - - ldr tmp1, [src, #-24] /* 6 words to go. */ - str tmp1, [dst, #-24] - ldr tmp1, [src, #-20] - str tmp1, [dst, #-20] - - ldr tmp1, [src, #-16] /* 4 words to go. */ - str tmp1, [dst, #-16] - ldr tmp1, [src, #-12] - str tmp1, [dst, #-12] - - ldr tmp1, [src, #-8] /* 2 words to go. */ - str tmp1, [dst, #-8] - ldr tmp1, [src, #-4] - str tmp1, [dst, #-4] -#endif - - lsls count, count, #31 - ldrhcs tmp1, [src], #2 - ldrbne src, [src] /* Src is dead, use as a scratch. */ - strhcs tmp1, [dst], #2 - strbne src, [dst] - bx lr - -.Lcpy_not_short: - /* At least 64 bytes to copy, but don't know the alignment yet. */ - str tmp2, [sp, #-FRAME_SIZE]! - and tmp2, src, #7 - and tmp1, dst, #7 - cmp tmp1, tmp2 - bne .Lcpy_notaligned - -#ifdef USE_VFP - /* Magic dust alert! Force VFP on Cortex-A9. Experiments show - that the FP pipeline is much better at streaming loads and - stores. This is outside the critical loop. */ - vmov.f32 s0, s0 -#endif - - /* SRC and DST have the same mutual 64-bit alignment, but we may - still need to pre-copy some bytes to get to natural alignment. - We bring SRC and DST into full 64-bit alignment. */ - lsls tmp2, dst, #29 - beq 1f - rsbs tmp2, tmp2, #0 - sub count, count, tmp2, lsr #29 - ldrmi tmp1, [src], #4 - strmi tmp1, [dst], #4 - lsls tmp2, tmp2, #2 - ldrhcs tmp1, [src], #2 - ldrbne tmp2, [src], #1 - strhcs tmp1, [dst], #2 - strbne tmp2, [dst], #1 - -1: - subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned - - cmp tmp2, #512 - bge .Lcpy_body_long - -.Lcpy_body_medium: /* Count in tmp2. */ -#ifdef USE_VFP -1: - vldr d0, [src, #0] - subs tmp2, tmp2, #64 - vldr d1, [src, #8] - vstr d0, [dst, #0] - vldr d0, [src, #16] - vstr d1, [dst, #8] - vldr d1, [src, #24] - vstr d0, [dst, #16] - vldr d0, [src, #32] - vstr d1, [dst, #24] - vldr d1, [src, #40] - vstr d0, [dst, #32] - vldr d0, [src, #48] - vstr d1, [dst, #40] - vldr d1, [src, #56] - vstr d0, [dst, #48] - add src, src, #64 - vstr d1, [dst, #56] - add dst, dst, #64 - bge 1b - tst tmp2, #0x3f - beq .Ldone - -.Ltail63aligned: /* Count in tmp2. */ - and tmp1, tmp2, #0x38 - add dst, dst, tmp1 - add src, src, tmp1 - rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) - add pc, pc, tmp1 - - vldr d0, [src, #-56] /* 14 words to go. */ - vstr d0, [dst, #-56] - vldr d0, [src, #-48] /* 12 words to go. */ - vstr d0, [dst, #-48] - vldr d0, [src, #-40] /* 10 words to go. */ - vstr d0, [dst, #-40] - vldr d0, [src, #-32] /* 8 words to go. */ - vstr d0, [dst, #-32] - vldr d0, [src, #-24] /* 6 words to go. */ - vstr d0, [dst, #-24] - vldr d0, [src, #-16] /* 4 words to go. */ - vstr d0, [dst, #-16] - vldr d0, [src, #-8] /* 2 words to go. */ - vstr d0, [dst, #-8] -#else - sub src, src, #8 - sub dst, dst, #8 -1: - ldrd A_l, A_h, [src, #8] - strd A_l, A_h, [dst, #8] - ldrd A_l, A_h, [src, #16] - strd A_l, A_h, [dst, #16] - ldrd A_l, A_h, [src, #24] - strd A_l, A_h, [dst, #24] - ldrd A_l, A_h, [src, #32] - strd A_l, A_h, [dst, #32] - ldrd A_l, A_h, [src, #40] - strd A_l, A_h, [dst, #40] - ldrd A_l, A_h, [src, #48] - strd A_l, A_h, [dst, #48] - ldrd A_l, A_h, [src, #56] - strd A_l, A_h, [dst, #56] - ldrd A_l, A_h, [src, #64]! - strd A_l, A_h, [dst, #64]! - subs tmp2, tmp2, #64 - bge 1b - tst tmp2, #0x3f - bne 1f - ldr tmp2,[sp], #FRAME_SIZE - bx lr -1: - add src, src, #8 - add dst, dst, #8 - -.Ltail63aligned: /* Count in tmp2. */ - /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but - we know that the src and dest are 64-bit aligned so we can use - LDRD/STRD to improve efficiency. */ - /* TMP2 is now negative, but we don't care about that. The bottom - six bits still tell us how many bytes are left to copy. */ - - and tmp1, tmp2, #0x38 - add dst, dst, tmp1 - add src, src, tmp1 - rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) - add pc, pc, tmp1 - ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ - strd A_l, A_h, [dst, #-56] - ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ - strd A_l, A_h, [dst, #-48] - ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ - strd A_l, A_h, [dst, #-40] - ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ - strd A_l, A_h, [dst, #-32] - ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ - strd A_l, A_h, [dst, #-24] - ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ - strd A_l, A_h, [dst, #-16] - ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ - strd A_l, A_h, [dst, #-8] - -#endif - tst tmp2, #4 - ldrne tmp1, [src], #4 - strne tmp1, [dst], #4 - lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ - ldrhcs tmp1, [src], #2 - ldrbne tmp2, [src] - strhcs tmp1, [dst], #2 - strbne tmp2, [dst] - -.Ldone: - ldr tmp2, [sp], #FRAME_SIZE - bx lr - -.Lcpy_body_long: /* Count in tmp2. */ - - /* Long copy. We know that there's at least (prefetch_lines * 64) - bytes to go. */ -#ifdef USE_VFP - /* Don't use PLD. Instead, read some data in advance of the current - copy position into a register. This should act like a PLD - operation but we won't have to repeat the transfer. */ - - vldr d3, [src, #0] - vldr d4, [src, #64] - vldr d5, [src, #128] - vldr d6, [src, #192] - vldr d7, [src, #256] - - vldr d0, [src, #8] - vldr d1, [src, #16] - vldr d2, [src, #24] - add src, src, #32 - - subs tmp2, tmp2, #prefetch_lines * 64 * 2 - blt 2f -1: - cpy_line_vfp d3, 0 - cpy_line_vfp d4, 64 - cpy_line_vfp d5, 128 - add dst, dst, #3 * 64 - add src, src, #3 * 64 - cpy_line_vfp d6, 0 - cpy_line_vfp d7, 64 - add dst, dst, #2 * 64 - add src, src, #2 * 64 - subs tmp2, tmp2, #prefetch_lines * 64 - bge 1b - -2: - cpy_tail_vfp d3, 0 - cpy_tail_vfp d4, 64 - cpy_tail_vfp d5, 128 - add src, src, #3 * 64 - add dst, dst, #3 * 64 - cpy_tail_vfp d6, 0 - vstr d7, [dst, #64] - vldr d7, [src, #64] - vstr d0, [dst, #64 + 8] - vldr d0, [src, #64 + 8] - vstr d1, [dst, #64 + 16] - vldr d1, [src, #64 + 16] - vstr d2, [dst, #64 + 24] - vldr d2, [src, #64 + 24] - vstr d7, [dst, #64 + 32] - add src, src, #96 - vstr d0, [dst, #64 + 40] - vstr d1, [dst, #64 + 48] - vstr d2, [dst, #64 + 56] - add dst, dst, #128 - add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium -#else - /* Long copy. Use an SMS style loop to maximize the I/O - bandwidth of the core. We don't have enough spare registers - to synthesise prefetching, so use PLD operations. */ - /* Pre-bias src and dst. */ - sub src, src, #8 - sub dst, dst, #8 - pld [src, #8] - pld [src, #72] - subs tmp2, tmp2, #64 - pld [src, #136] - ldrd A_l, A_h, [src, #8] - strd B_l, B_h, [sp, #8] - ldrd B_l, B_h, [src, #16] - strd C_l, C_h, [sp, #16] - ldrd C_l, C_h, [src, #24] - strd D_l, D_h, [sp, #24] - pld [src, #200] - ldrd D_l, D_h, [src, #32]! - b 1f - .p2align 6 -2: - pld [src, #232] - strd A_l, A_h, [dst, #40] - ldrd A_l, A_h, [src, #40] - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [src, #48] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [src, #56] - strd D_l, D_h, [dst, #64]! - ldrd D_l, D_h, [src, #64]! - subs tmp2, tmp2, #64 -1: - strd A_l, A_h, [dst, #8] - ldrd A_l, A_h, [src, #8] - strd B_l, B_h, [dst, #16] - ldrd B_l, B_h, [src, #16] - strd C_l, C_h, [dst, #24] - ldrd C_l, C_h, [src, #24] - strd D_l, D_h, [dst, #32] - ldrd D_l, D_h, [src, #32] - bcs 2b - /* Save the remaining bytes and restore the callee-saved regs. */ - strd A_l, A_h, [dst, #40] - add src, src, #40 - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [sp, #8] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [sp, #16] - strd D_l, D_h, [dst, #64] - ldrd D_l, D_h, [sp, #24] - add dst, dst, #72 - tst tmp2, #0x3f - bne .Ltail63aligned - ldr tmp2, [sp], #FRAME_SIZE - bx lr -#endif - -.Lcpy_notaligned: - pld [src] - pld [src, #64] - /* There's at least 64 bytes to copy, but there is no mutual - alignment. */ - /* Bring DST to 64-bit alignment. */ - lsls tmp2, dst, #29 - pld [src, #(2 * 64)] - beq 1f - rsbs tmp2, tmp2, #0 - sub count, count, tmp2, lsr #29 - ldrmi tmp1, [src], #4 - strmi tmp1, [dst], #4 - lsls tmp2, tmp2, #2 - ldrbne tmp1, [src], #1 - ldrhcs tmp2, [src], #2 - strbne tmp1, [dst], #1 - strhcs tmp2, [dst], #2 -1: - pld [src, #(3 * 64)] - subs count, count, #64 - ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned - pld [src, #(4 * 64)] - -#ifdef USE_NEON - vld1.8 {d0-d3}, [src]! - vld1.8 {d4-d7}, [src]! - subs count, count, #64 - bmi 2f -1: - pld [src, #(4 * 64)] - vst1.8 {d0-d3}, [ALIGN (dst, 64)]! - vld1.8 {d0-d3}, [src]! - vst1.8 {d4-d7}, [ALIGN (dst, 64)]! - vld1.8 {d4-d7}, [src]! - subs count, count, #64 - bpl 1b -2: - vst1.8 {d0-d3}, [ALIGN (dst, 64)]! - vst1.8 {d4-d7}, [ALIGN (dst, 64)]! - ands count, count, #0x3f -#else - /* Use an SMS style loop to maximize the I/O bandwidth. */ - sub src, src, #4 - sub dst, dst, #8 - subs tmp2, count, #64 /* Use tmp2 for count. */ - ldr A_l, [src, #4] - ldr A_h, [src, #8] - strd B_l, B_h, [sp, #8] - ldr B_l, [src, #12] - ldr B_h, [src, #16] - strd C_l, C_h, [sp, #16] - ldr C_l, [src, #20] - ldr C_h, [src, #24] - strd D_l, D_h, [sp, #24] - ldr D_l, [src, #28] - ldr D_h, [src, #32]! - b 1f - .p2align 6 -2: - pld [src, #(5 * 64) - (32 - 4)] - strd A_l, A_h, [dst, #40] - ldr A_l, [src, #36] - ldr A_h, [src, #40] - strd B_l, B_h, [dst, #48] - ldr B_l, [src, #44] - ldr B_h, [src, #48] - strd C_l, C_h, [dst, #56] - ldr C_l, [src, #52] - ldr C_h, [src, #56] - strd D_l, D_h, [dst, #64]! - ldr D_l, [src, #60] - ldr D_h, [src, #64]! - subs tmp2, tmp2, #64 -1: - strd A_l, A_h, [dst, #8] - ldr A_l, [src, #4] - ldr A_h, [src, #8] - strd B_l, B_h, [dst, #16] - ldr B_l, [src, #12] - ldr B_h, [src, #16] - strd C_l, C_h, [dst, #24] - ldr C_l, [src, #20] - ldr C_h, [src, #24] - strd D_l, D_h, [dst, #32] - ldr D_l, [src, #28] - ldr D_h, [src, #32] - bcs 2b - - /* Save the remaining bytes and restore the callee-saved regs. */ - strd A_l, A_h, [dst, #40] - add src, src, #36 - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [sp, #8] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [sp, #16] - strd D_l, D_h, [dst, #64] - ldrd D_l, D_h, [sp, #24] - add dst, dst, #72 - ands count, tmp2, #0x3f -#endif - ldr tmp2, [sp], #FRAME_SIZE - bne .Ltail63unaligned - bx lr - - .size memcpy, . - memcpy Index: contrib/cortex-strings/src/arm/memset.S =================================================================== --- contrib/cortex-strings/src/arm/memset.S +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2010-2011, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Written by Dave Gilbert - - This memset routine is optimised on a Cortex-A9 and should work on - all ARMv7 processors. - - */ - - .syntax unified - .arch armv7-a - -@ 2011-08-30 david.gilbert@linaro.org -@ Extracted from local git 2f11b436 - -@ this lets us check a flag in a 00/ff byte easily in either endianness -#ifdef __ARMEB__ -#define CHARTSTMASK(c) 1<<(31-(c*8)) -#else -#define CHARTSTMASK(c) 1<<(c*8) -#endif - .text - .thumb - -@ --------------------------------------------------------------------------- - .thumb_func - .align 2 - .p2align 4,,15 - .global memset - .type memset,%function -memset: - @ r0 = address - @ r1 = character - @ r2 = count - @ returns original address in r0 - - mov r3, r0 @ Leave r0 alone - cbz r2, 10f @ Exit if 0 length - - tst r0, #7 - beq 2f @ Already aligned - - @ Ok, so we're misaligned here -1: - strb r1, [r3], #1 - subs r2,r2,#1 - tst r3, #7 - cbz r2, 10f @ Exit if we hit the end - bne 1b @ go round again if still misaligned - -2: - @ OK, so we're aligned - push {r4,r5,r6,r7} - bics r4, r2, #15 @ if less than 16 bytes then need to finish it off - beq 5f - -3: - @ POSIX says that ch is cast to an unsigned char. A uxtb is one - @ byte and takes two cycles, where an AND is four bytes but one - @ cycle. - and r1, #0xFF - orr r1, r1, r1, lsl#8 @ Same character into all bytes - orr r1, r1, r1, lsl#16 - mov r5,r1 - mov r6,r1 - mov r7,r1 - -4: - subs r4,r4,#16 - stmia r3!,{r1,r5,r6,r7} - bne 4b - and r2,r2,#15 - - @ At this point we're still aligned and we have upto align-1 bytes left to right - @ we can avoid some of the byte-at-a time now by testing for some big chunks - tst r2,#8 - itt ne - subne r2,r2,#8 - stmiane r3!,{r1,r5} - -5: - pop {r4,r5,r6,r7} - cbz r2, 10f - - @ Got to do any last < alignment bytes -6: - subs r2,r2,#1 - strb r1,[r3],#1 - bne 6b - -10: - bx lr @ goodbye Index: contrib/cortex-strings/src/arm/strchr.S =================================================================== --- contrib/cortex-strings/src/arm/strchr.S +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2010-2011, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Written by Dave Gilbert - - A very simple strchr routine, from benchmarks on A9 it's a bit faster than - the current version in eglibc (2.12.1-0ubuntu14 package) - I don't think doing a word at a time version is worth it since a lot - of strchr cases are very short anyway. - - */ - -@ 2011-02-07 david.gilbert@linaro.org -@ Extracted from local git a5b438d861 - - .syntax unified - .arch armv7-a - - .text - .thumb - -@ --------------------------------------------------------------------------- - - .thumb_func - .align 2 - .p2align 4,,15 - .global strchr - .type strchr,%function -strchr: - @ r0 = start of string - @ r1 = character to match - @ returns NULL for no match, or a pointer to the match - and r1,r1, #255 - -1: - ldrb r2,[r0],#1 - cmp r2,r1 - cbz r2,10f - bne 1b - - @ We're here if it matched -5: - subs r0,r0,#1 - bx lr - -10: - @ We're here if we ran off the end - cmp r1, #0 @ Corner case - you're allowed to search for the nil and get a pointer to it - beq 5b @ A bit messy, if it's common we should branch at the start to a special loop - mov r0,#0 - bx lr Index: contrib/cortex-strings/src/arm/strcmp.S =================================================================== --- contrib/cortex-strings/src/arm/strcmp.S +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Copyright (c) 2012-2014 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Implementation of strcmp for ARMv7 when DSP instructions are - available. Use ldrd to support wider loads, provided the data - is sufficiently aligned. Use saturating arithmetic to optimize - the compares. */ - -/* Build Options: - STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first - byte in the string. If comparing completely random strings - the pre-check will save time, since there is a very high - probability of a mismatch in the first character: we save - significant overhead if this is the common case. However, - if strings are likely to be identical (eg because we're - verifying a hit in a hash table), then this check is largely - redundant. */ - -#define STRCMP_NO_PRECHECK 0 - - /* This version uses Thumb-2 code. */ - .thumb - .syntax unified - -#ifdef __ARM_BIG_ENDIAN -#define S2LO lsl -#define S2LOEQ lsleq -#define S2HI lsr -#define MSB 0x000000ff -#define LSB 0xff000000 -#define BYTE0_OFFSET 24 -#define BYTE1_OFFSET 16 -#define BYTE2_OFFSET 8 -#define BYTE3_OFFSET 0 -#else /* not __ARM_BIG_ENDIAN */ -#define S2LO lsr -#define S2LOEQ lsreq -#define S2HI lsl -#define BYTE0_OFFSET 0 -#define BYTE1_OFFSET 8 -#define BYTE2_OFFSET 16 -#define BYTE3_OFFSET 24 -#define MSB 0xff000000 -#define LSB 0x000000ff -#endif /* not __ARM_BIG_ENDIAN */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -/* Parameters and result. */ -#define src1 r0 -#define src2 r1 -#define result r0 /* Overlaps src1. */ - -/* Internal variables. */ -#define tmp1 r4 -#define tmp2 r5 -#define const_m1 r12 - -/* Additional internal variables for 64-bit aligned data. */ -#define data1a r2 -#define data1b r3 -#define data2a r6 -#define data2b r7 -#define syndrome_a tmp1 -#define syndrome_b tmp2 - -/* Additional internal variables for 32-bit aligned data. */ -#define data1 r2 -#define data2 r3 -#define syndrome tmp2 - - - /* Macro to compute and return the result value for word-aligned - cases. */ - .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 -#ifdef __ARM_BIG_ENDIAN - /* If data1 contains a zero byte, then syndrome will contain a 1 in - bit 7 of that byte. Otherwise, the highest set bit in the - syndrome will highlight the first different bit. It is therefore - sufficient to extract the eight bits starting with the syndrome - bit. */ - clz tmp1, \synd - lsl r1, \d2, tmp1 - .if \restore_r6 - ldrd r6, r7, [sp, #8] - .endif - .cfi_restore 6 - .cfi_restore 7 - lsl \d1, \d1, tmp1 - .cfi_remember_state - lsr result, \d1, #24 - ldrd r4, r5, [sp], #16 - .cfi_restore 4 - .cfi_restore 5 - sub result, result, r1, lsr #24 - bx lr -#else - /* To use the big-endian trick we'd have to reverse all three words. - that's slower than this approach. */ - rev \synd, \synd - clz tmp1, \synd - bic tmp1, tmp1, #7 - lsr r1, \d2, tmp1 - .cfi_remember_state - .if \restore_r6 - ldrd r6, r7, [sp, #8] - .endif - .cfi_restore 6 - .cfi_restore 7 - lsr \d1, \d1, tmp1 - and result, \d1, #255 - and r1, r1, #255 - ldrd r4, r5, [sp], #16 - .cfi_restore 4 - .cfi_restore 5 - sub result, result, r1 - - bx lr -#endif - .endm - - .text - .p2align 5 -.Lstrcmp_start_addr: -#if STRCMP_NO_PRECHECK == 0 -.Lfastpath_exit: - sub r0, r2, r3 - bx lr - nop -#endif -def_fn strcmp -#if STRCMP_NO_PRECHECK == 0 - ldrb r2, [src1] - ldrb r3, [src2] - cmp r2, #1 - it cs - cmpcs r2, r3 - bne .Lfastpath_exit -#endif - .cfi_startproc - strd r4, r5, [sp, #-16]! - .cfi_def_cfa_offset 16 - .cfi_offset 4, -16 - .cfi_offset 5, -12 - orr tmp1, src1, src2 - strd r6, r7, [sp, #8] - .cfi_offset 6, -8 - .cfi_offset 7, -4 - mvn const_m1, #0 - lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 - -.Lnot_aligned: - eor tmp1, src1, src2 - tst tmp1, #7 - bne .Lmisaligned8 - - /* Deal with mutual misalignment by aligning downwards and then - masking off the unwanted loaded data to prevent a difference. */ - and tmp1, src1, #7 - bic src1, src1, #7 - and tmp2, tmp1, #3 - bic src2, src2, #7 - lsl tmp2, tmp2, #3 /* Bytes -> bits. */ - ldrd data1a, data1b, [src1], #16 - tst tmp1, #4 - ldrd data2a, data2b, [src2], #16 - /* In thumb code we can't use MVN with a register shift, but - we do have ORN. */ - S2HI tmp1, const_m1, tmp2 - orn data1a, data1a, tmp1 - orn data2a, data2a, tmp1 - beq .Lstart_realigned8 - orn data1b, data1b, tmp1 - mov data1a, const_m1 - orn data2b, data2b, tmp1 - mov data2a, const_m1 - b .Lstart_realigned8 - - /* Unwind the inner loop by a factor of 2, giving 16 bytes per - pass. */ - .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ - .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: - ldrd data1a, data1b, [src1], #16 - ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: - uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ - eor syndrome_a, data1a, data2a - sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a - uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ - eor syndrome_b, data1b, data2b - sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b - - ldrd data1a, data1b, [src1, #-8] - ldrd data2a, data2b, [src2, #-8] - uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ - eor syndrome_a, data1a, data2a - sel syndrome_a, syndrome_a, const_m1 - uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ - eor syndrome_b, data1b, data2b - sel syndrome_b, syndrome_b, const_m1 - /* Can't use CBZ for backwards branch. */ - orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 - -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a - -.Ldiff_in_b: - strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 - -.Ldiff_in_a: - .cfi_restore_state - strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 - - .cfi_restore_state -.Lmisaligned8: - tst tmp1, #3 - bne .Lmisaligned4 - ands tmp1, src1, #3 - bne .Lmutual_align4 - - /* Unrolled by a factor of 2, to reduce the number of post-increment - operations. */ -.Lloop_aligned4: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned4: - uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ - eor syndrome, data1, data2 - sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done - ldr data1, [src1, #-4] - ldr data2, [src2, #-4] - uadd8 syndrome, data1, const_m1 - eor syndrome, data1, data2 - sel syndrome, syndrome, const_m1 - cmp syndrome, #0 - beq .Lloop_aligned4 - -.Laligned4_done: - strcmp_epilogue_aligned syndrome, data1, data2, 0 - -.Lmutual_align4: - .cfi_restore_state - /* Deal with mutual misalignment by aligning downwards and then - masking off the unwanted loaded data to prevent a difference. */ - lsl tmp1, tmp1, #3 /* Bytes -> bits. */ - bic src1, src1, #3 - ldr data1, [src1], #8 - bic src2, src2, #3 - ldr data2, [src2], #8 - - /* In thumb code we can't use MVN with a register shift, but - we do have ORN. */ - S2HI tmp1, const_m1, tmp1 - orn data1, data1, tmp1 - orn data2, data2, tmp1 - b .Lstart_realigned4 - -.Lmisaligned4: - ands tmp1, src1, #3 - beq .Lsrc1_aligned - sub src2, src2, tmp1 - bic src1, src1, #3 - lsls tmp1, tmp1, #31 - ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 - -#if STRCMP_NO_PRECHECK == 1 - ldrb data2, [src2, #1] - uxtb tmp1, data1, ror #BYTE1_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m2: - ldrb data2, [src2, #2] - uxtb tmp1, data1, ror #BYTE2_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m1: - ldrb data2, [src2, #3] - uxtb tmp1, data1, ror #BYTE3_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - add src2, src2, #4 - cbnz data2, .Lsrc1_aligned -#else /* STRCMP_NO_PRECHECK */ - /* If we've done the pre-check, then we don't need to check the - first byte again here. */ - ldrb data2, [src2, #2] - uxtb tmp1, data1, ror #BYTE2_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m2: - ldrb data2, [src2, #3] - uxtb tmp1, data1, ror #BYTE3_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 -#endif - -.Lmisaligned_exit: - .cfi_remember_state - mov result, tmp1 - ldr r4, [sp], #16 - .cfi_restore 4 - bx lr - -#if STRCMP_NO_PRECHECK == 0 -.Laligned_m1: - add src2, src2, #4 -#endif -.Lsrc1_aligned: - .cfi_restore_state - /* src1 is word aligned, but src2 has no common alignment - with it. */ - ldr data1, [src1], #4 - lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ - - bic src2, src2, #3 - ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ - - /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: - bic tmp1, data1, #MSB - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #8 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #24 - bne 6f - ldr data1, [src1], #4 - b .Loverlap3 -4: - S2LO data2, data2, #8 - b .Lstrcmp_tail - -5: - bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal - - /* We can only get here if the MSB of data1 contains 0, so - fast-path the exit. */ - ldrb result, [src2] - .cfi_remember_state - ldrd r4, r5, [sp], #16 - .cfi_restore 4 - .cfi_restore 5 - /* R6/7 Not used in this sequence. */ - .cfi_restore 6 - .cfi_restore 7 - neg result, result - bx lr - -6: - .cfi_restore_state - S2LO data1, data1, #24 - and data2, data2, #LSB - b .Lstrcmp_tail - - .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap2: - and tmp1, data1, const_m1, S2LO #16 - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #16 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #16 - bne 6f - ldr data1, [src1], #4 - b .Loverlap2 -4: - S2LO data2, data2, #16 - b .Lstrcmp_tail -5: - ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal - - ldrh data2, [src2] - S2LO data1, data1, #16 -#ifdef __ARM_BIG_ENDIAN - lsl data2, data2, #16 -#endif - b .Lstrcmp_tail - -6: - S2LO data1, data1, #16 - and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail - - .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap1: - and tmp1, data1, #LSB - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #24 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #8 - bne 6f - ldr data1, [src1], #4 - b .Loverlap1 -4: - S2LO data2, data2, #24 - b .Lstrcmp_tail -5: - tst syndrome, #LSB - bne .Lstrcmp_done_equal - ldr data2, [src2] -6: - S2LO data1, data1, #8 - bic data2, data2, #MSB - b .Lstrcmp_tail - -.Lstrcmp_done_equal: - mov result, #0 - .cfi_remember_state - ldrd r4, r5, [sp], #16 - .cfi_restore 4 - .cfi_restore 5 - /* R6/7 not used in this sequence. */ - .cfi_restore 6 - .cfi_restore 7 - bx lr - -.Lstrcmp_tail: - .cfi_restore_state -#ifndef __ARM_BIG_ENDIAN - rev data1, data1 - rev data2, data2 - /* Now everything looks big-endian... */ -#endif - uadd8 tmp1, data1, const_m1 - eor tmp1, data1, data2 - sel syndrome, tmp1, const_m1 - clz tmp1, syndrome - lsl data1, data1, tmp1 - lsl data2, data2, tmp1 - lsr result, data1, #24 - ldrd r4, r5, [sp], #16 - .cfi_restore 4 - .cfi_restore 5 - /* R6/7 not used in this sequence. */ - .cfi_restore 6 - .cfi_restore 7 - sub result, result, data2, lsr #24 - bx lr - .cfi_endproc - .size strcmp, . - .Lstrcmp_start_addr Index: contrib/cortex-strings/src/thumb-2/strcpy.c =================================================================== --- contrib/cortex-strings/src/thumb-2/strcpy.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2008 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* For GLIBC: -#include -#include - -#undef strcmp -*/ - -#ifdef __thumb2__ -#define magic1(REG) "#0x01010101" -#define magic2(REG) "#0x80808080" -#else -#define magic1(REG) #REG -#define magic2(REG) #REG ", lsl #7" -#endif - -char* __attribute__((naked)) -strcpy (char* dst, const char* src) -{ - asm ( -#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \ - (defined (__thumb__) && !defined (__thumb2__))) - "pld [r1, #0]\n\t" - "eor r2, r0, r1\n\t" - "mov ip, r0\n\t" - "tst r2, #3\n\t" - "bne 4f\n\t" - "tst r1, #3\n\t" - "bne 3f\n" - "5:\n\t" -#ifndef __thumb2__ - "str r5, [sp, #-4]!\n\t" - "mov r5, #0x01\n\t" - "orr r5, r5, r5, lsl #8\n\t" - "orr r5, r5, r5, lsl #16\n\t" -#endif - - "str r4, [sp, #-4]!\n\t" - "tst r1, #4\n\t" - "ldr r3, [r1], #4\n\t" - "beq 2f\n\t" - "sub r2, r3, "magic1(r5)"\n\t" - "bics r2, r2, r3\n\t" - "tst r2, "magic2(r5)"\n\t" - "itt eq\n\t" - "streq r3, [ip], #4\n\t" - "ldreq r3, [r1], #4\n" - "bne 1f\n\t" - /* Inner loop. We now know that r1 is 64-bit aligned, so we - can safely fetch up to two words. This allows us to avoid - load stalls. */ - ".p2align 2\n" - "2:\n\t" - "pld [r1, #8]\n\t" - "ldr r4, [r1], #4\n\t" - "sub r2, r3, "magic1(r5)"\n\t" - "bics r2, r2, r3\n\t" - "tst r2, "magic2(r5)"\n\t" - "sub r2, r4, "magic1(r5)"\n\t" - "bne 1f\n\t" - "str r3, [ip], #4\n\t" - "bics r2, r2, r4\n\t" - "tst r2, "magic2(r5)"\n\t" - "itt eq\n\t" - "ldreq r3, [r1], #4\n\t" - "streq r4, [ip], #4\n\t" - "beq 2b\n\t" - "mov r3, r4\n" - "1:\n\t" -#ifdef __ARMEB__ - "rors r3, r3, #24\n\t" -#endif - "strb r3, [ip], #1\n\t" - "tst r3, #0xff\n\t" -#ifdef __ARMEL__ - "ror r3, r3, #8\n\t" -#endif - "bne 1b\n\t" - "ldr r4, [sp], #4\n\t" -#ifndef __thumb2__ - "ldr r5, [sp], #4\n\t" -#endif - "BX LR\n" - - /* Strings have the same offset from word alignment, but it's - not zero. */ - "3:\n\t" - "tst r1, #1\n\t" - "beq 1f\n\t" - "ldrb r2, [r1], #1\n\t" - "strb r2, [ip], #1\n\t" - "cmp r2, #0\n\t" - "it eq\n" - "BXEQ LR\n" - "1:\n\t" - "tst r1, #2\n\t" - "beq 5b\n\t" - "ldrh r2, [r1], #2\n\t" -#ifdef __ARMEB__ - "tst r2, #0xff00\n\t" - "iteet ne\n\t" - "strneh r2, [ip], #2\n\t" - "lsreq r2, r2, #8\n\t" - "streqb r2, [ip]\n\t" - "tstne r2, #0xff\n\t" -#else - "tst r2, #0xff\n\t" - "itet ne\n\t" - "strneh r2, [ip], #2\n\t" - "streqb r2, [ip]\n\t" - "tstne r2, #0xff00\n\t" -#endif - "bne 5b\n\t" - "BX LR\n" - - /* src and dst do not have a common word-alignement. Fall back to - byte copying. */ - "4:\n\t" - "ldrb r2, [r1], #1\n\t" - "strb r2, [ip], #1\n\t" - "cmp r2, #0\n\t" - "bne 4b\n\t" - "BX LR" - -#elif !defined (__thumb__) || defined (__thumb2__) - "mov r3, r0\n\t" - "1:\n\t" - "ldrb r2, [r1], #1\n\t" - "strb r2, [r3], #1\n\t" - "cmp r2, #0\n\t" - "bne 1b\n\t" - "BX LR" -#else - "mov r3, r0\n\t" - "1:\n\t" - "ldrb r2, [r1]\n\t" - "add r1, r1, #1\n\t" - "strb r2, [r3]\n\t" - "add r3, r3, #1\n\t" - "cmp r2, #0\n\t" - "bne 1b\n\t" - "BX LR" -#endif - ); -} -/* For GLIBC: libc_hidden_builtin_def (strcpy) */ Index: contrib/cortex-strings/src/thumb-2/strlen.S =================================================================== --- contrib/cortex-strings/src/thumb-2/strlen.S +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2010-2011,2013 Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Assumes: - ARMv6T2, AArch32 - - */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#ifdef __ARMEB__ -#define S2LO lsl -#define S2HI lsr -#else -#define S2LO lsr -#define S2HI lsl -#endif - - /* This code requires Thumb. */ - .thumb - .syntax unified - -/* Parameters and result. */ -#define srcin r0 -#define result r0 - -/* Internal variables. */ -#define src r1 -#define data1a r2 -#define data1b r3 -#define const_m1 r12 -#define const_0 r4 -#define tmp1 r4 /* Overlaps const_0 */ -#define tmp2 r5 - -def_fn strlen p2align=6 - pld [srcin, #0] - strd r4, r5, [sp, #-8]! - bic src, srcin, #7 - mvn const_m1, #0 - ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ - pld [src, #32] - bne.w .Lmisaligned8 - mov const_0, #0 - mov result, #-8 -.Lloop_aligned: - /* Bytes 0-7. */ - ldrd data1a, data1b, [src] - pld [src, #64] - add result, result, #8 -.Lstart_realigned: - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found - - /* Bytes 8-15. */ - ldrd data1a, data1b, [src, #8] - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - add result, result, #8 - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found - - /* Bytes 16-23. */ - ldrd data1a, data1b, [src, #16] - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - add result, result, #8 - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found - - /* Bytes 24-31. */ - ldrd data1a, data1b, [src, #24] - add src, src, #32 - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - add result, result, #8 - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cmp data1b, #0 - beq .Lloop_aligned - -.Lnull_found: - cmp data1a, #0 - itt eq - addeq result, result, #4 - moveq data1a, data1b -#ifndef __ARMEB__ - rev data1a, data1a -#endif - clz data1a, data1a - ldrd r4, r5, [sp], #8 - add result, result, data1a, lsr #3 /* Bits -> Bytes. */ - bx lr - -.Lmisaligned8: - ldrd data1a, data1b, [src] - and tmp2, tmp1, #3 - rsb result, tmp1, #0 - lsl tmp2, tmp2, #3 /* Bytes -> bits. */ - tst tmp1, #4 - pld [src, #64] - S2HI tmp2, const_m1, tmp2 - orn data1a, data1a, tmp2 - itt ne - ornne data1b, data1b, tmp2 - movne data1a, const_m1 - mov const_0, #0 - b .Lstart_realigned - .size strlen, . - strlen - Index: contrib/cortex-strings/src/thumb/aeabi_idiv.S =================================================================== --- contrib/cortex-strings/src/thumb/aeabi_idiv.S +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Copyright (c) 2014 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* An executable stack is *not* required for these functions. */ - -.section .note.GNU-stack,"",%progbits -.previous -.eabi_attribute 25, 1 - -/* ANSI concatenation macros. */ - -#define CONCAT1(a, b) CONCAT2(a, b) -#define CONCAT2(a, b) a ## b - -/* Use the right prefix for global labels. */ - -#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x) - -#define TYPE(x) .type SYM(x),function -#define SIZE(x) .size SYM(x), . - SYM(x) -#define LSYM(x) .x - -.macro cfi_start start_label, end_label - .pushsection .debug_frame -LSYM(Lstart_frame): - .4byte LSYM(Lend_cie) - LSYM(Lstart_cie) -LSYM(Lstart_cie): - .4byte 0xffffffff - .byte 0x1 - .ascii "\0" - .uleb128 0x1 - .sleb128 -4 - .byte 0xe - .byte 0xc - .uleb128 0xd - .uleb128 0x0 - - .align 2 -LSYM(Lend_cie): - .4byte LSYM(Lend_fde)-LSYM(Lstart_fde) -LSYM(Lstart_fde): - .4byte LSYM(Lstart_frame) - .4byte \start_label - .4byte \end_label-\start_label - .popsection -.endm - -.macro cfi_end end_label - .pushsection .debug_frame - .align 2 -LSYM(Lend_fde): - .popsection -\end_label: -.endm - -.macro THUMB_LDIV0 name signed - push {r0, lr} - movs r0, #0 - bl SYM(__aeabi_idiv0) - pop {r1, pc} -.endm - -.macro FUNC_END name - SIZE (__\name) -.endm - -.macro DIV_FUNC_END name signed - cfi_start __\name, LSYM(Lend_div0) -LSYM(Ldiv0): - THUMB_LDIV0 \name \signed - cfi_end LSYM(Lend_div0) - FUNC_END \name -.endm - -.macro THUMB_FUNC_START name - .globl SYM (\name) - TYPE (\name) - .thumb_func -SYM (\name): -.endm - -.macro FUNC_START name - .text - .globl SYM (__\name) - TYPE (__\name) - .align 0 - .force_thumb - .thumb_func - .syntax unified -SYM (__\name): -.endm - -.macro FUNC_ALIAS new old - .globl SYM (__\new) - .thumb_set SYM (__\new), SYM (__\old) -.endm - -/* Register aliases. */ -work .req r4 -dividend .req r0 -divisor .req r1 -overdone .req r2 -result .req r2 -curbit .req r3 - -/* ------------------------------------------------------------------------ */ -/* Bodies of the division and modulo routines. */ -/* ------------------------------------------------------------------------ */ -.macro BranchToDiv n, label - lsrs curbit, dividend, \n - cmp curbit, divisor - bcc \label -.endm - -.macro DoDiv n - lsrs curbit, dividend, \n - cmp curbit, divisor - bcc 1f - lsls curbit, divisor, \n - subs dividend, dividend, curbit - -1: adcs result, result -.endm - -.macro THUMB1_Div_Positive - movs result, #0 - BranchToDiv #1, LSYM(Lthumb1_div1) - BranchToDiv #4, LSYM(Lthumb1_div4) - BranchToDiv #8, LSYM(Lthumb1_div8) - BranchToDiv #12, LSYM(Lthumb1_div12) - BranchToDiv #16, LSYM(Lthumb1_div16) -LSYM(Lthumb1_div_large_positive): - movs result, #0xff - lsls divisor, divisor, #8 - rev result, result - lsrs curbit, dividend, #16 - cmp curbit, divisor - bcc 1f - asrs result, #8 - lsls divisor, divisor, #8 - beq LSYM(Ldivbyzero_waypoint) - -1: lsrs curbit, dividend, #12 - cmp curbit, divisor - bcc LSYM(Lthumb1_div12) - b LSYM(Lthumb1_div16) -LSYM(Lthumb1_div_loop): - lsrs divisor, divisor, #8 -LSYM(Lthumb1_div16): - Dodiv #15 - Dodiv #14 - Dodiv #13 - Dodiv #12 -LSYM(Lthumb1_div12): - Dodiv #11 - Dodiv #10 - Dodiv #9 - Dodiv #8 - bcs LSYM(Lthumb1_div_loop) -LSYM(Lthumb1_div8): - Dodiv #7 - Dodiv #6 - Dodiv #5 -LSYM(Lthumb1_div5): - Dodiv #4 -LSYM(Lthumb1_div4): - Dodiv #3 -LSYM(Lthumb1_div3): - Dodiv #2 -LSYM(Lthumb1_div2): - Dodiv #1 -LSYM(Lthumb1_div1): - subs divisor, dividend, divisor - bcs 1f - mov divisor, dividend - -1: adcs result, result - mov dividend, result - bx lr - -LSYM(Ldivbyzero_waypoint): - b LSYM(Ldiv0) -.endm - -.macro THUMB1_Div_Negative - lsrs result, divisor, #31 - beq 1f - rsbs divisor, divisor, #0 - -1: asrs curbit, dividend, #32 - bcc 2f - rsbs dividend, dividend, #0 - -2: eors curbit, result - movs result, #0 - mov ip, curbit - BranchToDiv #4, LSYM(Lthumb1_div_negative4) - BranchToDiv #8, LSYM(Lthumb1_div_negative8) -LSYM(Lthumb1_div_large): - movs result, #0xfc - lsls divisor, divisor, #6 - rev result, result - lsrs curbit, dividend, #8 - cmp curbit, divisor - bcc LSYM(Lthumb1_div_negative8) - - lsls divisor, divisor, #6 - asrs result, result, #6 - cmp curbit, divisor - bcc LSYM(Lthumb1_div_negative8) - - lsls divisor, divisor, #6 - asrs result, result, #6 - cmp curbit, divisor - bcc LSYM(Lthumb1_div_negative8) - - lsls divisor, divisor, #6 - beq LSYM(Ldivbyzero_negative) - asrs result, result, #6 - b LSYM(Lthumb1_div_negative8) -LSYM(Lthumb1_div_negative_loop): - lsrs divisor, divisor, #6 -LSYM(Lthumb1_div_negative8): - DoDiv #7 - DoDiv #6 - DoDiv #5 - DoDiv #4 -LSYM(Lthumb1_div_negative4): - DoDiv #3 - DoDiv #2 - bcs LSYM(Lthumb1_div_negative_loop) - DoDiv #1 - subs divisor, dividend, divisor - bcs 1f - mov divisor, dividend - -1: mov curbit, ip - adcs result, result - asrs curbit, curbit, #1 - mov dividend, result - bcc 2f - rsbs dividend, dividend, #0 - cmp curbit, #0 - -2: bpl 3f - rsbs divisor, divisor, #0 - -3: bx lr - -LSYM(Ldivbyzero_negative): - mov curbit, ip - asrs curbit, curbit, #1 - bcc LSYM(Ldiv0) - rsbs dividend, dividend, #0 -.endm - -/* ------------------------------------------------------------------------ */ -/* Start of the Real Functions */ -/* ------------------------------------------------------------------------ */ - - FUNC_START aeabi_idiv0 - bx lr - FUNC_END aeabi_idiv0 - - FUNC_START divsi3 - FUNC_ALIAS aeabi_idiv divsi3 - -LSYM(divsi3_skip_div0_test): - mov curbit, dividend - orrs curbit, divisor - bmi LSYM(Lthumb1_div_negative) - -LSYM(Lthumb1_div_positive): - THUMB1_Div_Positive - -LSYM(Lthumb1_div_negative): - THUMB1_Div_Negative - - DIV_FUNC_END divsi3 signed - - FUNC_START aeabi_idivmod - - cmp r1, #0 - beq LSYM(Ldiv0) - push {r0, r1, lr} - bl LSYM(divsi3_skip_div0_test) - POP {r1, r2, r3} - mul r2, r0 - sub r1, r1, r2 - bx r3 - - FUNC_END aeabi_idivmod -/* ------------------------------------------------------------------------ */ Index: contrib/cortex-strings/src/thumb/strcmp-armv6m.S =================================================================== --- contrib/cortex-strings/src/thumb/strcmp-armv6m.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2014 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Implementation of strcmp for ARMv6m. This version is only used in - ARMv6-M when we want an efficient implementation. Otherwize if the - code size is preferred, strcmp-armv4t.S will be used. */ - - .thumb_func - .syntax unified - .arch armv6-m - - .macro DoSub n, label - subs r0, r0, r1 -#ifdef __ARM_BIG_ENDIAN - lsrs r1, r4, \n -#else - lsls r1, r4, \n -#endif - orrs r1, r0 - bne \label - .endm - - .macro Byte_Test n, label - lsrs r0, r2, \n - lsrs r1, r3, \n - DoSub \n, \label - .endm - - .text - .p2align 0 - .global strcmp - .type strcmp, %function -strcmp: - .cfi_startproc - mov r2, r0 - push {r4, r5, r6, lr} - orrs r2, r1 - lsls r2, r2, #30 - bne 6f - ldr r5, =0x01010101 - lsls r6, r5, #7 -1: - ldmia r0!, {r2} - ldmia r1!, {r3} - subs r4, r2, r5 - bics r4, r2 - ands r4, r6 - beq 3f - -#ifdef __ARM_BIG_ENDIAN - Byte_Test #24, 4f - Byte_Test #16, 4f - Byte_Test #8, 4f - - b 7f -3: - cmp r2, r3 - beq 1b - cmp r2, r3 -#else - uxtb r0, r2 - uxtb r1, r3 - DoSub #24, 2f - - uxth r0, r2 - uxth r1, r3 - DoSub #16, 2f - - lsls r0, r2, #8 - lsls r1, r3, #8 - lsrs r0, r0, #8 - lsrs r1, r1, #8 - DoSub #8, 2f - - lsrs r0, r2, #24 - lsrs r1, r3, #24 - subs r0, r0, r1 -2: - pop {r4, r5, r6, pc} - -3: - cmp r2, r3 - beq 1b - rev r0, r2 - rev r1, r3 - cmp r0, r1 -#endif - - bls 5f - movs r0, #1 -4: - pop {r4, r5, r6, pc} -5: - movs r0, #0 - mvns r0, r0 - pop {r4, r5, r6, pc} -6: - ldrb r2, [r0, #0] - ldrb r3, [r1, #0] - adds r0, #1 - adds r1, #1 - cmp r2, #0 - beq 7f - cmp r2, r3 - bne 7f - ldrb r2, [r0, #0] - ldrb r3, [r1, #0] - adds r0, #1 - adds r1, #1 - cmp r2, #0 - beq 7f - cmp r2, r3 - beq 6b -7: - subs r0, r2, r3 - pop {r4, r5, r6, pc} - .cfi_endproc - .size strcmp, . - strcmp