Index: vendor/llvm/dist-release_70/CMakeLists.txt =================================================================== --- vendor/llvm/dist-release_70/CMakeLists.txt (revision 337298) +++ vendor/llvm/dist-release_70/CMakeLists.txt (revision 337299) @@ -1,1044 +1,1042 @@ # See docs/CMake.html for instructions about how to build LLVM with CMake. cmake_minimum_required(VERSION 3.4.3) cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0048 NEW) # CMake 3.1 and higher include generator expressions of the form # $ in the SOURCES property. These need to be # stripped everywhere that access the SOURCES property, so we just # defer to the OLD behavior of not including generator expressions # in the output for now. cmake_policy(SET CMP0051 OLD) cmake_policy(SET CMP0056 NEW) cmake_policy(SET CMP0057 NEW) if(POLICY CMP0068) cmake_policy(SET CMP0068 NEW) set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON) endif() if(NOT DEFINED LLVM_VERSION_MAJOR) set(LLVM_VERSION_MAJOR 7) endif() if(NOT DEFINED LLVM_VERSION_MINOR) set(LLVM_VERSION_MINOR 0) endif() if(NOT DEFINED LLVM_VERSION_PATCH) set(LLVM_VERSION_PATCH 0) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX svn) + set(LLVM_VERSION_SUFFIX "") endif() if (NOT PACKAGE_VERSION) set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}") endif() if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQUAL "")) message(WARNING "Visual Studio generators use the x86 host compiler by " "default, even for 64-bit targets. This can result in linker " "instability and out of memory errors. To use the 64-bit " "host compiler, pass -Thost=x64 on the CMake command line.") endif() project(LLVM VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH} LANGUAGES C CXX ASM) if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "No build type selected, default to Debug") set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type (default Debug)" FORCE) endif() # This should only apply if you are both on an Apple host, and targeting Apple. 
if(CMAKE_HOST_APPLE AND APPLE) # if CMAKE_LIBTOOL is not set, try and find it with xcrun or find_program if(NOT CMAKE_LIBTOOL) if(NOT CMAKE_XCRUN) find_program(CMAKE_XCRUN NAMES xcrun) endif() if(CMAKE_XCRUN) execute_process(COMMAND ${CMAKE_XCRUN} -find libtool OUTPUT_VARIABLE CMAKE_LIBTOOL OUTPUT_STRIP_TRAILING_WHITESPACE) endif() if(NOT CMAKE_LIBTOOL OR NOT EXISTS CMAKE_LIBTOOL) find_program(CMAKE_LIBTOOL NAMES libtool) endif() endif() get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) if(CMAKE_LIBTOOL) set(CMAKE_LIBTOOL ${CMAKE_LIBTOOL} CACHE PATH "libtool executable") message(STATUS "Found libtool - ${CMAKE_LIBTOOL}") execute_process(COMMAND ${CMAKE_LIBTOOL} -V OUTPUT_VARIABLE LIBTOOL_V_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE) if("${LIBTOOL_V_OUTPUT}" MATCHES ".*cctools-([0-9.]+).*") string(REGEX REPLACE ".*cctools-([0-9.]+).*" "\\1" LIBTOOL_VERSION ${LIBTOOL_V_OUTPUT}) if(NOT LIBTOOL_VERSION VERSION_LESS "862") set(LIBTOOL_NO_WARNING_FLAG "-no_warning_for_no_symbols") endif() endif() foreach(lang ${languages}) set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "\"${CMAKE_LIBTOOL}\" -static ${LIBTOOL_NO_WARNING_FLAG} -o \ ") endforeach() endif() # If DYLD_LIBRARY_PATH is set we need to set it on archiver commands if(DYLD_LIBRARY_PATH) set(dyld_envar "DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}") foreach(lang ${languages}) foreach(cmd ${CMAKE_${lang}_CREATE_STATIC_LIBRARY}) list(APPEND CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW "${dyld_envar} ${cmd}") endforeach() set(CMAKE_${lang}_CREATE_STATIC_LIBRARY ${CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW}) endforeach() endif() endif() # Side-by-side subprojects layout: automatically set the # LLVM_EXTERNAL_${project}_SOURCE_DIR using LLVM_ALL_PROJECTS # This allows an easy way of setting up a build directory for llvm and another # one for llvm+clang+... using the same sources. set(LLVM_ALL_PROJECTS "clang;libcxx;libcxxabi;lldb;compiler-rt;lld;polly;debuginfo-tests") set(LLVM_ENABLE_PROJECTS "" CACHE STRING "Semicolon-separated list of projects to build (${LLVM_ALL_PROJECTS}), or \"all\".") if( LLVM_ENABLE_PROJECTS STREQUAL "all" ) set( LLVM_ENABLE_PROJECTS ${LLVM_ALL_PROJECTS}) endif() foreach(proj ${LLVM_ENABLE_PROJECTS}) set(PROJ_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${proj}") if(NOT EXISTS "${PROJ_DIR}" OR NOT IS_DIRECTORY "${PROJ_DIR}") message(FATAL_ERROR "LLVM_ENABLE_PROJECTS requests ${proj} but directory not found: ${PROJ_DIR}") endif() string(TOUPPER "${proj}" upper_proj) STRING(REGEX REPLACE "-" "_" upper_proj ${upper_proj}) set(LLVM_EXTERNAL_${upper_proj}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${proj}") # There is a widely spread opinion that clang-tools-extra should be merged # into clang. The following simulates it by always enabling clang-tools-extra # when enabling clang. 
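# Illustrative sketch, not part of the upstream file: with the side-by-side
# checkout described above (llvm, clang, lld, ... as sibling directories), a
# multi-project build is typically configured along these lines; the generator,
# build directory and project selection are assumed examples only:
#
#   mkdir build && cd build
#   cmake -G Ninja -DLLVM_ENABLE_PROJECTS="clang;lld" ../llvm
#
# Every requested project must exist as a sibling of the llvm source tree or
# the foreach() check above stops with FATAL_ERROR. The if() block that follows
# implements the clang/clang-tools-extra coupling described in the preceding
# comment.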
if (proj STREQUAL "clang") set(LLVM_EXTERNAL_CLANG_TOOLS_EXTRA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../clang-tools-extra") endif() endforeach() # Build llvm with ccache if the package is present set(LLVM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if(LLVM_CCACHE_BUILD) find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) set(LLVM_CCACHE_MAXSIZE "" CACHE STRING "Size of ccache") set(LLVM_CCACHE_DIR "" CACHE STRING "Directory to keep ccached data") set(LLVM_CCACHE_PARAMS "CCACHE_CPP2=yes CCACHE_HASHDIR=yes" CACHE STRING "Parameters to pass through to ccache") set(CCACHE_PROGRAM "${LLVM_CCACHE_PARAMS} ${CCACHE_PROGRAM}") if (LLVM_CCACHE_MAXSIZE) set(CCACHE_PROGRAM "CCACHE_MAXSIZE=${LLVM_CCACHE_MAXSIZE} ${CCACHE_PROGRAM}") endif() if (LLVM_CCACHE_DIR) set(CCACHE_PROGRAM "CCACHE_DIR=${LLVM_CCACHE_DIR} ${CCACHE_PROGRAM}") endif() set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM}) else() message(FATAL_ERROR "Unable to find the program ccache. Set LLVM_CCACHE_BUILD to OFF") endif() endif() option(LLVM_DEPENDENCY_DEBUGGING "Dependency debugging mode to verify correctly expressed library dependencies (Darwin only)" OFF) # Some features of the LLVM build may be disallowed when dependency debugging is # enabled. In particular you cannot use ccache because we want to force compile # operations to always happen. if(LLVM_DEPENDENCY_DEBUGGING) if(NOT CMAKE_HOST_APPLE) message(FATAL_ERROR "Dependency debugging is only currently supported on Darwin hosts.") endif() if(LLVM_CCACHE_BUILD) message(FATAL_ERROR "Cannot enable dependency debugging while using ccache.") endif() endif() option(LLVM_ENABLE_DAGISEL_COV "Debug: Prints tablegen patterns that were used for selecting" OFF) option(LLVM_ENABLE_GISEL_COV "Enable collection of GlobalISel rule coverage" OFF) if(LLVM_ENABLE_GISEL_COV) set(LLVM_GISEL_COV_PREFIX "${CMAKE_BINARY_DIR}/gisel-coverage-" CACHE STRING "Provide a filename prefix to collect the GlobalISel rule coverage") endif() # Add path for custom modules set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" ) # Generate a CompilationDatabase (compile_commands.json file) for our build, # for use by clang_complete, YouCompleteMe, etc. set(CMAKE_EXPORT_COMPILE_COMMANDS 1) option(LLVM_INSTALL_BINUTILS_SYMLINKS "Install symlinks from the binutils tool names to the corresponding LLVM tools." OFF) option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OFF) option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF) option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON) if ( LLVM_USE_FOLDERS ) set_property(GLOBAL PROPERTY USE_FOLDERS ON) endif() include(VersionFromVCS) option(LLVM_APPEND_VC_REV "Embed the version control system revision id in LLVM" ON) set(PACKAGE_NAME LLVM) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "https://bugs.llvm.org/") set(BUG_REPORT_URL "${PACKAGE_BUGREPORT}" CACHE STRING "Default URL where bug reports are to be submitted.") # Configure CPack. 
set(CPACK_PACKAGE_INSTALL_DIRECTORY "LLVM") set(CPACK_PACKAGE_VENDOR "LLVM") set(CPACK_PACKAGE_VERSION_MAJOR ${LLVM_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${LLVM_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${LLVM_VERSION_PATCH}) set(CPACK_PACKAGE_VERSION ${PACKAGE_VERSION}) set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.TXT") set(CPACK_NSIS_COMPRESSOR "/SOLID lzma \r\n SetCompressorDictSize 32") if(WIN32 AND NOT UNIX) set(CPACK_PACKAGE_INSTALL_REGISTRY_KEY "LLVM") set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_logo.bmp") set(CPACK_NSIS_MUI_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_icon.ico") set(CPACK_NSIS_MUI_UNIICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_icon.ico") set(CPACK_NSIS_MODIFY_PATH "ON") set(CPACK_NSIS_ENABLE_UNINSTALL_BEFORE_INSTALL "ON") if( CMAKE_CL_64 ) set(CPACK_NSIS_INSTALL_ROOT "$PROGRAMFILES64") endif() endif() include(CPack) # Sanity check our source directory to make sure that we are not trying to # generate an in-source build (unless on MSVC_IDE, where it is ok), and to make # sure that we don't have any stray generated files lying around in the tree # (which would end up getting picked up by header search, instead of the correct # versions). if( CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR AND NOT MSVC_IDE ) message(FATAL_ERROR "In-source builds are not allowed. CMake would overwrite the makefiles distributed with LLVM. Please create a directory and run cmake from there, passing the path to this source directory as the last argument. This process created the file `CMakeCache.txt' and the directory `CMakeFiles'. Please delete them.") endif() if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR ) file(GLOB_RECURSE tablegenned_files_on_include_dir "${CMAKE_CURRENT_SOURCE_DIR}/include/llvm/*.gen") file(GLOB_RECURSE tablegenned_files_on_lib_dir "${CMAKE_CURRENT_SOURCE_DIR}/lib/Target/*.inc") if( tablegenned_files_on_include_dir OR tablegenned_files_on_lib_dir) message(FATAL_ERROR "Apparently there is a previous in-source build, probably as the result of running `configure' and `make' on ${CMAKE_CURRENT_SOURCE_DIR}. This may cause problems. The suspicious files are: ${tablegenned_files_on_lib_dir} ${tablegenned_files_on_include_dir} Please clean the source directory.") endif() endif() string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) if (CMAKE_BUILD_TYPE AND NOT uppercase_CMAKE_BUILD_TYPE MATCHES "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$") message(FATAL_ERROR "Invalid value for CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") endif() set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" ) set(LLVM_TOOLS_INSTALL_DIR "bin" CACHE STRING "Path for binary subdirectory (defaults to 'bin')") mark_as_advanced(LLVM_TOOLS_INSTALL_DIR) set(LLVM_UTILS_INSTALL_DIR "${LLVM_TOOLS_INSTALL_DIR}" CACHE STRING "Path to install LLVM utilities (enabled by LLVM_INSTALL_UTILS=ON) (defaults to LLVM_TOOLS_INSTALL_DIR)") mark_as_advanced(LLVM_UTILS_INSTALL_DIR) # They are used as destination of target generators. set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}) if(WIN32 OR CYGWIN) # DLL platform -- put DLLs into bin. set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) else() set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) endif() # Each of them corresponds to llvm-config's. 
set(LLVM_TOOLS_BINARY_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) # --bindir set(LLVM_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) # --libdir set(LLVM_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) # --src-root set(LLVM_MAIN_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/include ) # --includedir set(LLVM_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} ) # --prefix # Note: LLVM_CMAKE_PATH does not include generated files set(LLVM_CMAKE_PATH ${LLVM_MAIN_SRC_DIR}/cmake/modules) set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples) set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) # List of all targets to be built by default: set(LLVM_ALL_TARGETS AArch64 AMDGPU ARM BPF Hexagon Lanai Mips MSP430 NVPTX PowerPC Sparc SystemZ X86 XCore ) # List of targets with JIT support: set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ) set(LLVM_TARGETS_TO_BUILD "all" CACHE STRING "Semicolon-separated list of targets to build, or \"all\".") set(LLVM_EXPERIMENTAL_TARGETS_TO_BUILD "" CACHE STRING "Semicolon-separated list of experimental targets to build.") option(BUILD_SHARED_LIBS "Build all libraries as shared libraries instead of static" OFF) option(LLVM_ENABLE_BACKTRACES "Enable embedding backtraces on crash." ON) if(LLVM_ENABLE_BACKTRACES) set(ENABLE_BACKTRACES 1) endif() option(LLVM_ENABLE_CRASH_OVERRIDES "Enable crash overrides." ON) if(LLVM_ENABLE_CRASH_OVERRIDES) set(ENABLE_CRASH_OVERRIDES 1) endif() option(LLVM_ENABLE_FFI "Use libffi to call external functions from the interpreter" OFF) set(FFI_LIBRARY_DIR "" CACHE PATH "Additional directory, where CMake should search for libffi.so") set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should search for ffi.h or ffi/ffi.h") set(LLVM_TARGET_ARCH "host" CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.") option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON) set(LLVM_ENABLE_LIBXML2 "ON" CACHE STRING "Use libxml2 if available. Can be ON, OFF, or FORCE_ON") option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON) option(LLVM_ENABLE_LIBPFM "Use libpfm for performance counters if available." ON) option(LLVM_ENABLE_THREADS "Use threads if available." ON) option(LLVM_ENABLE_ZLIB "Use zlib for compression/decompression if available." ON) if( LLVM_TARGETS_TO_BUILD STREQUAL "all" ) set( LLVM_TARGETS_TO_BUILD ${LLVM_ALL_TARGETS} ) endif() set(LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD} ${LLVM_EXPERIMENTAL_TARGETS_TO_BUILD}) list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD) option(LLVM_ENABLE_PIC "Build Position-Independent Code" ON) option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON) option(LLVM_ENABLE_MODULES "Compile with C++ modules enabled." OFF) if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") option(LLVM_ENABLE_MODULE_DEBUGGING "Compile with -gmodules." ON) option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." OFF) else() option(LLVM_ENABLE_MODULE_DEBUGGING "Compile with -gmodules." OFF) option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." ON) endif() option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF) option(LLVM_ENABLE_CXX1Z "Compile with C++1z enabled." OFF) option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF) option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF) option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON) option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." 
OFF) option(LLVM_ENABLE_DUMP "Enable dump functions even when assertions are disabled" OFF) if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) option(LLVM_ENABLE_ASSERTIONS "Enable assertions" OFF) else() option(LLVM_ENABLE_ASSERTIONS "Enable assertions" ON) endif() option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF) set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING "Enable abi-breaking checks. Can be WITH_ASSERTS, FORCE_ON or FORCE_OFF.") option(LLVM_FORCE_USE_OLD_HOST_TOOLCHAIN "Set to ON to force using an old, unsupported host toolchain." OFF) option(LLVM_USE_INTEL_JITEVENTS "Use Intel JIT API to inform Intel(R) VTune(TM) Amplifier XE 2011 about JIT code" OFF) if( LLVM_USE_INTEL_JITEVENTS ) # Verify we are on a supported platform if( NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) message(FATAL_ERROR "Intel JIT API support is available on Linux and Windows only.") endif() endif( LLVM_USE_INTEL_JITEVENTS ) option(LLVM_USE_OPROFILE "Use opagent JIT interface to inform OProfile about JIT code" OFF) option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) option(LLVM_CODESIGNING_IDENTITY "Sign executables and dylibs with the given identity (Darwin Only)" OFF) # If enabled, verify we are on a platform that supports oprofile. if( LLVM_USE_OPROFILE ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) message(FATAL_ERROR "OProfile support is available on Linux only.") endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) endif( LLVM_USE_OPROFILE ) option(LLVM_USE_PERF "Use perf JIT interface to inform perf about JIT code" OFF) # If enabled, verify we are on a platform that supports perf. if( LLVM_USE_PERF ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) message(FATAL_ERROR "perf support is available on Linux only.") endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) endif( LLVM_USE_PERF ) set(LLVM_USE_SANITIZER "" CACHE STRING "Define the sanitizer used to build binaries and tests.") option(LLVM_OPTIMIZE_SANITIZED_BUILDS "Pass -O1 on debug sanitizer builds" ON) set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH "Path to fuzzing library for linking with fuzz targets") option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm." OFF) option(LLVM_POLLY_LINK_INTO_TOOLS "Statically link Polly into tools (if available)" ON) option(LLVM_POLLY_BUILD "Build LLVM with Polly" ON) if (EXISTS ${LLVM_MAIN_SRC_DIR}/tools/polly/CMakeLists.txt) set(POLLY_IN_TREE TRUE) elseif(LLVM_EXTERNAL_POLLY_SOURCE_DIR) set(POLLY_IN_TREE TRUE) else() set(POLLY_IN_TREE FALSE) endif() if (LLVM_POLLY_BUILD AND POLLY_IN_TREE) set(WITH_POLLY ON) else() set(WITH_POLLY OFF) endif() if (LLVM_POLLY_LINK_INTO_TOOLS AND WITH_POLLY) set(LINK_POLLY_INTO_TOOLS ON) else() set(LINK_POLLY_INTO_TOOLS OFF) endif() # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) # TODO: support other platforms and toolchains. option(LLVM_BUILD_32_BITS "Build 32 bits executables and libraries." OFF) endif() # Define the default arguments to use with 'lit', and an option for the user to # override. set(LIT_ARGS_DEFAULT "-sv") if (MSVC_IDE OR XCODE) set(LIT_ARGS_DEFAULT "${LIT_ARGS_DEFAULT} --no-progress-bar") endif() set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit") # On Win32 hosts, provide an option to specify the path to the GnuWin32 tools. 
if( WIN32 AND NOT CYGWIN ) set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools") endif() # Define options to control the inclusion and default build behavior for # components which may not strictly be necessary (tools, examples, and tests). # # This is primarily to support building smaller or faster project files. option(LLVM_INCLUDE_TOOLS "Generate build targets for the LLVM tools." ON) option(LLVM_BUILD_TOOLS "Build the LLVM tools. If OFF, just generate build targets." ON) option(LLVM_INCLUDE_UTILS "Generate build targets for the LLVM utils." ON) option(LLVM_BUILD_UTILS "Build LLVM utility binaries. If OFF, just generate build targets." ON) option(LLVM_INCLUDE_RUNTIMES "Generate build targets for the LLVM runtimes." ON) option(LLVM_BUILD_RUNTIMES "Build the LLVM runtimes. If OFF, just generate build targets." ON) option(LLVM_BUILD_RUNTIME "Build the LLVM runtime libraries." ON) option(LLVM_BUILD_EXAMPLES "Build the LLVM example programs. If OFF, just generate build targets." OFF) option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON) option(LLVM_BUILD_TESTS "Build LLVM unit tests. If OFF, just generate build targets." OFF) option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON) option(LLVM_INCLUDE_GO_TESTS "Include the Go bindings tests in test build targets." ON) option (LLVM_BUILD_DOCS "Build the llvm documentation." OFF) option (LLVM_INCLUDE_DOCS "Generate build targets for llvm documentation." ON) option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OFF) option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF) option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON) option (LLVM_ENABLE_BINDINGS "Build bindings." ON) set(LLVM_INSTALL_DOXYGEN_HTML_DIR "share/doc/llvm/doxygen-html" CACHE STRING "Doxygen-generated HTML documentation install directory") set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "share/doc/llvm/ocaml-html" CACHE STRING "OCamldoc-generated HTML documentation install directory") option (LLVM_BUILD_EXTERNAL_COMPILER_RT "Build compiler-rt as an external project." OFF) option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO "Show target and host info when tools are invoked with --version." ON) # You can configure which libraries from LLVM you want to include in the # shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited # list of LLVM components. All component names handled by llvm-config are valid. 
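# Illustrative sketch, not part of the upstream file: the component names below
# are assumed examples only; any names accepted by `llvm-config --components`
# are valid here. A trimmed libLLVM could be configured roughly as:
#
#   cmake -DLLVM_BUILD_LLVM_DYLIB=ON \
#         -DLLVM_DYLIB_COMPONENTS="core;support;irreader;bitreader" ../llvm
#
# The cache variable itself is declared just below, defaulting to "all".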
if(NOT DEFINED LLVM_DYLIB_COMPONENTS) set(LLVM_DYLIB_COMPONENTS "all" CACHE STRING "Semicolon-separated list of components to include in libLLVM, or \"all\".") endif() option(LLVM_LINK_LLVM_DYLIB "Link tools against the libllvm dynamic library" OFF) option(LLVM_BUILD_LLVM_C_DYLIB "Build libllvm-c re-export library (Darwin Only)" OFF) set(LLVM_BUILD_LLVM_DYLIB_default OFF) if(LLVM_LINK_LLVM_DYLIB OR LLVM_BUILD_LLVM_C_DYLIB) set(LLVM_BUILD_LLVM_DYLIB_default ON) endif() option(LLVM_BUILD_LLVM_DYLIB "Build libllvm dynamic library" ${LLVM_BUILD_LLVM_DYLIB_default}) - -option(LLVM_DYLIB_SYMBOL_VERSIONING OFF) option(LLVM_OPTIMIZED_TABLEGEN "Force TableGen to be built with optimization" OFF) if(CMAKE_CROSSCOMPILING OR (LLVM_OPTIMIZED_TABLEGEN AND (LLVM_ENABLE_ASSERTIONS OR CMAKE_CONFIGURATION_TYPES))) set(LLVM_USE_HOST_TOOLS ON) endif() if (MSVC_IDE AND NOT (MSVC_VERSION LESS 1900)) option(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION "Configure project to use Visual Studio native visualizers" TRUE) else() set(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION FALSE CACHE INTERNAL "For Visual Studio 2013, manually copy natvis files to Documents\\Visual Studio 2013\\Visualizers" FORCE) endif() if (LLVM_BUILD_INSTRUMENTED OR LLVM_BUILD_INSTRUMENTED_COVERAGE OR LLVM_ENABLE_IR_PGO) if(NOT LLVM_PROFILE_MERGE_POOL_SIZE) # A pool size of 1-2 is probably sufficient on a SSD. 3-4 should be fine # for spining disks. Anything higher may only help on slower mediums. set(LLVM_PROFILE_MERGE_POOL_SIZE "4") endif() if(NOT LLVM_PROFILE_FILE_PATTERN) if(NOT LLVM_PROFILE_DATA_DIR) file(TO_NATIVE_PATH "${LLVM_BINARY_DIR}/profiles" LLVM_PROFILE_DATA_DIR) endif() file(TO_NATIVE_PATH "${LLVM_PROFILE_DATA_DIR}/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN) endif() endif() if (LLVM_BUILD_STATIC) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static") endif() # Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV. set(LLVM_TARGET_TRIPLE_ENV CACHE STRING "The name of environment variable to override default target. Disabled by blank.") mark_as_advanced(LLVM_TARGET_TRIPLE_ENV) set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR OFF CACHE BOOL "Enable per-target runtimes directory") # All options referred to from HandleLLVMOptions have to be specified # BEFORE this include, otherwise options will not be correctly set on # first cmake run include(config-ix) string(REPLACE "Native" ${LLVM_NATIVE_ARCH} LLVM_TARGETS_TO_BUILD "${LLVM_TARGETS_TO_BUILD}") list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD) # By default, we target the host, but this can be overridden at CMake # invocation time. set(LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_HOST_TRIPLE}" CACHE STRING "Default target for which LLVM will generate code." ) set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}") message(STATUS "LLVM host triple: ${LLVM_HOST_TRIPLE}") message(STATUS "LLVM default target triple: ${LLVM_DEFAULT_TARGET_TRIPLE}") include(HandleLLVMOptions) # Verify that we can find a Python 2 interpreter. Python 3 is unsupported. # FIXME: We should support systems with only Python 3, but that requires work # on LLDB. set(Python_ADDITIONAL_VERSIONS 2.7) include(FindPythonInterp) if( NOT PYTHONINTERP_FOUND ) message(FATAL_ERROR "Unable to find Python interpreter, required for builds and testing. 
Please install Python or specify the PYTHON_EXECUTABLE CMake variable.") endif() if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 ) message(FATAL_ERROR "Python 2.7 or newer is required") endif() ###### # LLVMBuild Integration # # We use llvm-build to generate all the data required by the CMake based # build system in one swoop: # # - We generate a file (a CMake fragment) in the object root which contains # all the definitions that are required by CMake. # # - We generate the library table used by llvm-config. # # - We generate the dependencies for the CMake fragment, so that we will # automatically reconfigure outselves. set(LLVMBUILDTOOL "${LLVM_MAIN_SRC_DIR}/utils/llvm-build/llvm-build") set(LLVMCONFIGLIBRARYDEPENDENCIESINC "${LLVM_BINARY_DIR}/tools/llvm-config/LibraryDependencies.inc") set(LLVMBUILDCMAKEFRAG "${LLVM_BINARY_DIR}/LLVMBuild.cmake") # Create the list of optional components that are enabled if (LLVM_USE_INTEL_JITEVENTS) set(LLVMOPTIONALCOMPONENTS IntelJITEvents) endif (LLVM_USE_INTEL_JITEVENTS) if (LLVM_USE_OPROFILE) set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} OProfileJIT) endif (LLVM_USE_OPROFILE) if (LLVM_USE_PERF) set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} PerfJITEvents) endif (LLVM_USE_PERF) message(STATUS "Constructing LLVMBuild project information") execute_process( COMMAND ${PYTHON_EXECUTABLE} -B ${LLVMBUILDTOOL} --native-target "${LLVM_NATIVE_ARCH}" --enable-targets "${LLVM_TARGETS_TO_BUILD}" --enable-optional-components "${LLVMOPTIONALCOMPONENTS}" --write-library-table ${LLVMCONFIGLIBRARYDEPENDENCIESINC} --write-cmake-fragment ${LLVMBUILDCMAKEFRAG} OUTPUT_VARIABLE LLVMBUILDOUTPUT ERROR_VARIABLE LLVMBUILDERRORS OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE LLVMBUILDRESULT) # On Win32, CMake doesn't properly handle piping the default output/error # streams into the GUI console. So, we explicitly catch and report them. if( NOT "${LLVMBUILDOUTPUT}" STREQUAL "") message(STATUS "llvm-build output: ${LLVMBUILDOUTPUT}") endif() if( NOT "${LLVMBUILDRESULT}" STREQUAL "0" ) message(FATAL_ERROR "Unexpected failure executing llvm-build: ${LLVMBUILDERRORS}") endif() # Include the generated CMake fragment. This will define properties from the # LLVMBuild files in a format which is easy to consume from CMake, and will add # the dependencies so that CMake will reconfigure properly when the LLVMBuild # files change. include(${LLVMBUILDCMAKEFRAG}) ###### # Configure all of the various header file fragments LLVM uses which depend on # configuration variables. set(LLVM_ENUM_TARGETS "") set(LLVM_ENUM_ASM_PRINTERS "") set(LLVM_ENUM_ASM_PARSERS "") set(LLVM_ENUM_DISASSEMBLERS "") foreach(t ${LLVM_TARGETS_TO_BUILD}) set( td ${LLVM_MAIN_SRC_DIR}/lib/Target/${t} ) list(FIND LLVM_ALL_TARGETS ${t} idx) list(FIND LLVM_EXPERIMENTAL_TARGETS_TO_BUILD ${t} idy) # At this point, LLVMBUILDTOOL already checked all the targets passed in # LLVM_TARGETS_TO_BUILD and LLVM_EXPERIMENTAL_TARGETS_TO_BUILD, so # this test just makes sure that any experimental targets were passed via # LLVM_EXPERIMENTAL_TARGETS_TO_BUILD, not LLVM_TARGETS_TO_BUILD. 
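# Illustrative sketch, not part of the upstream file: a backend that is not in
# LLVM_ALL_TARGETS (for example AVR or WebAssembly in this release; both names
# are used here only as assumed examples) must be requested through the
# experimental variable, e.g.:
#
#   cmake -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=WebAssembly ../llvm
#
# Listing such a target in LLVM_TARGETS_TO_BUILD instead triggers the
# FATAL_ERROR check that follows.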
if( idx LESS 0 AND idy LESS 0 ) message(FATAL_ERROR "The target `${t}' is experimental and must be passed " "via LLVM_EXPERIMENTAL_TARGETS_TO_BUILD.") else() set(LLVM_ENUM_TARGETS "${LLVM_ENUM_TARGETS}LLVM_TARGET(${t})\n") endif() file(GLOB asmp_file "${td}/*AsmPrinter.cpp") if( asmp_file ) set(LLVM_ENUM_ASM_PRINTERS "${LLVM_ENUM_ASM_PRINTERS}LLVM_ASM_PRINTER(${t})\n") endif() if( EXISTS ${td}/AsmParser/CMakeLists.txt ) set(LLVM_ENUM_ASM_PARSERS "${LLVM_ENUM_ASM_PARSERS}LLVM_ASM_PARSER(${t})\n") endif() if( EXISTS ${td}/Disassembler/CMakeLists.txt ) set(LLVM_ENUM_DISASSEMBLERS "${LLVM_ENUM_DISASSEMBLERS}LLVM_DISASSEMBLER(${t})\n") endif() endforeach(t) # Produce the target definition files, which provide a way for clients to easily # include various classes of targets. configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/AsmPrinters.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/AsmPrinters.def ) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/AsmParsers.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/AsmParsers.def ) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/Disassemblers.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/Disassemblers.def ) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/Targets.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/Targets.def ) # Configure the three LLVM configuration header files. configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/config.h.cmake ${LLVM_INCLUDE_DIR}/llvm/Config/config.h) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/llvm-config.h.cmake ${LLVM_INCLUDE_DIR}/llvm/Config/llvm-config.h) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/abi-breaking.h.cmake ${LLVM_INCLUDE_DIR}/llvm/Config/abi-breaking.h) # Add target for generating source rpm package. set(LLVM_SRPM_USER_BINARY_SPECFILE ${CMAKE_CURRENT_SOURCE_DIR}/llvm.spec.in CACHE FILEPATH ".spec file to use for srpm generation") set(LLVM_SRPM_BINARY_SPECFILE ${CMAKE_CURRENT_BINARY_DIR}/llvm.spec) set(LLVM_SRPM_DIR "${CMAKE_CURRENT_BINARY_DIR}/srpm") # SVN_REVISION and GIT_COMMIT get set by the call to add_version_info_from_vcs. # DUMMY_VAR contains a version string which we don't care about. add_version_info_from_vcs(DUMMY_VAR) if ( SVN_REVISION ) set(LLVM_RPM_SPEC_REVISION "r${SVN_REVISION}") elseif ( GIT_COMMIT ) set (LLVM_RPM_SPEC_REVISION "g${GIT_COMMIT}") endif() configure_file( ${LLVM_SRPM_USER_BINARY_SPECFILE} ${LLVM_SRPM_BINARY_SPECFILE} @ONLY) add_custom_target(srpm COMMAND cpack -G TGZ --config CPackSourceConfig.cmake -B ${LLVM_SRPM_DIR}/SOURCES COMMAND rpmbuild -bs --define '_topdir ${LLVM_SRPM_DIR}' ${LLVM_SRPM_BINARY_SPECFILE}) set_target_properties(srpm PROPERTIES FOLDER "Misc") # They are not referenced. See set_output_directory(). set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin ) set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) if(APPLE AND DARWIN_LTO_LIBRARY) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") endif() # Work around a broken bfd ld behavior. When linking a binary with a # foo.so library, it will try to find any library that foo.so uses and # check its symbols. 
This is wasteful (the check was done when foo.so # was created) and can fail since it is not the dynamic linker and # doesn't know how to handle search paths correctly. if (UNIX AND NOT APPLE AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "SunOS|AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-allow-shlib-undefined") endif() set(CMAKE_INCLUDE_CURRENT_DIR ON) include_directories( ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) # when crosscompiling import the executable targets from a file if(LLVM_USE_HOST_TOOLS) include(CrossCompile) endif(LLVM_USE_HOST_TOOLS) if(LLVM_TARGET_IS_CROSSCOMPILE_HOST) # Dummy use to avoid CMake Warning: Manually-specified variables were not used # (this is a variable that CrossCompile sets on recursive invocations) endif() if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") # On FreeBSD, /usr/local/* is not used by default. In order to build LLVM # with libxml2, iconv.h, etc., we must add /usr/local paths. include_directories(SYSTEM "/usr/local/include") link_directories("/usr/local/lib") endif(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS ) # special hack for Solaris to handle crazy system sys/regset.h include_directories("${LLVM_MAIN_INCLUDE_DIR}/llvm/Support/Solaris") endif( ${CMAKE_SYSTEM_NAME} MATCHES SunOS ) # Make sure we don't get -rdynamic in every binary. For those that need it, # use export_executable_symbols(target). set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "") set(LLVM_PROFDATA_FILE "" CACHE FILEPATH "Profiling data file to use when compiling in order to improve runtime performance.") if(LLVM_PROFDATA_FILE AND EXISTS ${LLVM_PROFDATA_FILE}) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" ) add_definitions("-fprofile-instr-use=${LLVM_PROFDATA_FILE}") else() message(FATAL_ERROR "LLVM_PROFDATA_FILE can only be specified when compiling with clang") endif() endif() include(AddLLVM) include(TableGen) if( MINGW AND NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" ) # People report that -O3 is unreliable on MinGW. The traditional # build also uses -O2 for that reason: llvm_replace_compiler_option(CMAKE_CXX_FLAGS_RELEASE "-O3" "-O2") endif() # Put this before tblgen. Else we have a circular dependence. add_subdirectory(lib/Demangle) add_subdirectory(lib/Support) add_subdirectory(lib/TableGen) add_subdirectory(utils/TableGen) add_subdirectory(include/llvm) add_subdirectory(lib) if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/FileCheck) add_subdirectory(utils/PerfectShuffle) add_subdirectory(utils/count) add_subdirectory(utils/not) add_subdirectory(utils/yaml-bench) else() if ( LLVM_INCLUDE_TESTS ) message(FATAL_ERROR "Including tests when not building utils will not work. 
Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLDE_TESTS to Off.") endif() endif() # Use LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION instead of LLVM_INCLUDE_UTILS because it is not really a util if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION) add_subdirectory(utils/LLVMVisualizers) endif() foreach( binding ${LLVM_BINDINGS_LIST} ) if( EXISTS "${LLVM_MAIN_SRC_DIR}/bindings/${binding}/CMakeLists.txt" ) add_subdirectory(bindings/${binding}) endif() endforeach() add_subdirectory(projects) if( LLVM_INCLUDE_TOOLS ) add_subdirectory(tools) endif() if( LLVM_INCLUDE_RUNTIMES ) add_subdirectory(runtimes) endif() if( LLVM_INCLUDE_EXAMPLES ) add_subdirectory(examples) endif() if( LLVM_INCLUDE_TESTS ) if(EXISTS ${LLVM_MAIN_SRC_DIR}/projects/test-suite AND TARGET clang) include(LLVMExternalProjectUtils) llvm_ExternalProject_Add(test-suite ${LLVM_MAIN_SRC_DIR}/projects/test-suite USE_TOOLCHAIN EXCLUDE_FROM_ALL NO_INSTALL ALWAYS_CLEAN) endif() add_subdirectory(utils/lit) add_subdirectory(test) add_subdirectory(unittests) if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/unittest) endif() if (WIN32) # This utility is used to prevent crashing tests from calling Dr. Watson on # Windows. add_subdirectory(utils/KillTheDoctor) endif() # Add a global check rule now that all subdirectories have been traversed # and we know the total set of lit testsuites. get_property(LLVM_LIT_TESTSUITES GLOBAL PROPERTY LLVM_LIT_TESTSUITES) get_property(LLVM_LIT_PARAMS GLOBAL PROPERTY LLVM_LIT_PARAMS) get_property(LLVM_LIT_DEPENDS GLOBAL PROPERTY LLVM_LIT_DEPENDS) get_property(LLVM_LIT_EXTRA_ARGS GLOBAL PROPERTY LLVM_LIT_EXTRA_ARGS) get_property(LLVM_ADDITIONAL_TEST_TARGETS GLOBAL PROPERTY LLVM_ADDITIONAL_TEST_TARGETS) get_property(LLVM_ADDITIONAL_TEST_DEPENDS GLOBAL PROPERTY LLVM_ADDITIONAL_TEST_DEPENDS) add_lit_target(check-all "Running all regression tests" ${LLVM_LIT_TESTSUITES} PARAMS ${LLVM_LIT_PARAMS} DEPENDS ${LLVM_LIT_DEPENDS} ${LLVM_ADDITIONAL_TEST_TARGETS} ARGS ${LLVM_LIT_EXTRA_ARGS} ) if(TARGET check-runtimes) add_dependencies(check-all check-runtimes) endif() add_custom_target(test-depends DEPENDS ${LLVM_LIT_DEPENDS} ${LLVM_ADDITIONAL_TEST_DEPENDS}) set_target_properties(test-depends PROPERTIES FOLDER "Tests") endif() if (LLVM_INCLUDE_DOCS) add_subdirectory(docs) endif() add_subdirectory(cmake/modules) # Do this last so that all lit targets have already been created. if (LLVM_INCLUDE_UTILS) add_subdirectory(utils/llvm-lit) endif() if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) install(DIRECTORY include/llvm include/llvm-c DESTINATION include COMPONENT llvm-headers FILES_MATCHING PATTERN "*.def" PATTERN "*.h" PATTERN "*.td" PATTERN "*.inc" PATTERN "LICENSE.TXT" PATTERN ".svn" EXCLUDE ) install(DIRECTORY ${LLVM_INCLUDE_DIR}/llvm ${LLVM_INCLUDE_DIR}/llvm-c DESTINATION include COMPONENT llvm-headers FILES_MATCHING PATTERN "*.def" PATTERN "*.h" PATTERN "*.gen" PATTERN "*.inc" # Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def" PATTERN "CMakeFiles" EXCLUDE PATTERN "config.h" EXCLUDE PATTERN ".svn" EXCLUDE ) # Installing the headers needs to depend on generating any public # tablegen'd headers. add_custom_target(llvm-headers DEPENDS intrinsics_gen) set_target_properties(llvm-headers PROPERTIES FOLDER "Misc") if (NOT CMAKE_CONFIGURATION_TYPES) add_llvm_install_targets(install-llvm-headers DEPENDS llvm-headers COMPONENT llvm-headers) endif() endif() # This must be at the end of the LLVM root CMakeLists file because it must run # after all targets are created. 
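# Illustrative sketch, not part of the upstream file: the component list below
# is an assumed example; every entry must name an existing build target that
# also provides install-<name> and install-<name>-stripped targets, which is
# exactly what the block below verifies. With the Ninja generator, a small
# distribution could be driven roughly as:
#
#   cmake -DLLVM_DISTRIBUTION_COMPONENTS="llvm-headers;llvm-ar;llvm-ranlib" \
#         ../llvm
#   ninja distribution && ninja install-distribution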
if(LLVM_DISTRIBUTION_COMPONENTS) if(CMAKE_CONFIGURATION_TYPES) message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)") endif() add_custom_target(distribution) add_custom_target(install-distribution) add_custom_target(install-distribution-stripped) foreach(target ${LLVM_DISTRIBUTION_COMPONENTS} ${LLVM_RUNTIME_DISTRIBUTION_COMPONENTS}) if(TARGET ${target}) add_dependencies(distribution ${target}) else() message(SEND_ERROR "Specified distribution component '${target}' doesn't have a target") endif() if(TARGET install-${target}) add_dependencies(install-distribution install-${target}) else() message(SEND_ERROR "Specified distribution component '${target}' doesn't have an install target") endif() if(TARGET install-${target}-stripped) add_dependencies(install-distribution-stripped install-${target}-stripped) else() message(SEND_ERROR "Specified distribution component '${target}' doesn't have an install-stripped target." " Its installation target creation should be changed to use add_llvm_install_targets," " or you should manually create the 'install-${target}-stripped' target.") endif() endforeach() endif() # This allows us to deploy the Universal CRT DLLs by passing -DCMAKE_INSTALL_UCRT_LIBRARIES=ON to CMake if (MSVC AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") include(InstallRequiredSystemLibraries) endif() Index: vendor/llvm/dist-release_70/docs/ReleaseNotes.rst =================================================================== --- vendor/llvm/dist-release_70/docs/ReleaseNotes.rst (revision 337298) +++ vendor/llvm/dist-release_70/docs/ReleaseNotes.rst (revision 337299) @@ -1,206 +1,212 @@ ======================== LLVM 7.0.0 Release Notes ======================== .. contents:: :local: .. warning:: These are in-progress notes for the upcoming LLVM 7 release. Release notes for previous releases can be found on `the Download Page `_. Introduction ============ This document contains the release notes for the LLVM Compiler Infrastructure, release 7.0.0. Here we describe the status of LLVM, including major improvements from the previous release, improvements in various subprojects of LLVM, and some of the current users of the code. All LLVM releases may be downloaded from the `LLVM releases web site `_. For more information about LLVM, including information about the latest release, please check out the `main LLVM web site `_. If you have questions or comments, the `LLVM Developer's Mailing List `_ is a good place to send them. Note that if you are reading this file from a Subversion checkout or the main LLVM web page, this document applies to the *next* release, not the current one. To see the release notes for a specific release, please see the `releases page `_. Non-comprehensive list of changes in this release ================================================= .. NOTE For small 1-3 sentence descriptions, just add an entry at the end of this list. If your description won't fit comfortably in one bullet point (e.g. maybe you would like to give an example of the functionality, or simply have a lot to talk about), see the `NOTE` below for adding a new subsection. * Libraries have been renamed from 7.0 to 7. This change also impacts downstream libraries like lldb. * The LoopInstSimplify pass (-loop-instsimplify) has been removed. * Symbols starting with ``?`` are no longer mangled by LLVM when using the Windows ``x`` or ``w`` IR mangling schemes. * A new tool named :doc:`llvm-exegesis ` has been added. 
:program:`llvm-exegesis` automatically measures instruction scheduling properties (latency/uops) and provides a principled way to edit scheduling models. * A new tool named :doc:`llvm-mca ` has been added. :program:`llvm-mca` is a static performance analysis tool that uses information available in LLVM to statically predict the performance of machine code for a specific CPU. * The optimization flag to merge constants (-fmerge-all-constants) is no longer applied by default. * Optimization of floating-point casts is improved. This may cause surprising results for code that is relying on the undefined behavior of overflowing casts. The optimization can be disabled by specifying a function attribute: "strict-float-cast-overflow"="false". This attribute may be created by the - clang option :option:`-fno-strict-float-cast-overflow`. + clang option ``-fno-strict-float-cast-overflow``. Code sanitizers can be used to detect affected patterns. The option for detecting this problem alone is "-fsanitize=float-cast-overflow": .. code-block:: c int main() { float x = 4294967296.0f; x = (float)((int)x); printf("junk in the ftrunc: %f\n", x); return 0; } .. code-block:: bash clang -O1 ftrunc.c -fsanitize=float-cast-overflow ; ./a.out ftrunc.c:5:15: runtime error: 4.29497e+09 is outside the range of representable values of type 'int' junk in the ftrunc: 0.000000 * ``LLVM_ON_WIN32`` is no longer set by ``llvm/Config/config.h`` and ``llvm/Config/llvm-config.h``. If you used this macro, use the compiler-set ``_WIN32`` instead which is set exactly when ``LLVM_ON_WIN32`` used to be set. * The ``DEBUG`` macro has been renamed to ``LLVM_DEBUG``, the interface remains the same. If you used this macro you need to migrate to the new one. You should also clang-format your code to make it easier to integrate future changes locally. This can be done with the following bash commands: .. code-block:: bash git grep -l 'DEBUG' | xargs perl -pi -e 's/\bDEBUG\s?\(/LLVM_DEBUG(/g' git diff -U0 master | ../clang/tools/clang-format/clang-format-diff.py -i -p1 -style LLVM * Early support for UBsan, X-Ray instrumentation and libFuzzer (x86 and x86_64) for OpenBSD. Support for MSan (x86_64), X-Ray instrumentation and libFuzzer (x86 and x86_64) for FreeBSD. * ``SmallVector`` shrank from ``sizeof(void*) * 4 + sizeof(T)`` to ``sizeof(void*) + sizeof(unsigned) * 2``, smaller than ``std::vector`` on 64-bit platforms. The maximum capacity is now restricted to ``UINT32_MAX``. Since SmallVector doesn't have the exception-safety pessimizations some implementations saddle std::vector with and is better at using ``realloc``, it's now a better choice even on the heap (although when TinyPtrVector works, it's even smaller). + +* Preliminary/experimental support for DWARF v5 debugging information, + including the new .debug_names accelerator table. DWARF emitted at ``-O0`` + should be fully DWARF v5 compliant. Type units and split DWARF are known + not to be compliant, and higher optimization levels will still emit some + information in v4 format. * Note.. .. NOTE If you would like to document a larger change, then you can add a subsection about it right here. You can copy the following boilerplate and un-indent it (the indentation causes it to be inside this comment). Special New Feature ------------------- Makes programs 10x faster by doing Special New Thing. Changes to the LLVM IR ---------------------- * The signatures for the builtins @llvm.memcpy, @llvm.memmove, and @llvm.memset have changed. 
Alignment is no longer an argument; it is instead conveyed via parameter attributes.

* invariant.group.barrier has been renamed to launder.invariant.group.

* invariant.group metadata can now refer only to empty metadata nodes.

Changes to the ARM Backend
--------------------------

During this release ...

Changes to the MIPS Target
--------------------------

During this release ...

Changes to the PowerPC Target
-----------------------------

During this release ...

Changes to the X86 Target
-------------------------

During this release ...

Changes to the AMDGPU Target
-----------------------------

During this release ...

Changes to the AVR Target
-----------------------------

During this release ...

Changes to the OCaml bindings
-----------------------------

* Remove ``add_bb_vectorize``.

Changes to the C API
--------------------

* Remove ``LLVMAddBBVectorizePass``. The implementation was removed and the C
  interface was made a deprecated no-op in LLVM 5. Use
  ``LLVMAddSLPVectorizePass`` instead to get the supported SLP vectorizer.

Changes to the DAG infrastructure
---------------------------------

* ADDC/ADDE/SUBC/SUBE are now deprecated and will default to expand. Backends
  that wish to continue to use these opcodes should explicitly request this
  using ``setOperationAction`` in their ``TargetLowering``. New backends should
  use UADDO/ADDCARRY/USUBO/SUBCARRY instead of the deprecated opcodes.

* The SETCCE opcode has now been removed in favor of SETCCCARRY.

External Open Source Projects Using LLVM 7
==========================================

* A project...

Additional Information
======================

A wide variety of additional information is available on the `LLVM web page `_, in particular in the `documentation `_ section. The web page also contains versions of the API documentation which is up-to-date with the Subversion version of the source code. You can access versions of these documents specific to this release by going into the ``llvm/docs/`` directory in the LLVM tree.

If you have any questions or comments about LLVM, please feel free to contact us via the `mailing lists `_.

Index: vendor/llvm/dist-release_70/include/llvm/Support/DebugCounter.h
===================================================================
--- vendor/llvm/dist-release_70/include/llvm/Support/DebugCounter.h (revision 337298)
+++ vendor/llvm/dist-release_70/include/llvm/Support/DebugCounter.h (revision 337299)
@@ -1,169 +1,187 @@
//===- llvm/Support/DebugCounter.h - Debug counter support ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file provides an implementation of debug counters. Debug
/// counters are a tool that lets you narrow down a miscompilation to a specific
/// thing happening.
///
/// To give a use case: Imagine you have a file, very large, and you
/// are trying to understand the minimal transformation that breaks it. Bugpoint
/// and bisection are often helpful here in narrowing it down to a specific pass,
/// but it's still a very large file, and a very complicated pass to try to
/// debug. That is where debug counting steps in. You can instrument the pass
/// with a debug counter before it does a certain thing, and depending on the
/// counts, it will either execute that thing or not. The debug counter itself
/// consists of a skip and a count.
/// Skip is the number of times shouldExecute
/// needs to be called before it returns true. Count is the number of times to
/// return true once Skip is 0. So skip=47, count=2 would skip the first 47
/// executions by returning false from shouldExecute, then execute twice, and
/// then return false again.
/// Note that a counter set to a negative number will always execute.
/// For a concrete example, during predicateinfo creation, the renaming pass
/// replaces each use with a renamed use.
///
/// If I use DEBUG_COUNTER to create a counter called "predicateinfo", and
/// variable name RenameCounter, and then instrument this renaming with a debug
/// counter, like so:
///
/// if (!DebugCounter::shouldExecute(RenameCounter))
///
///
/// Now I can, from the command line, make it rename or not rename certain uses
/// by setting the skip and count.
/// So for example
/// bin/opt -debug-counter=predicateinfo-skip=47,predicateinfo-count=1
/// will skip renaming the first 47 uses, then rename one, then skip the rest.
//===----------------------------------------------------------------------===//

#ifndef LLVM_SUPPORT_DEBUGCOUNTER_H
#define LLVM_SUPPORT_DEBUGCOUNTER_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

namespace llvm {

class DebugCounter {
public:
  /// Returns a reference to the singleton instance.
  static DebugCounter &instance();

  // Used by the command line option parser to push a new value it parsed.
  void push_back(const std::string &);

  // Register a counter with the specified name.
  //
  // FIXME: Currently, counter registration is required to happen before command
  // line option parsing. The main reason to register counters is to produce a
  // nice list of them on the command line, but I'm not sure this is worth it.
  static unsigned registerCounter(StringRef Name, StringRef Desc) {
    return instance().addCounter(Name, Desc);
  }
  inline static bool shouldExecute(unsigned CounterName) {
-// Compile to nothing when debugging is off
-#ifdef NDEBUG
-    return true;
-#else
+    if (!isCountingEnabled())
+      return true;
+
    auto &Us = instance();
    auto Result = Us.Counters.find(CounterName);
    if (Result != Us.Counters.end()) {
      auto &CounterInfo = Result->second;
      ++CounterInfo.Count;

      // We only execute while Count is larger than Skip,
      // and no larger than StopAfter + Skip.
      // Negative counters always execute.
      if (CounterInfo.Skip < 0)
        return true;
      if (CounterInfo.Skip >= CounterInfo.Count)
        return false;
      if (CounterInfo.StopAfter < 0)
        return true;
      return CounterInfo.StopAfter + CounterInfo.Skip >= CounterInfo.Count;
    }
    // Didn't find the counter, should we warn?
    return true;
-#endif // NDEBUG
  }

  // Return true if a given counter had values set (either programmatically or on
  // the command line). This will return true even if those values are
  // currently in a state where the counter will always execute.
  static bool isCounterSet(unsigned ID) { return instance().Counters[ID].IsSet; }

  // Return the Count for a counter. This only works for set counters.
  static int64_t getCounterValue(unsigned ID) {
    auto &Us = instance();
    auto Result = Us.Counters.find(ID);
    assert(Result != Us.Counters.end() && "Asking about a non-set counter");
    return Result->second.Count;
  }

  // Set a registered counter to a given Count value.
static void setCounterValue(unsigned ID, int64_t Count) { auto &Us = instance(); Us.Counters[ID].Count = Count; } // Dump or print the current counter set into llvm::dbgs(). LLVM_DUMP_METHOD void dump() const; void print(raw_ostream &OS) const; // Get the counter ID for a given named counter, or return 0 if none is found. unsigned getCounterId(const std::string &Name) const { return RegisteredCounters.idFor(Name); } // Return the number of registered counters. unsigned int getNumCounters() const { return RegisteredCounters.size(); } // Return the name and description of the counter with the given ID. std::pair getCounterInfo(unsigned ID) const { return std::make_pair(RegisteredCounters[ID], Counters.lookup(ID).Desc); } // Iterate through the registered counters typedef UniqueVector CounterVector; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } CounterVector::const_iterator end() const { return RegisteredCounters.end(); } + // Force-enables counting all DebugCounters. + // + // Since DebugCounters are incompatible with threading (not only do they not + // make sense, but we'll also see data races), this should only be used in + // contexts where we're certain we won't spawn threads. + static void enableAllCounters() { instance().Enabled = true; } + private: + static bool isCountingEnabled() { +// Compile to nothing when debugging is off +#ifdef NDEBUG + return false; +#else + return instance().Enabled; +#endif + } + unsigned addCounter(const std::string &Name, const std::string &Desc) { unsigned Result = RegisteredCounters.insert(Name); Counters[Result] = {}; Counters[Result].Desc = Desc; return Result; } // Struct to store counter info. struct CounterInfo { int64_t Count = 0; int64_t Skip = 0; int64_t StopAfter = -1; bool IsSet = false; std::string Desc; }; DenseMap Counters; CounterVector RegisteredCounters; + + // Whether we should do DebugCounting at all. DebugCounters aren't + // thread-safe, so this should always be false in multithreaded scenarios. + bool Enabled = false; }; #define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC) \ static const unsigned VARNAME = \ DebugCounter::registerCounter(COUNTERNAME, DESC) } // namespace llvm #endif Index: vendor/llvm/dist-release_70/lib/Support/DebugCounter.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Support/DebugCounter.cpp (revision 337298) +++ vendor/llvm/dist-release_70/lib/Support/DebugCounter.cpp (revision 337299) @@ -1,113 +1,115 @@ #include "llvm/Support/DebugCounter.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Options.h" using namespace llvm; namespace { // This class overrides the default list implementation of printing so we // can pretty print the list of debug counter options. This type of // dynamic option is pretty rare (basically this and pass lists). class DebugCounterList : public cl::list { private: using Base = cl::list; public: template explicit DebugCounterList(Mods &&... Ms) : Base(std::forward(Ms)...) {} private: void printOptionInfo(size_t GlobalWidth) const override { // This is a variant of from generic_parser_base::printOptionInfo. Sadly, // it's not easy to make it more usable. We could get it to print these as // options if we were a cl::opt and registered them, but lists don't have // options, nor does the parser for std::string. 
The other mechanisms for // options are global and would pollute the global namespace with our // counters. Rather than go that route, we have just overridden the // printing, which only a few things call anyway. outs() << " -" << ArgStr; // All of the other options in CommandLine.cpp use ArgStr.size() + 6 for // width, so we do the same. Option::printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6); const auto &CounterInstance = DebugCounter::instance(); for (auto Name : CounterInstance) { const auto Info = CounterInstance.getCounterInfo(CounterInstance.getCounterId(Name)); size_t NumSpaces = GlobalWidth - Info.first.size() - 8; outs() << " =" << Info.first; outs().indent(NumSpaces) << " - " << Info.second << '\n'; } } }; } // namespace // Create our command line option. static DebugCounterList DebugCounterOption( "debug-counter", cl::Hidden, cl::desc("Comma separated list of debug counter skip and count"), cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance())); static ManagedStatic DC; DebugCounter &DebugCounter::instance() { return *DC; } // This is called by the command line parser when it sees a value for the // debug-counter option defined above. void DebugCounter::push_back(const std::string &Val) { if (Val.empty()) return; // The strings should come in as counter=value auto CounterPair = StringRef(Val).split('='); if (CounterPair.second.empty()) { errs() << "DebugCounter Error: " << Val << " does not have an = in it\n"; return; } // Now we have counter=value. // First, process value. int64_t CounterVal; if (CounterPair.second.getAsInteger(0, CounterVal)) { errs() << "DebugCounter Error: " << CounterPair.second << " is not a number\n"; return; } // Now we need to see if this is the skip or the count, remove the suffix, and // add it to the counter values. if (CounterPair.first.endswith("-skip")) { auto CounterName = CounterPair.first.drop_back(5); unsigned CounterID = getCounterId(CounterName); if (!CounterID) { errs() << "DebugCounter Error: " << CounterName << " is not a registered counter\n"; return; } + enableAllCounters(); Counters[CounterID].Skip = CounterVal; Counters[CounterID].IsSet = true; } else if (CounterPair.first.endswith("-count")) { auto CounterName = CounterPair.first.drop_back(6); unsigned CounterID = getCounterId(CounterName); if (!CounterID) { errs() << "DebugCounter Error: " << CounterName << " is not a registered counter\n"; return; } + enableAllCounters(); Counters[CounterID].StopAfter = CounterVal; Counters[CounterID].IsSet = true; } else { errs() << "DebugCounter Error: " << CounterPair.first << " does not end with -skip or -count\n"; } } void DebugCounter::print(raw_ostream &OS) const { OS << "Counters and values:\n"; for (const auto &KV : Counters) OS << left_justify(RegisteredCounters[KV.first], 32) << ": {" << KV.second.Count << "," << KV.second.Skip << "," << KV.second.StopAfter << "}\n"; } LLVM_DUMP_METHOD void DebugCounter::dump() const { print(dbgs()); } Index: vendor/llvm/dist-release_70/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- vendor/llvm/dist-release_70/lib/Target/AArch64/AArch64InstrFormats.td (revision 337298) +++ vendor/llvm/dist-release_70/lib/Target/AArch64/AArch64InstrFormats.td (revision 337299) @@ -1,10400 +1,10402 @@ //===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. 
See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Describe AArch64 instructions format here // // Format specifies the encoding used by the instruction. This is part of the // ad-hoc solution used to emit machine instruction encodings by our machine // code emitter. class Format val> { bits<2> Value = val; } def PseudoFrm : Format<0>; def NormalFrm : Format<1>; // Do we need any others? // AArch64 Instruction Format class AArch64Inst : Instruction { field bits<32> Inst; // Instruction encoding. // Mask of bits that cause an encoding to be UNPREDICTABLE. // If a bit is set, then if the corresponding bit in the // target encoding differs from its value in the "Inst" field, // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). field bits<32> Unpredictable = 0; // SoftFail is the generic name for this field, but we alias it so // as to make it more obvious what it means in ARM-land. field bits<32> SoftFail = Unpredictable; let Namespace = "AArch64"; Format F = f; bits<2> Form = F.Value; let Pattern = []; let Constraints = cstr; } class InstSubst : InstAlias, Requires<[UseNegativeImmediates]>; // Pseudo instructions (don't have encoding information) class Pseudo pattern, string cstr = ""> : AArch64Inst { dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; let isCodeGenOnly = 1; } // Real instructions (have encoding information) class EncodedI pattern> : AArch64Inst { let Pattern = pattern; let Size = 4; } // Enum describing whether an instruction is // destructive in its first source operand. class DestructiveInstTypeEnum val> { bits<1> Value = val; } def NotDestructive : DestructiveInstTypeEnum<0>; def Destructive : DestructiveInstTypeEnum<1>; // Normal instructions class I pattern> : EncodedI { dag OutOperandList = oops; dag InOperandList = iops; let AsmString = !strconcat(asm, operands); // Destructive operations (SVE) DestructiveInstTypeEnum DestructiveInstType = NotDestructive; ElementSizeEnum ElementSize = ElementSizeB; let TSFlags{3} = DestructiveInstType.Value; let TSFlags{2-0} = ElementSize.Value; } class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag : PatFrag<(ops node:$LHS), res>; // Helper fragment for an extract of the high portion of a 128-bit vector. def extract_high_v16i8 : UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; def extract_high_v8i16 : UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; def extract_high_v4i32 : UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; def extract_high_v2i64 : UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; //===----------------------------------------------------------------------===// // Asm Operand Classes. // // Shifter operand for arithmetic shifted encodings. def ShifterOperand : AsmOperandClass { let Name = "Shifter"; } // Shifter operand for mov immediate encodings. def MovImm32ShifterOperand : AsmOperandClass { let SuperClasses = [ShifterOperand]; let Name = "MovImm32Shifter"; let RenderMethod = "addShifterOperands"; let DiagnosticType = "InvalidMovImm32Shift"; } def MovImm64ShifterOperand : AsmOperandClass { let SuperClasses = [ShifterOperand]; let Name = "MovImm64Shifter"; let RenderMethod = "addShifterOperands"; let DiagnosticType = "InvalidMovImm64Shift"; } // Shifter operand for arithmetic register shifted encodings. 
class ArithmeticShifterOperand : AsmOperandClass { let SuperClasses = [ShifterOperand]; let Name = "ArithmeticShifter" # width; let PredicateMethod = "isArithmeticShifter<" # width # ">"; let RenderMethod = "addShifterOperands"; let DiagnosticType = "AddSubRegShift" # width; } def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; // Shifter operand for logical register shifted encodings. class LogicalShifterOperand : AsmOperandClass { let SuperClasses = [ShifterOperand]; let Name = "LogicalShifter" # width; let PredicateMethod = "isLogicalShifter<" # width # ">"; let RenderMethod = "addShifterOperands"; let DiagnosticType = "AddSubRegShift" # width; } def LogicalShifterOperand32 : LogicalShifterOperand<32>; def LogicalShifterOperand64 : LogicalShifterOperand<64>; // Shifter operand for logical vector 128/64-bit shifted encodings. def LogicalVecShifterOperand : AsmOperandClass { let SuperClasses = [ShifterOperand]; let Name = "LogicalVecShifter"; let RenderMethod = "addShifterOperands"; } def LogicalVecHalfWordShifterOperand : AsmOperandClass { let SuperClasses = [LogicalVecShifterOperand]; let Name = "LogicalVecHalfWordShifter"; let RenderMethod = "addShifterOperands"; } // The "MSL" shifter on the vector MOVI instruction. def MoveVecShifterOperand : AsmOperandClass { let SuperClasses = [ShifterOperand]; let Name = "MoveVecShifter"; let RenderMethod = "addShifterOperands"; } // Extend operand for arithmetic encodings. def ExtendOperand : AsmOperandClass { let Name = "Extend"; let DiagnosticType = "AddSubRegExtendLarge"; } def ExtendOperand64 : AsmOperandClass { let SuperClasses = [ExtendOperand]; let Name = "Extend64"; let DiagnosticType = "AddSubRegExtendSmall"; } // 'extend' that's a lsl of a 64-bit register. def ExtendOperandLSL64 : AsmOperandClass { let SuperClasses = [ExtendOperand]; let Name = "ExtendLSL64"; let RenderMethod = "addExtend64Operands"; let DiagnosticType = "AddSubRegExtendLarge"; } // 8-bit floating-point immediate encodings. def FPImmOperand : AsmOperandClass { let Name = "FPImm"; let ParserMethod = "tryParseFPImm"; let DiagnosticType = "InvalidFPImm"; } def CondCode : AsmOperandClass { let Name = "CondCode"; let DiagnosticType = "InvalidCondCode"; } // A 32-bit register pasrsed as 64-bit def GPR32as64Operand : AsmOperandClass { let Name = "GPR32as64"; let ParserMethod = "tryParseGPROperand"; } def GPR32as64 : RegisterOperand { let ParserMatchClass = GPR32as64Operand; } // A 64-bit register pasrsed as 32-bit def GPR64as32Operand : AsmOperandClass { let Name = "GPR64as32"; let ParserMethod = "tryParseGPROperand"; } def GPR64as32 : RegisterOperand { let ParserMatchClass = GPR64as32Operand; } // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh // are encoded as the eight bit value 'abcdefgh'. 
def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } class UImmScaledMemoryIndexed : AsmOperandClass { let Name = "UImm" # Width # "s" # Scale; let DiagnosticType = "InvalidMemoryIndexed" # Scale # "UImm" # Width; let RenderMethod = "addImmScaledOperands<" # Scale # ">"; let PredicateMethod = "isUImmScaled<" # Width # ", " # Scale # ">"; } class SImmScaledMemoryIndexed : AsmOperandClass { let Name = "SImm" # Width # "s" # Scale; let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm" # Width; let RenderMethod = "addImmScaledOperands<" # Scale # ">"; let PredicateMethod = "isSImmScaled<" # Width # ", " # Scale # ">"; } //===----------------------------------------------------------------------===// // Operand Definitions. // // ADR[P] instruction labels. def AdrpOperand : AsmOperandClass { let Name = "AdrpLabel"; let ParserMethod = "tryParseAdrpLabel"; let DiagnosticType = "InvalidLabel"; } def adrplabel : Operand { let EncoderMethod = "getAdrLabelOpValue"; let PrintMethod = "printAdrpLabel"; let ParserMatchClass = AdrpOperand; } def AdrOperand : AsmOperandClass { let Name = "AdrLabel"; let ParserMethod = "tryParseAdrLabel"; let DiagnosticType = "InvalidLabel"; } def adrlabel : Operand { let EncoderMethod = "getAdrLabelOpValue"; let ParserMatchClass = AdrOperand; } class SImmOperand : AsmOperandClass { let Name = "SImm" # width; let DiagnosticType = "InvalidMemoryIndexedSImm" # width; let RenderMethod = "addImmOperands"; let PredicateMethod = "isSImm<" # width # ">"; } // Authenticated loads for v8.3 can have scaled 10-bit immediate offsets. def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>; def simm10Scaled : Operand { let ParserMatchClass = SImm10s8Operand; let DecoderMethod = "DecodeSImm<10>"; let PrintMethod = "printImmScale<8>"; } // uimm6 predicate - True if the immediate is in the range [0, 63]. def UImm6Operand : AsmOperandClass { let Name = "UImm6"; let DiagnosticType = "InvalidImm0_63"; } def uimm6 : Operand, ImmLeaf= 0 && Imm < 64; }]> { let ParserMatchClass = UImm6Operand; } def SImm9Operand : SImmOperand<9>; def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { let ParserMatchClass = SImm9Operand; let DecoderMethod = "DecodeSImm<9>"; } def SImm8Operand : SImmOperand<8>; def simm8 : Operand, ImmLeaf= -128 && Imm < 127; }]> { let ParserMatchClass = SImm8Operand; let DecoderMethod = "DecodeSImm<8>"; } def SImm6Operand : SImmOperand<6>; def simm6_32b : Operand, ImmLeaf= -32 && Imm < 32; }]> { let ParserMatchClass = SImm6Operand; let DecoderMethod = "DecodeSImm<6>"; } def SImm5Operand : SImmOperand<5>; def simm5_64b : Operand, ImmLeaf= -16 && Imm < 16; }]> { let ParserMatchClass = SImm5Operand; let DecoderMethod = "DecodeSImm<5>"; } def simm5_32b : Operand, ImmLeaf= -16 && Imm < 16; }]> { let ParserMatchClass = SImm5Operand; let DecoderMethod = "DecodeSImm<5>"; } // simm7sN predicate - True if the immediate is a multiple of N in the range // [-64 * N, 63 * N]. 
def SImm7s4Operand : SImmScaledMemoryIndexed<7, 4>; def SImm7s8Operand : SImmScaledMemoryIndexed<7, 8>; def SImm7s16Operand : SImmScaledMemoryIndexed<7, 16>; def simm7s4 : Operand { let ParserMatchClass = SImm7s4Operand; let PrintMethod = "printImmScale<4>"; } def simm7s8 : Operand { let ParserMatchClass = SImm7s8Operand; let PrintMethod = "printImmScale<8>"; } def simm7s16 : Operand { let ParserMatchClass = SImm7s16Operand; let PrintMethod = "printImmScale<16>"; } def am_indexed7s8 : ComplexPattern; def am_indexed7s16 : ComplexPattern; def am_indexed7s32 : ComplexPattern; def am_indexed7s64 : ComplexPattern; def am_indexed7s128 : ComplexPattern; // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N]. def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>; def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>; def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>; def uimm5s2 : Operand, ImmLeaf= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> { let ParserMatchClass = UImm5s2Operand; let PrintMethod = "printImmScale<2>"; } def uimm5s4 : Operand, ImmLeaf= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> { let ParserMatchClass = UImm5s4Operand; let PrintMethod = "printImmScale<4>"; } def uimm5s8 : Operand, ImmLeaf= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> { let ParserMatchClass = UImm5s8Operand; let PrintMethod = "printImmScale<8>"; } // uimm6sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 64 * N]. def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>; def UImm6s2Operand : UImmScaledMemoryIndexed<6, 2>; def UImm6s4Operand : UImmScaledMemoryIndexed<6, 4>; def UImm6s8Operand : UImmScaledMemoryIndexed<6, 8>; def uimm6s1 : Operand, ImmLeaf= 0 && Imm < 64; }]> { let ParserMatchClass = UImm6s1Operand; } def uimm6s2 : Operand, ImmLeaf= 0 && Imm < (64*2) && ((Imm % 2) == 0); }]> { let PrintMethod = "printImmScale<2>"; let ParserMatchClass = UImm6s2Operand; } def uimm6s4 : Operand, ImmLeaf= 0 && Imm < (64*4) && ((Imm % 4) == 0); }]> { let PrintMethod = "printImmScale<4>"; let ParserMatchClass = UImm6s4Operand; } def uimm6s8 : Operand, ImmLeaf= 0 && Imm < (64*8) && ((Imm % 8) == 0); }]> { let PrintMethod = "printImmScale<8>"; let ParserMatchClass = UImm6s8Operand; } // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>; def simm6s1 : Operand, ImmLeaf= -32 && Imm < 32; }]> { let ParserMatchClass = SImm6s1Operand; let DecoderMethod = "DecodeSImm<6>"; } // simm4sN predicate - True if the immediate is a multiple of N in the range // [ -8* N, 7 * N]. 
def SImm4s1Operand : SImmScaledMemoryIndexed<4, 1>; def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>; def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>; def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>; def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>; def simm4s1 : Operand, ImmLeaf=-8 && Imm <= 7; }]> { let ParserMatchClass = SImm4s1Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s2 : Operand, ImmLeaf=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> { let PrintMethod = "printImmScale<2>"; let ParserMatchClass = SImm4s2Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s3 : Operand, ImmLeaf=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> { let PrintMethod = "printImmScale<3>"; let ParserMatchClass = SImm4s3Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s4 : Operand, ImmLeaf=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> { let PrintMethod = "printImmScale<4>"; let ParserMatchClass = SImm4s4Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s16 : Operand, ImmLeaf=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> { let PrintMethod = "printImmScale<16>"; let ParserMatchClass = SImm4s16Operand; let DecoderMethod = "DecodeSImm<4>"; } class AsmImmRange : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; let RenderMethod = "addImmOperands"; let PredicateMethod = "isImmInRange<" # Low # "," # High # ">"; } def Imm1_8Operand : AsmImmRange<1, 8>; def Imm1_16Operand : AsmImmRange<1, 16>; def Imm1_32Operand : AsmImmRange<1, 32>; def Imm1_64Operand : AsmImmRange<1, 64>; class BranchTarget : AsmOperandClass { let Name = "BranchTarget" # N; let DiagnosticType = "InvalidLabel"; let PredicateMethod = "isBranchTarget<" # N # ">"; } class PCRelLabel : BranchTarget { let Name = "PCRelLabel" # N; } def BranchTarget14Operand : BranchTarget<14>; def BranchTarget26Operand : BranchTarget<26>; def PCRelLabel19Operand : PCRelLabel<19>; def MovZSymbolG3AsmOperand : AsmOperandClass { let Name = "MovZSymbolG3"; let RenderMethod = "addImmOperands"; } def movz_symbol_g3 : Operand { let ParserMatchClass = MovZSymbolG3AsmOperand; } def MovZSymbolG2AsmOperand : AsmOperandClass { let Name = "MovZSymbolG2"; let RenderMethod = "addImmOperands"; } def movz_symbol_g2 : Operand { let ParserMatchClass = MovZSymbolG2AsmOperand; } def MovZSymbolG1AsmOperand : AsmOperandClass { let Name = "MovZSymbolG1"; let RenderMethod = "addImmOperands"; } def movz_symbol_g1 : Operand { let ParserMatchClass = MovZSymbolG1AsmOperand; } def MovZSymbolG0AsmOperand : AsmOperandClass { let Name = "MovZSymbolG0"; let RenderMethod = "addImmOperands"; } def movz_symbol_g0 : Operand { let ParserMatchClass = MovZSymbolG0AsmOperand; } def MovKSymbolG3AsmOperand : AsmOperandClass { let Name = "MovKSymbolG3"; let RenderMethod = "addImmOperands"; } def movk_symbol_g3 : Operand { let ParserMatchClass = MovKSymbolG3AsmOperand; } def MovKSymbolG2AsmOperand : AsmOperandClass { let Name = "MovKSymbolG2"; let RenderMethod = "addImmOperands"; } def movk_symbol_g2 : Operand { let ParserMatchClass = MovKSymbolG2AsmOperand; } def MovKSymbolG1AsmOperand : AsmOperandClass { let Name = "MovKSymbolG1"; let RenderMethod = "addImmOperands"; } def movk_symbol_g1 : Operand { let ParserMatchClass = MovKSymbolG1AsmOperand; } def MovKSymbolG0AsmOperand : AsmOperandClass { let Name = "MovKSymbolG0"; let RenderMethod = "addImmOperands"; } def movk_symbol_g0 : Operand { let ParserMatchClass = MovKSymbolG0AsmOperand; } class fixedpoint_i32 : Operand, ComplexPattern", [fpimm, ld]> { let EncoderMethod = 
"getFixedPointScaleOpValue"; let DecoderMethod = "DecodeFixedPointScaleImm32"; let ParserMatchClass = Imm1_32Operand; } class fixedpoint_i64 : Operand, ComplexPattern", [fpimm, ld]> { let EncoderMethod = "getFixedPointScaleOpValue"; let DecoderMethod = "DecodeFixedPointScaleImm64"; let ParserMatchClass = Imm1_64Operand; } def fixedpoint_f16_i32 : fixedpoint_i32; def fixedpoint_f32_i32 : fixedpoint_i32; def fixedpoint_f64_i32 : fixedpoint_i32; def fixedpoint_f16_i64 : fixedpoint_i64; def fixedpoint_f32_i64 : fixedpoint_i64; def fixedpoint_f64_i64 : fixedpoint_i64; def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); }]> { let EncoderMethod = "getVecShiftR8OpValue"; let DecoderMethod = "DecodeVecShiftR8Imm"; let ParserMatchClass = Imm1_8Operand; } def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); }]> { let EncoderMethod = "getVecShiftR16OpValue"; let DecoderMethod = "DecodeVecShiftR16Imm"; let ParserMatchClass = Imm1_16Operand; } def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); }]> { let EncoderMethod = "getVecShiftR16OpValue"; let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; let ParserMatchClass = Imm1_8Operand; } def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); }]> { let EncoderMethod = "getVecShiftR32OpValue"; let DecoderMethod = "DecodeVecShiftR32Imm"; let ParserMatchClass = Imm1_32Operand; } def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); }]> { let EncoderMethod = "getVecShiftR32OpValue"; let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; let ParserMatchClass = Imm1_16Operand; } def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); }]> { let EncoderMethod = "getVecShiftR64OpValue"; let DecoderMethod = "DecodeVecShiftR64Imm"; let ParserMatchClass = Imm1_64Operand; } def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); }]> { let EncoderMethod = "getVecShiftR64OpValue"; let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; let ParserMatchClass = Imm1_32Operand; } def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; def Imm0_63Operand : AsmImmRange<0, 63>; def vecshiftL8 : Operand, ImmLeaf { let EncoderMethod = "getVecShiftL8OpValue"; let DecoderMethod = "DecodeVecShiftL8Imm"; let ParserMatchClass = Imm0_7Operand; } def vecshiftL16 : Operand, ImmLeaf { let EncoderMethod = "getVecShiftL16OpValue"; let DecoderMethod = "DecodeVecShiftL16Imm"; let ParserMatchClass = Imm0_15Operand; } def vecshiftL32 : Operand, ImmLeaf { let EncoderMethod = "getVecShiftL32OpValue"; let DecoderMethod = "DecodeVecShiftL32Imm"; let ParserMatchClass = Imm0_31Operand; } def vecshiftL64 : Operand, ImmLeaf { let EncoderMethod = "getVecShiftL64OpValue"; let DecoderMethod = "DecodeVecShiftL64Imm"; let ParserMatchClass = Imm0_63Operand; } // Crazy immediate formats used by 32-bit and 64-bit logical immediate // instructions for splatting repeating bit patterns across the immediate. 
def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>; def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>; let DiagnosticType = "LogicalSecondSource" in { def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; let PredicateMethod = "isLogicalImm"; let RenderMethod = "addLogicalImmOperands"; } def LogicalImm64Operand : AsmOperandClass { let Name = "LogicalImm64"; let PredicateMethod = "isLogicalImm"; let RenderMethod = "addLogicalImmOperands"; } def LogicalImm32NotOperand : AsmOperandClass { let Name = "LogicalImm32Not"; let PredicateMethod = "isLogicalImm"; let RenderMethod = "addLogicalImmNotOperands"; } def LogicalImm64NotOperand : AsmOperandClass { let Name = "LogicalImm64Not"; let PredicateMethod = "isLogicalImm"; let RenderMethod = "addLogicalImmNotOperands"; } } def logical_imm32 : Operand, IntImmLeaf { let PrintMethod = "printLogicalImm"; let ParserMatchClass = LogicalImm32Operand; } def logical_imm64 : Operand, IntImmLeaf { let PrintMethod = "printLogicalImm"; let ParserMatchClass = LogicalImm64Operand; } def logical_imm32_not : Operand { let ParserMatchClass = LogicalImm32NotOperand; } def logical_imm64_not : Operand { let ParserMatchClass = LogicalImm64NotOperand; } // imm0_65535 predicate - True if the immediate is in the range [0,65535]. def Imm0_65535Operand : AsmImmRange<0, 65535>; def imm0_65535 : Operand, ImmLeaf { let ParserMatchClass = Imm0_65535Operand; let PrintMethod = "printImmHex"; } // imm0_255 predicate - True if the immediate is in the range [0,255]. def Imm0_255Operand : AsmImmRange<0,255>; def imm0_255 : Operand, ImmLeaf { let ParserMatchClass = Imm0_255Operand; let PrintMethod = "printImm"; } // imm0_127 predicate - True if the immediate is in the range [0,127] def Imm0_127Operand : AsmImmRange<0, 127>; def imm0_127 : Operand, ImmLeaf { let ParserMatchClass = Imm0_127Operand; let PrintMethod = "printImm"; } // NOTE: These imm0_N operands have to be of type i64 because i64 is the size // for all shift-amounts. 
// imm0_63 predicate - True if the immediate is in the range [0,63] def imm0_63 : Operand, ImmLeaf { let ParserMatchClass = Imm0_63Operand; } // imm0_31 predicate - True if the immediate is in the range [0,31] def imm0_31 : Operand, ImmLeaf { let ParserMatchClass = Imm0_31Operand; } // True if the 32-bit immediate is in the range [0,31] def imm32_0_31 : Operand, ImmLeaf { let ParserMatchClass = Imm0_31Operand; } // imm0_1 predicate - True if the immediate is in the range [0,1] def imm0_1 : Operand, ImmLeaf { let ParserMatchClass = Imm0_1Operand; } // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand, ImmLeaf { let ParserMatchClass = Imm0_15Operand; } // imm0_7 predicate - True if the immediate is in the range [0,7] def imm0_7 : Operand, ImmLeaf { let ParserMatchClass = Imm0_7Operand; } // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand, ImmLeaf { let ParserMatchClass = Imm0_15Operand; } // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr // {5-0} - imm6 class arith_shift : Operand { let PrintMethod = "printShifter"; let ParserMatchClass = !cast( "ArithmeticShifterOperand" # width); } def arith_shift32 : arith_shift; def arith_shift64 : arith_shift; class arith_shifted_reg : Operand, ComplexPattern { let PrintMethod = "printShiftedRegister"; let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); } def arith_shifted_reg32 : arith_shifted_reg; def arith_shifted_reg64 : arith_shifted_reg; // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror // {5-0} - imm6 class logical_shift : Operand { let PrintMethod = "printShifter"; let ParserMatchClass = !cast( "LogicalShifterOperand" # width); } def logical_shift32 : logical_shift<32>; def logical_shift64 : logical_shift<64>; class logical_shifted_reg : Operand, ComplexPattern { let PrintMethod = "printShiftedRegister"; let MIOperandInfo = (ops regclass, shiftop); } def logical_shifted_reg32 : logical_shifted_reg; def logical_shifted_reg64 : logical_shifted_reg; // A logical vector shifter operand: // {7-6} - shift type: 00 = lsl // {5-0} - imm6: #0, #8, #16, or #24 def logical_vec_shift : Operand { let PrintMethod = "printShifter"; let EncoderMethod = "getVecShifterOpValue"; let ParserMatchClass = LogicalVecShifterOperand; } // A logical vector half-word shifter operand: // {7-6} - shift type: 00 = lsl // {5-0} - imm6: #0 or #8 def logical_vec_hw_shift : Operand { let PrintMethod = "printShifter"; let EncoderMethod = "getVecShifterOpValue"; let ParserMatchClass = LogicalVecHalfWordShifterOperand; } // A vector move shifter operand: // {0} - imm1: #8 or #16 def move_vec_shift : Operand { let PrintMethod = "printShifter"; let EncoderMethod = "getMoveVecShifterOpValue"; let ParserMatchClass = MoveVecShifterOperand; } let DiagnosticType = "AddSubSecondSource" in { def AddSubImmOperand : AsmOperandClass { let Name = "AddSubImm"; let ParserMethod = "tryParseImmWithOptionalShift"; let RenderMethod = "addImmWithOptionalShiftOperands<12>"; } def AddSubImmNegOperand : AsmOperandClass { let Name = "AddSubImmNeg"; let ParserMethod = "tryParseImmWithOptionalShift"; let RenderMethod = "addImmNegWithOptionalShiftOperands<12>"; } } // An ADD/SUB immediate shifter operand: // second operand: // {7-6} - shift type: 00 = lsl // {5-0} - imm6: #0 or #12 class addsub_shifted_imm : Operand, ComplexPattern { let PrintMethod = "printAddSubImm"; let EncoderMethod = "getAddSubImmOpValue"; let 
ParserMatchClass = AddSubImmOperand; let MIOperandInfo = (ops i32imm, i32imm); } class addsub_shifted_imm_neg : Operand { let EncoderMethod = "getAddSubImmOpValue"; let ParserMatchClass = AddSubImmNegOperand; let MIOperandInfo = (ops i32imm, i32imm); } def addsub_shifted_imm32 : addsub_shifted_imm; def addsub_shifted_imm64 : addsub_shifted_imm; def addsub_shifted_imm32_neg : addsub_shifted_imm_neg; def addsub_shifted_imm64_neg : addsub_shifted_imm_neg; def gi_addsub_shifted_imm32 : GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_addsub_shifted_imm64 : GIComplexOperandMatcher, GIComplexPatternEquiv; class neg_addsub_shifted_imm : Operand, ComplexPattern { let PrintMethod = "printAddSubImm"; let EncoderMethod = "getAddSubImmOpValue"; let ParserMatchClass = AddSubImmOperand; let MIOperandInfo = (ops i32imm, i32imm); } def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; // An extend operand: // {5-3} - extend type // {2-0} - imm3 def arith_extend : Operand { let PrintMethod = "printArithExtend"; let ParserMatchClass = ExtendOperand; } def arith_extend64 : Operand { let PrintMethod = "printArithExtend"; let ParserMatchClass = ExtendOperand64; } // 'extend' that's a lsl of a 64-bit register. def arith_extendlsl64 : Operand { let PrintMethod = "printArithExtend"; let ParserMatchClass = ExtendOperandLSL64; } class arith_extended_reg32 : Operand, ComplexPattern { let PrintMethod = "printExtendedRegister"; let MIOperandInfo = (ops GPR32, arith_extend); } class arith_extended_reg32to64 : Operand, ComplexPattern { let PrintMethod = "printExtendedRegister"; let MIOperandInfo = (ops GPR32, arith_extend64); } // Floating-point immediate. def fpimm16 : Operand, FPImmLeafgetValueAPF(); uint32_t enc = AArch64_AM::getFP16Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } def fpimm32 : Operand, FPImmLeafgetValueAPF(); uint32_t enc = AArch64_AM::getFP32Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } def fpimm64 : Operand, FPImmLeafgetValueAPF(); uint32_t enc = AArch64_AM::getFP64Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } def fpimm8 : Operand { let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } def fpimm0 : FPImmLeaf; // Vector lane operands class AsmVectorIndex : AsmOperandClass { let Name = NamePrefix # "IndexRange" # Min # "_" # Max; let DiagnosticType = "Invalid" # Name; let PredicateMethod = "isVectorIndex<" # Min # ", " # Max # ">"; let RenderMethod = "addVectorIndexOperands"; } class AsmVectorIndexOpnd : Operand, ImmLeaf { let ParserMatchClass = mc; let PrintMethod = "printVectorIndex"; } def VectorIndex1Operand : AsmVectorIndex<1, 1>; def VectorIndexBOperand : AsmVectorIndex<0, 15>; def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; def VectorIndex1 : AsmVectorIndexOpnd; def VectorIndexB : AsmVectorIndexOpnd; def VectorIndexH : AsmVectorIndexOpnd; def VectorIndexS : AsmVectorIndexOpnd; def VectorIndexD : AsmVectorIndexOpnd; def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">; def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">; def SVEVectorIndexExtDupSOperand : 
AsmVectorIndex<0, 15, "SVE">; def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">; def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">; def sve_elm_idx_extdup_b : AsmVectorIndexOpnd; def sve_elm_idx_extdup_h : AsmVectorIndexOpnd; def sve_elm_idx_extdup_s : AsmVectorIndexOpnd; def sve_elm_idx_extdup_d : AsmVectorIndexOpnd; def sve_elm_idx_extdup_q : AsmVectorIndexOpnd; // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh // are encoded as the eight bit value 'abcdefgh'. def simdimmtype10 : Operand, FPImmLeafgetValueAPF(); uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() .bitcastToAPInt() .getZExtValue()); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>> { let ParserMatchClass = SIMDImmType10Operand; let PrintMethod = "printSIMDType10Operand"; } //--- // System management //--- // Base encoding for system instruction operands. let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class BaseSystemI pattern = []> : I { let Inst{31-22} = 0b1101010100; let Inst{21} = L; } // System instructions which do not have an Rt register. class SimpleSystemI pattern = []> : BaseSystemI { let Inst{4-0} = 0b11111; } // System instructions which have an Rt register. class RtSystemI : BaseSystemI, Sched<[WriteSys]> { bits<5> Rt; let Inst{4-0} = Rt; } // Hint instructions that take both a CRm and a 3-bit immediate. // NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot // model patterns with sufficiently fine granularity let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in class HintI : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "", [(int_aarch64_hint imm0_127:$imm)]>, Sched<[WriteHint]> { bits <7> imm; let Inst{20-12} = 0b000110010; let Inst{11-5} = imm; } // System instructions taking a single literal operand which encodes into // CRm. op2 differentiates the opcodes. def BarrierAsmOperand : AsmOperandClass { let Name = "Barrier"; let ParserMethod = "tryParseBarrierOperand"; } def barrier_op : Operand { let PrintMethod = "printBarrierOption"; let ParserMatchClass = BarrierAsmOperand; } class CRmSystemI opc, string asm, list pattern = []> : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>, Sched<[WriteBarrier]> { bits<4> CRm; let Inst{20-12} = 0b000110011; let Inst{11-8} = CRm; let Inst{7-5} = opc; } class SystemNoOperands op2, string asm, list pattern = []> : SimpleSystemI<0, (ins), asm, "", pattern>, Sched<[]> { bits<4> CRm; let CRm = 0b0011; let Inst{31-12} = 0b11010101000000110010; let Inst{11-8} = CRm; let Inst{7-5} = op2; let Inst{4-0} = 0b11111; } // MRS/MSR system instructions. These have different operand classes because // a different subset of registers can be accessed through each instruction. def MRSSystemRegisterOperand : AsmOperandClass { let Name = "MRSSystemRegister"; let ParserMethod = "tryParseSysReg"; let DiagnosticType = "MRS"; } // concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate. 
def mrs_sysreg_op : Operand { let ParserMatchClass = MRSSystemRegisterOperand; let DecoderMethod = "DecodeMRSSystemRegister"; let PrintMethod = "printMRSSystemRegister"; } def MSRSystemRegisterOperand : AsmOperandClass { let Name = "MSRSystemRegister"; let ParserMethod = "tryParseSysReg"; let DiagnosticType = "MSR"; } def msr_sysreg_op : Operand { let ParserMatchClass = MSRSystemRegisterOperand; let DecoderMethod = "DecodeMSRSystemRegister"; let PrintMethod = "printMSRSystemRegister"; } def PSBHintOperand : AsmOperandClass { let Name = "PSBHint"; let ParserMethod = "tryParsePSBHint"; } def psbhint_op : Operand { let ParserMatchClass = PSBHintOperand; let PrintMethod = "printPSBHintOp"; let MCOperandPredicate = [{ // Check, if operand is valid, to fix exhaustive aliasing in disassembly. // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. if (!MCOp.isImm()) return false; return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr; }]; } class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; let Inst{20-5} = systemreg; } // FIXME: Some of these def NZCV, others don't. Best way to model that? // Explicitly modeling each of the system register as a register class // would do it, but feels like overkill at this point. class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), "msr", "\t$systemreg, $Rt"> { bits<16> systemreg; let Inst{20-5} = systemreg; } def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { let Name = "SystemPStateFieldWithImm0_15"; let ParserMethod = "tryParseSysReg"; } def pstatefield4_op : Operand { let ParserMatchClass = SystemPStateFieldWithImm0_15Operand; let PrintMethod = "printSystemPStateField"; } let Defs = [NZCV] in class MSRpstateImm0_15 : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<6> pstatefield; bits<4> imm; let Inst{20-19} = 0b00; let Inst{18-16} = pstatefield{5-3}; let Inst{15-12} = 0b0100; let Inst{11-8} = imm; let Inst{7-5} = pstatefield{2-0}; let DecoderMethod = "DecodeSystemPStateInstruction"; // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns // Fail the decoder should attempt to decode the instruction as MSRI. let hasCompleteDecoder = 0; } def SystemPStateFieldWithImm0_1Operand : AsmOperandClass { let Name = "SystemPStateFieldWithImm0_1"; let ParserMethod = "tryParseSysReg"; } def pstatefield1_op : Operand { let ParserMatchClass = SystemPStateFieldWithImm0_1Operand; let PrintMethod = "printSystemPStateField"; } let Defs = [NZCV] in class MSRpstateImm0_1 : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<6> pstatefield; bit imm; let Inst{20-19} = 0b00; let Inst{18-16} = pstatefield{5-3}; let Inst{15-9} = 0b0100000; let Inst{8} = imm; let Inst{7-5} = pstatefield{2-0}; let DecoderMethod = "DecodeSystemPStateInstruction"; // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns // Fail the decoder should attempt to decode the instruction as MSRI. let hasCompleteDecoder = 0; } // SYS and SYSL generic system instructions. 
def SysCRAsmOperand : AsmOperandClass { let Name = "SysCR"; let ParserMethod = "tryParseSysCROperand"; } def sys_cr_op : Operand { let PrintMethod = "printSysCROperand"; let ParserMatchClass = SysCRAsmOperand; } class SystemXtI : RtSystemI { bits<3> op1; bits<4> Cn; bits<4> Cm; bits<3> op2; let Inst{20-19} = 0b01; let Inst{18-16} = op1; let Inst{15-12} = Cn; let Inst{11-8} = Cm; let Inst{7-5} = op2; } class SystemLXtI : RtSystemI { bits<3> op1; bits<4> Cn; bits<4> Cm; bits<3> op2; let Inst{20-19} = 0b01; let Inst{18-16} = op1; let Inst{15-12} = Cn; let Inst{11-8} = Cm; let Inst{7-5} = op2; } // Branch (register) instructions: // // case opc of // 0001 blr // 0000 br // 0101 dret // 0100 eret // 0010 ret // otherwise UNDEFINED class BaseBranchReg opc, dag oops, dag iops, string asm, string operands, list pattern> : I, Sched<[WriteBrReg]> { let Inst{31-25} = 0b1101011; let Inst{24-21} = opc; let Inst{20-16} = 0b11111; let Inst{15-10} = 0b000000; let Inst{4-0} = 0b00000; } class BranchReg opc, string asm, list pattern> : BaseBranchReg { bits<5> Rn; let Inst{9-5} = Rn; } let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in class SpecialReturn opc, string asm> : BaseBranchReg { let Inst{9-5} = 0b11111; } let mayLoad = 1 in class RCPCLoad sz, string asm, RegisterClass RC> : I<(outs RC:$Rt), (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]", "", []>, Sched<[]> { bits<5> Rn; bits<5> Rt; let Inst{31-30} = sz; let Inst{29-10} = 0b11100010111111110000; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } class AuthBase M, dag oops, dag iops, string asm, string operands, list pattern> : I, Sched<[]> { let Inst{31-25} = 0b1101011; let Inst{20-11} = 0b1111100001; let Inst{10} = M; let Inst{4-0} = 0b11111; } class AuthBranchTwoOperands op, bits<1> M, string asm> : AuthBase { bits<5> Rn; bits<5> Rm; let Inst{24-22} = 0b100; let Inst{21} = op; let Inst{9-5} = Rn; let Inst{4-0} = Rm; } class AuthOneOperand opc, bits<1> M, string asm> : AuthBase { bits<5> Rn; let Inst{24} = 0; let Inst{23-21} = opc; let Inst{9-5} = Rn; } class AuthReturn op, bits<1> M, string asm> : AuthBase { let Inst{24} = 0; let Inst{23-21} = op; let Inst{9-0} = 0b1111111111; } let mayLoad = 1 in class BaseAuthLoad : I, Sched<[]> { bits<10> offset; bits<5> Rn; bits<5> Rt; let Inst{31-24} = 0b11111000; let Inst{23} = M; let Inst{22} = offset{9}; let Inst{21} = 1; let Inst{20-12} = offset{8-0}; let Inst{11} = W; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } multiclass AuthLoad { def indexed : BaseAuthLoad; def writeback : BaseAuthLoad; def : InstAlias(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>; } //--- // Conditional branch instruction. //--- // Condition code. // 4-bit immediate. Pretty-printed as def ccode : Operand { let PrintMethod = "printCondCode"; let ParserMatchClass = CondCode; } def inv_ccode : Operand { // AL and NV are invalid in the aliases which use inv_ccode let PrintMethod = "printInverseCondCode"; let ParserMatchClass = CondCode; let MCOperandPredicate = [{ return MCOp.isImm() && MCOp.getImm() != AArch64CC::AL && MCOp.getImm() != AArch64CC::NV; }]; } // Conditional branch target. 19-bit immediate. The low two bits of the target // offset are implied zero and so are not part of the immediate. 
def am_brcond : Operand { let EncoderMethod = "getCondBranchTargetOpValue"; let DecoderMethod = "DecodePCRelLabel19"; let PrintMethod = "printAlignedLabel"; let ParserMatchClass = PCRelLabel19Operand; let OperandType = "OPERAND_PCREL"; } class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), "b", ".$cond\t$target", "", [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, Sched<[WriteBr]> { let isBranch = 1; let isTerminator = 1; let Uses = [NZCV]; bits<4> cond; bits<19> target; let Inst{31-24} = 0b01010100; let Inst{23-5} = target; let Inst{4} = 0; let Inst{3-0} = cond; } //--- // Compare-and-branch instructions. //--- class BaseCmpBranch : I<(outs), (ins regtype:$Rt, am_brcond:$target), asm, "\t$Rt, $target", "", [(node regtype:$Rt, bb:$target)]>, Sched<[WriteBr]> { let isBranch = 1; let isTerminator = 1; bits<5> Rt; bits<19> target; let Inst{30-25} = 0b011010; let Inst{24} = op; let Inst{23-5} = target; let Inst{4-0} = Rt; } multiclass CmpBranch { def W : BaseCmpBranch { let Inst{31} = 0; } def X : BaseCmpBranch { let Inst{31} = 1; } } //--- // Test-bit-and-branch instructions. //--- // Test-and-branch target. 14-bit sign-extended immediate. The low two bits of // the target offset are implied zero and so are not part of the immediate. def am_tbrcond : Operand { let EncoderMethod = "getTestBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; let ParserMatchClass = BranchTarget14Operand; let OperandType = "OPERAND_PCREL"; } // AsmOperand classes to emit (or not) special diagnostics def TBZImm0_31Operand : AsmOperandClass { let Name = "TBZImm0_31"; let PredicateMethod = "isImmInRange<0,31>"; let RenderMethod = "addImmOperands"; } def TBZImm32_63Operand : AsmOperandClass { let Name = "Imm32_63"; let PredicateMethod = "isImmInRange<32,63>"; let DiagnosticType = "InvalidImm0_63"; let RenderMethod = "addImmOperands"; } class tbz_imm0_31 : Operand, ImmLeaf { let ParserMatchClass = matcher; } def tbz_imm0_31_diag : tbz_imm0_31; def tbz_imm0_31_nodiag : tbz_imm0_31; def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); }]> { let ParserMatchClass = TBZImm32_63Operand; } class BaseTestBranch : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), asm, "\t$Rt, $bit_off, $target", "", [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, Sched<[WriteBr]> { let isBranch = 1; let isTerminator = 1; bits<5> Rt; bits<6> bit_off; bits<14> target; let Inst{30-25} = 0b011011; let Inst{24} = op; let Inst{23-19} = bit_off{4-0}; let Inst{18-5} = target; let Inst{4-0} = Rt; let DecoderMethod = "DecodeTestAndBranch"; } multiclass TestBranch { def W : BaseTestBranch { let Inst{31} = 0; } def X : BaseTestBranch { let Inst{31} = 1; } // Alias X-reg with 0-31 imm to W-Reg. def : InstAlias(NAME#"W") GPR32as64:$Rd, tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), tbz_imm0_31_diag:$imm, bb:$target)>; } //--- // Unconditional branch (immediate) instructions. 
//--- def am_b_target : Operand { let EncoderMethod = "getBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; let ParserMatchClass = BranchTarget26Operand; let OperandType = "OPERAND_PCREL"; } def am_bl_target : Operand { let EncoderMethod = "getBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; let ParserMatchClass = BranchTarget26Operand; let OperandType = "OPERAND_PCREL"; } class BImm pattern> : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { bits<26> addr; let Inst{31} = op; let Inst{30-26} = 0b00101; let Inst{25-0} = addr; let DecoderMethod = "DecodeUnconditionalBranch"; } class BranchImm pattern> : BImm; class CallImm pattern> : BImm; //--- // Basic one-operand data processing instructions. //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseOneOperandData opc, RegisterClass regtype, string asm, SDPatternOperator node> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", [(set regtype:$Rd, (node regtype:$Rn))]>, Sched<[WriteI, ReadI]> { bits<5> Rd; bits<5> Rn; let Inst{30-13} = 0b101101011000000000; let Inst{12-10} = opc; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass OneOperandData opc, string asm, SDPatternOperator node = null_frag> { def Wr : BaseOneOperandData { let Inst{31} = 0; } def Xr : BaseOneOperandData { let Inst{31} = 1; } } class OneWRegData opc, string asm, SDPatternOperator node> : BaseOneOperandData { let Inst{31} = 0; } class OneXRegData opc, string asm, SDPatternOperator node> : BaseOneOperandData { let Inst{31} = 1; } class SignAuthOneData opcode_prefix, bits<2> opcode, string asm> : I<(outs GPR64:$Rd), (ins GPR64sp:$Rn), asm, "\t$Rd, $Rn", "", []>, Sched<[WriteI, ReadI]> { bits<5> Rd; bits<5> Rn; let Inst{31-15} = 0b11011010110000010; let Inst{14-12} = opcode_prefix; let Inst{11-10} = opcode; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class SignAuthZero opcode_prefix, bits<2> opcode, string asm> : I<(outs GPR64:$Rd), (ins), asm, "\t$Rd", "", []>, Sched<[]> { bits<5> Rd; let Inst{31-15} = 0b11011010110000010; let Inst{14-12} = opcode_prefix; let Inst{11-10} = opcode; let Inst{9-5} = 0b11111; let Inst{4-0} = Rd; } class SignAuthTwoOperand opc, string asm, SDPatternOperator OpNode> : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64sp:$Rm), asm, "\t$Rd, $Rn, $Rm", "", [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64sp:$Rm))]>, Sched<[WriteI, ReadI, ReadI]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-21} = 0b10011010110; let Inst{20-16} = Rm; let Inst{15-14} = 0b00; let Inst{13-10} = opc; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation : I<(outs), iops, asm, ops, "", []>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; bits<5> Rn; let Inst{31} = sf; let Inst{30-15} = 0b0111010000000000; let Inst{14} = sz; let Inst{13-10} = 0b0010; let Inst{9-5} = Rn; let Inst{4-0} = 0b01101; } class FlagRotate : BaseFlagManipulation<0b1, 0b0, iops, asm, ops> { bits<6> imm; bits<4> mask; let Inst{20-15} = imm; let Inst{13-10} = 0b0001; let Inst{4} = 0b0; let Inst{3-0} = mask; } //--- // Basic two-operand data processing instructions. 
//--- class BaseBaseAddSubCarry pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", pattern>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{30} = isSub; let Inst{28-21} = 0b11010000; let Inst{20-16} = Rm; let Inst{15-10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class BaseAddSubCarry : BaseBaseAddSubCarry; class BaseAddSubCarrySetFlags : BaseBaseAddSubCarry { let Defs = [NZCV]; } multiclass AddSubCarry { def Wr : BaseAddSubCarry { let Inst{31} = 0; let Inst{29} = 0; } def Xr : BaseAddSubCarry { let Inst{31} = 1; let Inst{29} = 0; } // Sets flags. def SWr : BaseAddSubCarrySetFlags { let Inst{31} = 0; let Inst{29} = 1; } def SXr : BaseAddSubCarrySetFlags { let Inst{31} = 1; let Inst{29} = 1; } } class BaseTwoOperand opc, RegisterClass regtype, string asm, SDPatternOperator OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{30-21} = 0b0011010110; let Inst{20-16} = Rm; let Inst{15-14} = 0b00; let Inst{13-10} = opc; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class BaseDiv : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { let Inst{10} = isSigned; } multiclass Div { def Wr : BaseDiv, Sched<[WriteID32, ReadID, ReadID]> { let Inst{31} = 0; } def Xr : BaseDiv, Sched<[WriteID64, ReadID, ReadID]> { let Inst{31} = 1; } } class BaseShift shift_type, RegisterClass regtype, string asm, SDPatternOperator OpNode = null_frag> : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, Sched<[WriteIS, ReadI]> { let Inst{11-10} = shift_type; } multiclass Shift shift_type, string asm, SDNode OpNode> { def Wr : BaseShift { let Inst{31} = 0; } def Xr : BaseShift { let Inst{31} = 1; } def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), (!cast(NAME # "Wr") GPR32:$Rn, (EXTRACT_SUBREG i64:$Rm, sub_32))>; def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; } class ShiftAlias : InstAlias; class BaseMulAccum opc, RegisterClass multype, RegisterClass addtype, string asm, list pattern> : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<5> Ra; let Inst{30-24} = 0b0011011; let Inst{23-21} = opc; let Inst{20-16} = Rm; let Inst{15} = isSub; let Inst{14-10} = Ra; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass MulAccum { // MADD/MSUB generation is decided by MachineCombiner.cpp def Wrrr : BaseMulAccum, Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 0; } def Xrrr : BaseMulAccum, Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } } class WideMulAccum opc, string asm, SDNode AccNode, SDNode ExtNode> : BaseMulAccum, Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } class MulHi opc, string asm, SDNode OpNode> : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), asm, "\t$Rd, $Rn, $Rm", "", [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, Sched<[WriteIM64, ReadIM, ReadIM]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-24} = 0b10011011; let Inst{23-21} = opc; let Inst{20-16} = Rm; let Inst{15} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; // The Ra field of SMULH and UMULH is unused: it should be 
assembled as 31 // (i.e. all bits 1) but is ignored by the processor. let PostEncoderMethod = "fixMulHigh"; } class MulAccumWAlias : InstAlias; class MulAccumXAlias : InstAlias; class WideMulAccumAlias : InstAlias; class BaseCRC32 sz, bit C, RegisterClass StreamReg, SDPatternOperator OpNode, string asm> : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), asm, "\t$Rd, $Rn, $Rm", "", [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, Sched<[WriteISReg, ReadI, ReadISReg]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = sf; let Inst{30-21} = 0b0011010110; let Inst{20-16} = Rm; let Inst{15-13} = 0b010; let Inst{12} = C; let Inst{11-10} = sz; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let Predicates = [HasCRC]; } //--- // Address generation. //--- class ADRI pattern> : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", pattern>, Sched<[WriteI]> { bits<5> Xd; bits<21> label; let Inst{31} = page; let Inst{30-29} = label{1-0}; let Inst{28-24} = 0b10000; let Inst{23-5} = label{20-2}; let Inst{4-0} = Xd; let DecoderMethod = "DecodeAdrInstruction"; } //--- // Move immediate. //--- def movimm32_imm : Operand { let ParserMatchClass = Imm0_65535Operand; let EncoderMethod = "getMoveWideImmOpValue"; let PrintMethod = "printImm"; } def movimm32_shift : Operand { let PrintMethod = "printShifter"; let ParserMatchClass = MovImm32ShifterOperand; } def movimm64_shift : Operand { let PrintMethod = "printShifter"; let ParserMatchClass = MovImm64ShifterOperand; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, string asm> : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), asm, "\t$Rd, $imm$shift", "", []>, Sched<[WriteImm]> { bits<5> Rd; bits<16> imm; bits<6> shift; let Inst{30-29} = opc; let Inst{28-23} = 0b100101; let Inst{22-21} = shift{5-4}; let Inst{20-5} = imm; let Inst{4-0} = Rd; let DecoderMethod = "DecodeMoveImmInstruction"; } multiclass MoveImmediate opc, string asm> { def Wi : BaseMoveImmediate { let Inst{31} = 0; } def Xi : BaseMoveImmediate { let Inst{31} = 1; } } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, string asm> : I<(outs regtype:$Rd), (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, Sched<[WriteI, ReadI]> { bits<5> Rd; bits<16> imm; bits<6> shift; let Inst{30-29} = opc; let Inst{28-23} = 0b100101; let Inst{22-21} = shift{5-4}; let Inst{20-5} = imm; let Inst{4-0} = Rd; let DecoderMethod = "DecodeMoveImmInstruction"; } multiclass InsertImmediate opc, string asm> { def Wi : BaseInsertImmediate { let Inst{31} = 0; } def Xi : BaseInsertImmediate { let Inst{31} = 1; } } //--- // Add/Subtract //--- class BaseAddSubImm : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), asm, "\t$Rd, $Rn, $imm", "", [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, Sched<[WriteI, ReadI]> { bits<5> Rd; bits<5> Rn; bits<14> imm; let Inst{30} = isSub; let Inst{29} = setFlags; let Inst{28-24} = 0b10001; let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 let Inst{21-10} = imm{11-0}; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let DecoderMethod = "DecodeBaseAddSubImm"; } class BaseAddSubRegPseudo : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, Sched<[WriteI, ReadI, ReadI]>; class BaseAddSubSReg : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", 
[(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, Sched<[WriteISReg, ReadI, ReadISReg]> { // The operands are in order to match the 'addr' MI operands, so we // don't need an encoder method and by-name matching. Just use the default // in-order handling. Since we're using by-order, make sure the names // do not match. bits<5> dst; bits<5> src1; bits<5> src2; bits<8> shift; let Inst{30} = isSub; let Inst{29} = setFlags; let Inst{28-24} = 0b01011; let Inst{23-22} = shift{7-6}; let Inst{21} = 0; let Inst{20-16} = src2; let Inst{15-10} = shift{5-0}; let Inst{9-5} = src1; let Inst{4-0} = dst; let DecoderMethod = "DecodeThreeAddrSRegInstruction"; } class BaseAddSubEReg : I<(outs dstRegtype:$R1), (ins src1Regtype:$R2, src2Regtype:$R3), asm, "\t$R1, $R2, $R3", "", [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, Sched<[WriteIEReg, ReadI, ReadIEReg]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<6> ext; let Inst{30} = isSub; let Inst{29} = setFlags; let Inst{28-24} = 0b01011; let Inst{23-21} = 0b001; let Inst{20-16} = Rm; let Inst{15-13} = ext{5-3}; let Inst{12-10} = ext{2-0}; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let DecoderMethod = "DecodeAddSubERegInstruction"; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseAddSubEReg64 : I<(outs dstRegtype:$Rd), (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), asm, "\t$Rd, $Rn, $Rm$ext", "", []>, Sched<[WriteIEReg, ReadI, ReadIEReg]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<6> ext; let Inst{30} = isSub; let Inst{29} = setFlags; let Inst{28-24} = 0b01011; let Inst{23-21} = 0b001; let Inst{20-16} = Rm; let Inst{15} = ext{5}; let Inst{12-10} = ext{2-0}; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let DecoderMethod = "DecodeAddSubERegInstruction"; } // Aliases for register+register add/subtract. class AddSubRegAlias : InstAlias; multiclass AddSub { let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in { // Add/Subtract immediate // Increase the weight of the immediate variant to try to match it before // the extended register variant. // We used to match the register variant before the immediate when the // register argument could be implicitly zero-extended. let AddedComplexity = 6 in def Wri : BaseAddSubImm { let Inst{31} = 0; } let AddedComplexity = 6 in def Xri : BaseAddSubImm { let Inst{31} = 1; } // Add/Subtract register - Only used for CodeGen def Wrr : BaseAddSubRegPseudo; def Xrr : BaseAddSubRegPseudo; // Add/Subtract shifted register def Wrs : BaseAddSubSReg { let Inst{31} = 0; } def Xrs : BaseAddSubSReg { let Inst{31} = 1; } } // Add/Subtract extended register let AddedComplexity = 1, hasSideEffects = 0 in { def Wrx : BaseAddSubEReg, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg, mnemonic, OpNode> { let Inst{31} = 1; } } def Xrx64 : BaseAddSubEReg64 { // UXTX and SXTX only. let Inst{14-13} = 0b11; let Inst{31} = 1; } // add Rd, Rb, -imm -> sub Rd, Rn, imm def : InstSubst(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; def : InstSubst(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; // Register/register aliases with no shift when SP is not used. def : AddSubRegAlias(NAME#"Wrs"), GPR32, GPR32, GPR32, 0>; def : AddSubRegAlias(NAME#"Xrs"), GPR64, GPR64, GPR64, 0>; // Register/register aliases with no shift when either the destination or // first source register is SP. 
def : AddSubRegAlias(NAME#"Wrx"), GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 def : AddSubRegAlias(NAME#"Wrx"), GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 def : AddSubRegAlias(NAME#"Xrx64"), GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 def : AddSubRegAlias(NAME#"Xrx64"), GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 } multiclass AddSubS { let isCompare = 1, Defs = [NZCV] in { // Add/Subtract immediate def Wri : BaseAddSubImm { let Inst{31} = 0; } def Xri : BaseAddSubImm { let Inst{31} = 1; } // Add/Subtract register def Wrr : BaseAddSubRegPseudo; def Xrr : BaseAddSubRegPseudo; // Add/Subtract shifted register def Wrs : BaseAddSubSReg { let Inst{31} = 0; } def Xrs : BaseAddSubSReg { let Inst{31} = 1; } // Add/Subtract extended register let AddedComplexity = 1 in { def Wrx : BaseAddSubEReg, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg, mnemonic, OpNode> { let Inst{31} = 1; } } def Xrx64 : BaseAddSubEReg64 { // UXTX and SXTX only. let Inst{14-13} = 0b11; let Inst{31} = 1; } } // Defs = [NZCV] // Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm def : InstSubst(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; def : InstSubst(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; // Compare aliases def : InstAlias(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; def : InstAlias(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; def : InstAlias(NAME#"Wrx") WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; def : InstAlias(NAME#"Xrx") XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; def : InstAlias(NAME#"Xrx64") XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; def : InstAlias(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; def : InstAlias(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm def : InstSubst(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; def : InstSubst(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands def : InstAlias(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, 0), 5>; def : InstAlias(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, 0), 5>; def : InstAlias(NAME#"Wrx") WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; def : InstAlias(NAME#"Xrx64") XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. def : AddSubRegAlias(NAME#"Wrs"), GPR32, GPR32, GPR32, 0>; def : AddSubRegAlias(NAME#"Xrs"), GPR64, GPR64, GPR64, 0>; // Register/register aliases with no shift when the first source register // is SP. def : AddSubRegAlias(NAME#"Wrx"), GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 def : AddSubRegAlias(NAME#"Xrx64"), GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 } //--- // Extract //--- def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisPtrTy<3>]>; def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; class BaseExtractImm patterns> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, Sched<[WriteExtr, ReadExtrHi]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<6> imm; let Inst{30-23} = 0b00100111; let Inst{21} = 0; let Inst{20-16} = Rm; let Inst{15-10} = imm; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass ExtractImm { def Wrri : BaseExtractImm { let Inst{31} = 0; let Inst{22} = 0; // imm<5> must be zero. 
let imm{5} = 0; } def Xrri : BaseExtractImm { let Inst{31} = 1; let Inst{22} = 1; } } //--- // Bitfield //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseBitfieldImm opc, RegisterClass regtype, Operand imm_type, string asm> : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), asm, "\t$Rd, $Rn, $immr, $imms", "", []>, Sched<[WriteIS, ReadI]> { bits<5> Rd; bits<5> Rn; bits<6> immr; bits<6> imms; let Inst{30-29} = opc; let Inst{28-23} = 0b100110; let Inst{21-16} = immr; let Inst{15-10} = imms; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass BitfieldImm opc, string asm> { def Wri : BaseBitfieldImm { let Inst{31} = 0; let Inst{22} = 0; // imms<5> and immr<5> must be zero, else ReservedValue(). let Inst{21} = 0; let Inst{15} = 0; } def Xri : BaseBitfieldImm { let Inst{31} = 1; let Inst{22} = 1; } } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseBitfieldImmWith2RegArgs opc, RegisterClass regtype, Operand imm_type, string asm> : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, imm_type:$imms), asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, Sched<[WriteIS, ReadI]> { bits<5> Rd; bits<5> Rn; bits<6> immr; bits<6> imms; let Inst{30-29} = opc; let Inst{28-23} = 0b100110; let Inst{21-16} = immr; let Inst{15-10} = imms; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass BitfieldImmWith2RegArgs opc, string asm> { def Wri : BaseBitfieldImmWith2RegArgs { let Inst{31} = 0; let Inst{22} = 0; // imms<5> and immr<5> must be zero, else ReservedValue(). let Inst{21} = 0; let Inst{15} = 0; } def Xri : BaseBitfieldImmWith2RegArgs { let Inst{31} = 1; let Inst{22} = 1; } } //--- // Logical //--- // Logical (immediate) class BaseLogicalImm opc, RegisterClass dregtype, RegisterClass sregtype, Operand imm_type, string asm, list pattern> : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), asm, "\t$Rd, $Rn, $imm", "", pattern>, Sched<[WriteI, ReadI]> { bits<5> Rd; bits<5> Rn; bits<13> imm; let Inst{30-29} = opc; let Inst{28-23} = 0b100100; let Inst{22} = imm{12}; let Inst{21-16} = imm{11-6}; let Inst{15-10} = imm{5-0}; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let DecoderMethod = "DecodeLogicalImmInstruction"; } // Logical (shifted register) class BaseLogicalSReg opc, bit N, RegisterClass regtype, logical_shifted_reg shifted_regtype, string asm, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", pattern>, Sched<[WriteISReg, ReadI, ReadISReg]> { // The operands are in order to match the 'addr' MI operands, so we // don't need an encoder method and by-name matching. Just use the default // in-order handling. Since we're using by-order, make sure the names // do not match. bits<5> dst; bits<5> src1; bits<5> src2; bits<8> shift; let Inst{30-29} = opc; let Inst{28-24} = 0b01010; let Inst{23-22} = shift{7-6}; let Inst{21} = N; let Inst{20-16} = src2; let Inst{15-10} = shift{5-0}; let Inst{9-5} = src1; let Inst{4-0} = dst; let DecoderMethod = "DecodeThreeAddrSRegInstruction"; } // Aliases for register+register logical instructions. class LogicalRegAlias : InstAlias; multiclass LogicalImm opc, string mnemonic, SDNode OpNode, string Alias> { let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in def Wri : BaseLogicalImm { let Inst{31} = 0; let Inst{22} = 0; // 64-bit version has an additional bit of immediate. 
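
// BaseLogicalImm above packs a 13-bit N:immr:imms field, with bit 22 (the N
// bit) reserved to the 64-bit form.  Only "bitmask immediates" fit this
// scheme: a contiguous run of ones, rotated, then replicated across the
// register at an element size of 2, 4, 8, 16, 32 or 64 bits.  The following
// standalone C++ validity check is for illustration only (the idea, not the
// helper LLVM itself uses); all-zeros and all-ones are rejected because they
// are not encodable.
#include <cstdint>

static bool isEncodableLogicalImm(uint64_t V, unsigned RegSize) {
  if (RegSize == 32) {
    V &= 0xffffffffULL;
    V |= V << 32;                     // view as the replicated 64-bit pattern
  }
  if (V == 0 || V == ~0ULL)
    return false;
  // Find the smallest power-of-two period of the pattern.
  unsigned Size = 64;
  while (Size > 2) {
    unsigned Half = Size / 2;
    uint64_t HalfMask = (1ULL << Half) - 1;
    if ((V & HalfMask) != ((V >> Half) & HalfMask))
      break;
    Size = Half;
  }
  uint64_t EltMask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  uint64_t Elt = V & EltMask;
  // The element must be a rotated contiguous run of ones: either the element
  // or its complement within Size bits is a plain shifted mask.  (The
  // all-zero and all-one elements were already excluded above.)
  auto IsMask = [](uint64_t M) { return (M & (M + 1)) == 0; };
  auto IsShiftedMask = [&](uint64_t X) { return X != 0 && IsMask((X - 1) | X); };
  return IsShiftedMask(Elt) || IsShiftedMask(~Elt & EltMask);
}
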
} let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in def Xri : BaseLogicalImm { let Inst{31} = 1; } def : InstSubst(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; def : InstSubst(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } multiclass LogicalImmS opc, string mnemonic, SDNode OpNode, string Alias> { let isCompare = 1, Defs = [NZCV] in { def Wri : BaseLogicalImm { let Inst{31} = 0; let Inst{22} = 0; // 64-bit version has an additional bit of immediate. } def Xri : BaseLogicalImm { let Inst{31} = 1; } } // end Defs = [NZCV] def : InstSubst(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; def : InstSubst(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } class BaseLogicalRegPseudo : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, Sched<[WriteI, ReadI, ReadI]>; // Split from LogicalImm as not all instructions have both. multiclass LogicalReg opc, bit N, string mnemonic, SDPatternOperator OpNode> { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def Wrr : BaseLogicalRegPseudo; def Xrr : BaseLogicalRegPseudo; } def Wrs : BaseLogicalSReg { let Inst{31} = 0; } def Xrs : BaseLogicalSReg { let Inst{31} = 1; } def : LogicalRegAlias(NAME#"Wrs"), GPR32>; def : LogicalRegAlias(NAME#"Xrs"), GPR64>; } // Split from LogicalReg to allow setting NZCV Defs multiclass LogicalRegS opc, bit N, string mnemonic, SDPatternOperator OpNode = null_frag> { let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def Wrr : BaseLogicalRegPseudo; def Xrr : BaseLogicalRegPseudo; def Wrs : BaseLogicalSReg { let Inst{31} = 0; } def Xrs : BaseLogicalSReg { let Inst{31} = 1; } } // Defs = [NZCV] def : LogicalRegAlias(NAME#"Wrs"), GPR32>; def : LogicalRegAlias(NAME#"Xrs"), GPR64>; } //--- // Conditionally set flags //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseCondComparisonImm : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), mnemonic, "\t$Rn, $imm, $nzcv, $cond", "", [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; bits<5> Rn; bits<5> imm; bits<4> nzcv; bits<4> cond; let Inst{30} = op; let Inst{29-21} = 0b111010010; let Inst{20-16} = imm; let Inst{15-12} = cond; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4} = 0b0; let Inst{3-0} = nzcv; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseCondComparisonReg : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; bits<5> Rn; bits<5> Rm; bits<4> nzcv; bits<4> cond; let Inst{30} = op; let Inst{29-21} = 0b111010010; let Inst{20-16} = Rm; let Inst{15-12} = cond; let Inst{11-10} = 0b00; let Inst{9-5} = Rn; let Inst{4} = 0b0; let Inst{3-0} = nzcv; } multiclass CondComparison { // immediate operand variants def Wi : BaseCondComparisonImm { let Inst{31} = 0; } def Xi : BaseCondComparisonImm { let Inst{31} = 1; } // register operand variants def Wr : BaseCondComparisonReg { let Inst{31} = 0; } def Xr : BaseCondComparisonReg { let Inst{31} = 1; } } //--- // Conditional select //--- class BaseCondSelect op2, RegisterClass regtype, string asm> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), asm, 
"\t$Rd, $Rn, $Rm, $cond", "", [(set regtype:$Rd, (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<4> cond; let Inst{30} = op; let Inst{29-21} = 0b011010100; let Inst{20-16} = Rm; let Inst{15-12} = cond; let Inst{11-10} = op2; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass CondSelect op2, string asm> { def Wr : BaseCondSelect { let Inst{31} = 0; } def Xr : BaseCondSelect { let Inst{31} = 1; } } class BaseCondSelectOp op2, RegisterClass regtype, string asm, PatFrag frag> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), asm, "\t$Rd, $Rn, $Rm, $cond", "", [(set regtype:$Rd, (AArch64csel regtype:$Rn, (frag regtype:$Rm), (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<4> cond; let Inst{30} = op; let Inst{29-21} = 0b011010100; let Inst{20-16} = Rm; let Inst{15-12} = cond; let Inst{11-10} = op2; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N), MVT::i32); }]>; multiclass CondSelectOp op2, string asm, PatFrag frag> { def Wr : BaseCondSelectOp { let Inst{31} = 0; } def Xr : BaseCondSelectOp { let Inst{31} = 1; } def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, (inv_cond_XFORM imm:$cond))>; def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, (inv_cond_XFORM imm:$cond))>; } //--- // Special Mask Value //--- def maski8_or_more : Operand, ImmLeaf { } def maski16_or_more : Operand, ImmLeaf { } //--- // Load/store //--- // (unsigned immediate) // Indexed for 8-bit registers. offset is in range [0,4095]. 
def am_indexed8 : ComplexPattern; def am_indexed16 : ComplexPattern; def am_indexed32 : ComplexPattern; def am_indexed64 : ComplexPattern; def am_indexed128 : ComplexPattern; def gi_am_indexed8 : GIComplexOperandMatcher">, GIComplexPatternEquiv; def gi_am_indexed16 : GIComplexOperandMatcher">, GIComplexPatternEquiv; def gi_am_indexed32 : GIComplexOperandMatcher">, GIComplexPatternEquiv; def gi_am_indexed64 : GIComplexOperandMatcher">, GIComplexPatternEquiv; def gi_am_indexed128 : GIComplexOperandMatcher">, GIComplexPatternEquiv; class UImm12OffsetOperand : AsmOperandClass { let Name = "UImm12Offset" # Scale; let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; let PredicateMethod = "isUImm12Offset<" # Scale # ">"; let DiagnosticType = "InvalidMemoryIndexed" # Scale; } def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; class uimm12_scaled : Operand { let ParserMatchClass = !cast("UImm12OffsetScale" # Scale # "Operand"); let EncoderMethod = "getLdStUImm12OpValue"; let PrintMethod = "printUImm12Offset<" # Scale # ">"; } def uimm12s1 : uimm12_scaled<1>; def uimm12s2 : uimm12_scaled<2>; def uimm12s4 : uimm12_scaled<4>; def uimm12s8 : uimm12_scaled<8>; def uimm12s16 : uimm12_scaled<16>; class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, string asm, list pattern> : I { bits<5> Rt; bits<5> Rn; bits<12> offset; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b01; let Inst{23-22} = opc; let Inst{21-10} = offset; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodeUnsignedLdStInstruction"; } multiclass LoadUI sz, bit V, bits<2> opc, RegisterOperand regtype, Operand indextype, string asm, list pattern> { let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def ui : BaseLoadStoreUI, Sched<[WriteLD]>; def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } multiclass StoreUI sz, bit V, bits<2> opc, RegisterOperand regtype, Operand indextype, string asm, list pattern> { let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def ui : BaseLoadStoreUI, Sched<[WriteST]>; def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } // Same as StoreUI, but take a RegisterOperand. This is used by GlobalISel to // substitute zero-registers automatically. // // TODO: Roll out zero-register subtitution to GPR32/GPR64 and fold this back // into StoreUI. multiclass StoreUIz sz, bit V, bits<2> opc, RegisterOperand regtype, Operand indextype, string asm, list pattern> { let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def ui : BaseLoadStoreUI, Sched<[WriteST]>; def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } def PrefetchOperand : AsmOperandClass { let Name = "Prefetch"; let ParserMethod = "tryParsePrefetch"; } def prfop : Operand { let PrintMethod = "printPrefetchOp"; let ParserMatchClass = PrefetchOperand; } let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> : BaseLoadStoreUI, Sched<[WriteLD]>; //--- // Load literal //--- // Load literal address: 19-bit immediate. The low two bits of the target // offset are implied zero and so are not part of the immediate. 
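
// The UImm12OffsetScale*/uimm12s* operands above scale the unsigned 12-bit
// offset field by the access size, which is where the "[0,4095]" range in
// the comment further up comes from.  A minimal standalone C++ sketch of the
// encodability test follows; the fallback to the unscaled signed 9-bit form
// (the "ldur"/"stur" style instructions defined later in this file) is
// mentioned for context, and the helper names are invented for the example.
#include <cstdint>

// True if ByteOffset can use the scaled unsigned-offset form for an access
// of AccessBytes bytes; on success Encoded holds the 12-bit field value.
static bool isScaledUImm12(int64_t ByteOffset, unsigned AccessBytes,
                           uint64_t &Encoded) {
  if (ByteOffset < 0 || ByteOffset % AccessBytes != 0)
    return false;
  uint64_t Scaled = (uint64_t)ByteOffset / AccessBytes;
  if (Scaled > 4095)                    // 12-bit field: [0, 4095]
    return false;
  Encoded = Scaled;
  return true;
}

// The unscaled form takes any byte offset in the signed 9-bit range.
static bool isUnscaledImm9(int64_t ByteOffset) {
  return ByteOffset >= -256 && ByteOffset <= 255;
}
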
def am_ldrlit : Operand { let EncoderMethod = "getLoadLiteralOpValue"; let DecoderMethod = "DecodePCRelLabel19"; let PrintMethod = "printAlignedLabel"; let ParserMatchClass = PCRelLabel19Operand; let OperandType = "OPERAND_PCREL"; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in class LoadLiteral opc, bit V, RegisterOperand regtype, string asm> : I<(outs regtype:$Rt), (ins am_ldrlit:$label), asm, "\t$Rt, $label", "", []>, Sched<[WriteLD]> { bits<5> Rt; bits<19> label; let Inst{31-30} = opc; let Inst{29-27} = 0b011; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-5} = label; let Inst{4-0} = Rt; } let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class PrefetchLiteral opc, bit V, string asm, list pat> : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), asm, "\t$Rt, $label", "", pat>, Sched<[WriteLD]> { bits<5> Rt; bits<19> label; let Inst{31-30} = opc; let Inst{29-27} = 0b011; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-5} = label; let Inst{4-0} = Rt; } //--- // Load/store register offset //--- def ro_Xindexed8 : ComplexPattern", []>; def ro_Xindexed16 : ComplexPattern", []>; def ro_Xindexed32 : ComplexPattern", []>; def ro_Xindexed64 : ComplexPattern", []>; def ro_Xindexed128 : ComplexPattern", []>; def ro_Windexed8 : ComplexPattern", []>; def ro_Windexed16 : ComplexPattern", []>; def ro_Windexed32 : ComplexPattern", []>; def ro_Windexed64 : ComplexPattern", []>; def ro_Windexed128 : ComplexPattern", []>; class MemExtendOperand : AsmOperandClass { let Name = "Mem" # Reg # "Extend" # Width; let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; let RenderMethod = "addMemExtendOperands"; let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; } def MemWExtend8Operand : MemExtendOperand<"W", 8> { // The address "[x0, x1, lsl #0]" actually maps to the variant which performs // the trivial shift. let RenderMethod = "addMemExtend8Operands"; } def MemWExtend16Operand : MemExtendOperand<"W", 16>; def MemWExtend32Operand : MemExtendOperand<"W", 32>; def MemWExtend64Operand : MemExtendOperand<"W", 64>; def MemWExtend128Operand : MemExtendOperand<"W", 128>; def MemXExtend8Operand : MemExtendOperand<"X", 8> { // The address "[x0, x1, lsl #0]" actually maps to the variant which performs // the trivial shift. let RenderMethod = "addMemExtend8Operands"; } def MemXExtend16Operand : MemExtendOperand<"X", 16>; def MemXExtend32Operand : MemExtendOperand<"X", 32>; def MemXExtend64Operand : MemExtendOperand<"X", 64>; def MemXExtend128Operand : MemExtendOperand<"X", 128>; class ro_extend : Operand { let ParserMatchClass = ParserClass; let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; let DecoderMethod = "DecodeMemExtend"; let EncoderMethod = "getMemExtendOpValue"; let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); } def ro_Wextend8 : ro_extend; def ro_Wextend16 : ro_extend; def ro_Wextend32 : ro_extend; def ro_Wextend64 : ro_extend; def ro_Wextend128 : ro_extend; def ro_Xextend8 : ro_extend; def ro_Xextend16 : ro_extend; def ro_Xextend32 : ro_extend; def ro_Xextend64 : ro_extend; def ro_Xextend128 : ro_extend; class ROAddrMode { // CodeGen-level pattern covering the entire addressing mode. ComplexPattern Wpat = windex; ComplexPattern Xpat = xindex; // Asm-level Operand covering the valid "uxtw #3" style syntax. 
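
// am_ldrlit above is the 19-bit PC-relative label operand used by the
// load-literal and prefetch-literal classes; as the earlier comment notes,
// the low two bits of the target offset are implied zero, so only
// word-aligned offsets within roughly +/-1 MiB are reachable.  A standalone
// C++ sketch of the encode step, for illustration only:
#include <cstdint>

static bool encodeLoadLiteralOffset(int64_t ByteOffset, uint32_t &Field19) {
  if (ByteOffset % 4 != 0)
    return false;                      // low two bits are implied zero
  int64_t Words = ByteOffset / 4;
  if (Words < -(1 << 18) || Words > (1 << 18) - 1)
    return false;                      // must fit in a signed 19-bit field
  Field19 = (uint32_t)Words & 0x7ffff; // two's-complement, 19 bits
  return true;
}
// The reachable byte range is therefore [-1048576, 1048572] in steps of 4.
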
Operand Wext = wextend; Operand Xext = xextend; } def ro8 : ROAddrMode; def ro16 : ROAddrMode; def ro32 : ROAddrMode; def ro64 : ROAddrMode; def ro128 : ROAddrMode; class LoadStore8RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, dag ins, dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; bits<5> Rm; bits<2> extend; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15} = extend{1}; // sign extend Rm? let Inst{14} = 1; let Inst{12} = extend{0}; // do shift? let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } class ROInstAlias : InstAlias; multiclass Load8RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10 in def roW : LoadStore8RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in def roX : LoadStore8RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } multiclass Store8RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10 in def roW : LoadStore8RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in def roX : LoadStore8RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } class LoadStore16RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, dag ins, dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; bits<5> Rm; bits<2> extend; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15} = extend{1}; // sign extend Rm? let Inst{14} = 1; let Inst{12} = extend{0}; // do shift? let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } multiclass Load16RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10 in def roW : LoadStore16RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in def roX : LoadStore16RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } multiclass Store16RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10 in def roW : LoadStore16RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in def roX : LoadStore16RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } class LoadStore32RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, dag ins, dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; bits<5> Rm; bits<2> extend; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15} = extend{1}; // sign extend Rm? let Inst{14} = 1; let Inst{12} = extend{0}; // do shift? 
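
// In the LoadStore*RO classes above the two "extend" bits select the
// register-offset addressing options: extend{1} (placed in Inst{15}) chooses
// sign- versus zero-extension of the index register, extend{0} (Inst{12})
// says whether the index is shifted left by log2 of the access size, and
// Inst{13} distinguishes a W index (0, the roW defs) from an X index (1, the
// roX defs).  The standalone C++ sketch below only packs those three option
// bits, purely to illustrate the comments above; it is not a full encoder.
#include <cstdint>

struct RegOffsetAddr {
  bool Index64;     // X-register index rather than W-register?
  bool SignExtend;  // SXTW/SXTX rather than UXTW/LSL?
  bool DoShift;     // scale the index by the access size?
};

static uint32_t packRegOffsetOptionBits(const RegOffsetAddr &A) {
  uint32_t Bits = 0;
  Bits |= (uint32_t)A.SignExtend << 15;  // Inst{15} = extend{1}
  Bits |= (uint32_t)A.Index64    << 13;  // Inst{13}: 0 = roW, 1 = roX
  Bits |= (uint32_t)A.DoShift    << 12;  // Inst{12} = extend{0}
  return Bits;
}
// e.g. "[x0, w1, sxtw #3]" for an 8-byte access is {Index64=false,
// SignExtend=true, DoShift=true}.
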
let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } multiclass Load32RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10 in def roW : LoadStore32RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in def roX : LoadStore32RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } multiclass Store32RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10 in def roW : LoadStore32RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in def roX : LoadStore32RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } class LoadStore64RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, dag ins, dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; bits<5> Rm; bits<2> extend; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15} = extend{1}; // sign extend Rm? let Inst{14} = 1; let Inst{12} = extend{0}; // do shift? let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } multiclass Load64RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def roW : LoadStore64RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def roX : LoadStore64RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } multiclass Store64RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def roW : LoadStore64RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def roX : LoadStore64RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } class LoadStore128RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, dag ins, dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; bits<5> Rm; bits<2> extend; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15} = extend{1}; // sign extend Rm? let Inst{14} = 1; let Inst{12} = extend{0}; // do shift? 
let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } multiclass Load128RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def roW : LoadStore128RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def roX : LoadStore128RO, Sched<[WriteLDIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } multiclass Store128RO sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def roW : LoadStore128RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b0; } let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def roX : LoadStore128RO, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, string asm, list pat> : I, Sched<[WriteLD]> { bits<5> Rt; bits<5> Rn; bits<5> Rm; bits<2> extend; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15} = extend{1}; // sign extend Rm? let Inst{14} = 1; let Inst{12} = extend{0}; // do shift? let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; } multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { def roW : BasePrefetchRO { let Inst{13} = 0b0; } def roX : BasePrefetchRO { let Inst{13} = 0b1; } def : InstAlias<"prfm $Rt, [$Rn, $Rm]", (!cast(NAME # "roX") prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; } //--- // Load/store unscaled immediate //--- def am_unscaled8 : ComplexPattern; def am_unscaled16 : ComplexPattern; def am_unscaled32 : ComplexPattern; def am_unscaled64 : ComplexPattern; def am_unscaled128 :ComplexPattern; def gi_am_unscaled8 : GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_am_unscaled16 : GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_am_unscaled32 : GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_am_unscaled64 : GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_am_unscaled128 : GIComplexOperandMatcher, GIComplexPatternEquiv; class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, string asm, list pattern> : I { bits<5> Rt; bits<5> Rn; bits<9> offset; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 0; let Inst{20-12} = offset; let Inst{11-10} = 0b00; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodeSignedLdStInstruction"; } // Armv8.4 LDAPR & STLR with Immediate Offset instruction multiclass BaseLoadUnscaleV84 sz, bits<2> opc, RegisterOperand regtype > { def i : BaseLoadStoreUnscale, Sched<[WriteST]> { let Inst{29} = 0; let Inst{24} = 1; } def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } multiclass BaseStoreUnscaleV84 sz, bits<2> opc, RegisterOperand regtype > { def i : BaseLoadStoreUnscale, Sched<[WriteST]> { let Inst{29} = 0; let Inst{24} = 1; } def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, list pattern> { let AddedComplexity = 1 in // try this before LoadUI def i : BaseLoadStoreUnscale, Sched<[WriteLD]>; def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } multiclass 
StoreUnscaled sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, list pattern> { let AddedComplexity = 1 in // try this before StoreUI def i : BaseLoadStoreUnscale, Sched<[WriteST]>; def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, list pat> { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in def i : BaseLoadStoreUnscale, Sched<[WriteLD]>; def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; } //--- // Load/store unscaled immediate, unprivileged //--- class BaseLoadStoreUnprivileged sz, bit V, bits<2> opc, dag oops, dag iops, string asm> : I { bits<5> Rt; bits<5> Rn; bits<9> offset; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 0; let Inst{20-12} = offset; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodeSignedLdStInstruction"; } multiclass LoadUnprivileged sz, bit V, bits<2> opc, RegisterClass regtype, string asm> { let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in def i : BaseLoadStoreUnprivileged, Sched<[WriteLD]>; def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } multiclass StoreUnprivileged sz, bit V, bits<2> opc, RegisterClass regtype, string asm> { let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def i : BaseLoadStoreUnprivileged, Sched<[WriteST]>; def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } //--- // Load/store pre-indexed //--- class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, string asm, string cstr, list pat> : I { bits<5> Rt; bits<5> Rn; bits<9> offset; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0; let Inst{23-22} = opc; let Inst{21} = 0; let Inst{20-12} = offset; let Inst{11-10} = 0b11; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodeSignedLdStInstruction"; } let hasSideEffects = 0 in { let mayStore = 0, mayLoad = 1 in class LoadPreIdx sz, bit V, bits<2> opc, RegisterOperand regtype, string asm> : BaseLoadStorePreIdx, Sched<[WriteLD, WriteAdr]>; let mayStore = 1, mayLoad = 0 in class StorePreIdx sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, SDPatternOperator storeop, ValueType Ty> : BaseLoadStorePreIdx, Sched<[WriteAdr, WriteST]>; } // hasSideEffects = 0 //--- // Load/store post-indexed //--- class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, string asm, string cstr, list pat> : I { bits<5> Rt; bits<5> Rn; bits<9> offset; let Inst{31-30} = sz; let Inst{29-27} = 0b111; let Inst{26} = V; let Inst{25-24} = 0b00; let Inst{23-22} = opc; let Inst{21} = 0b0; let Inst{20-12} = offset; let Inst{11-10} = 0b01; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodeSignedLdStInstruction"; } let hasSideEffects = 0 in { let mayStore = 0, mayLoad = 1 in class LoadPostIdx sz, bit V, bits<2> opc, RegisterOperand regtype, string asm> : BaseLoadStorePostIdx, Sched<[WriteLD, WriteAdr]>; let mayStore = 1, mayLoad = 0 in class StorePostIdx sz, bit V, bits<2> opc, RegisterOperand regtype, string asm, SDPatternOperator storeop, ValueType Ty> : BaseLoadStorePostIdx, Sched<[WriteAdr, WriteST]>; } // hasSideEffects = 0 //--- // Load/store pair //--- // (indexed, offset) class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, string asm> : I { bits<5> Rt; bits<5> Rt2; bits<5> Rn; bits<7> offset; let Inst{31-30} = opc; let Inst{29-27} = 0b101; let Inst{26} = V; let Inst{25-23} = 0b010; let Inst{22} = L; let Inst{21-15} = 
offset; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodePairLdStInstruction"; } multiclass LoadPairOffset opc, bit V, RegisterOperand regtype, Operand indextype, string asm> { let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in def i : BaseLoadStorePairOffset, Sched<[WriteLD, WriteLDHi]>; def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } multiclass StorePairOffset opc, bit V, RegisterOperand regtype, Operand indextype, string asm> { let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in def i : BaseLoadStorePairOffset, Sched<[WriteSTP]>; def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } // (pre-indexed) class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, string asm> : I { bits<5> Rt; bits<5> Rt2; bits<5> Rn; bits<7> offset; let Inst{31-30} = opc; let Inst{29-27} = 0b101; let Inst{26} = V; let Inst{25-23} = 0b011; let Inst{22} = L; let Inst{21-15} = offset; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodePairLdStInstruction"; } let hasSideEffects = 0 in { let mayStore = 0, mayLoad = 1 in class LoadPairPreIdx opc, bit V, RegisterOperand regtype, Operand indextype, string asm> : BaseLoadStorePairPreIdx, Sched<[WriteLD, WriteLDHi, WriteAdr]>; let mayStore = 1, mayLoad = 0 in class StorePairPreIdx opc, bit V, RegisterOperand regtype, Operand indextype, string asm> : BaseLoadStorePairPreIdx, Sched<[WriteAdr, WriteSTP]>; } // hasSideEffects = 0 // (post-indexed) class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, string asm> : I { bits<5> Rt; bits<5> Rt2; bits<5> Rn; bits<7> offset; let Inst{31-30} = opc; let Inst{29-27} = 0b101; let Inst{26} = V; let Inst{25-23} = 0b001; let Inst{22} = L; let Inst{21-15} = offset; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodePairLdStInstruction"; } let hasSideEffects = 0 in { let mayStore = 0, mayLoad = 1 in class LoadPairPostIdx opc, bit V, RegisterOperand regtype, Operand idxtype, string asm> : BaseLoadStorePairPostIdx, Sched<[WriteLD, WriteLDHi, WriteAdr]>; let mayStore = 1, mayLoad = 0 in class StorePairPostIdx opc, bit V, RegisterOperand regtype, Operand idxtype, string asm> : BaseLoadStorePairPostIdx, Sched<[WriteAdr, WriteSTP]>; } // hasSideEffects = 0 // (no-allocate) class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, string asm> : I { bits<5> Rt; bits<5> Rt2; bits<5> Rn; bits<7> offset; let Inst{31-30} = opc; let Inst{29-27} = 0b101; let Inst{26} = V; let Inst{25-23} = 0b000; let Inst{22} = L; let Inst{21-15} = offset; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let DecoderMethod = "DecodePairLdStInstruction"; } multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, Operand indextype, string asm> { let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in def i : BaseLoadStorePairNoAlloc, Sched<[WriteLD, WriteLDHi]>; def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, Operand indextype, string asm> { let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in def i : BaseLoadStorePairNoAlloc, Sched<[WriteSTP]>; def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } //--- // Load/store exclusive //--- // True exclusive operations write to and/or read from the system's exclusive // monitors, which as far as a compiler is concerned can be modelled as a // random shared memory address. 
Hence LoadExclusive mayStore. // // Since these instructions have the undefined register bits set to 1 in // their canonical form, we need a post encoder method to set those bits // to 1 when encoding these instructions. We do this using the // fixLoadStoreExclusive function. This function has template parameters: // // fixLoadStoreExclusive // // hasRs indicates that the instruction uses the Rs field, so we won't set // it to 1 (and the same for Rt2). We don't need template parameters for // the other register fields since Rt and Rn are always used. // let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, dag oops, dag iops, string asm, string operands> : I { let Inst{31-30} = sz; let Inst{29-24} = 0b001000; let Inst{23} = o2; let Inst{22} = L; let Inst{21} = o1; let Inst{15} = o0; let DecoderMethod = "DecodeExclusiveLdStInstruction"; } // Neither Rs nor Rt2 operands. class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, dag oops, dag iops, string asm, string operands> : BaseLoadStoreExclusive { bits<5> Rt; bits<5> Rn; let Inst{20-16} = 0b11111; let Unpredictable{20-16} = 0b11111; let Inst{14-10} = 0b11111; let Unpredictable{14-10} = 0b11111; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; } // Simple load acquires don't set the exclusive monitor let mayLoad = 1, mayStore = 0 in class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, RegisterClass regtype, string asm> : LoadStoreExclusiveSimple, Sched<[WriteLD]>; class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, RegisterClass regtype, string asm> : LoadStoreExclusiveSimple, Sched<[WriteLD]>; class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, RegisterClass regtype, string asm> : BaseLoadStoreExclusive, Sched<[WriteLD, WriteLDHi]> { bits<5> Rt; bits<5> Rt2; bits<5> Rn; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; } // Simple store release operations do not check the exclusive monitor. 
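
// As the comment above explains, the canonical encodings of these exclusive
// and acquire/release instructions have their unused register fields set to
// all ones, and fixLoadStoreExclusive<hasRs, hasRt2> patches that in after
// the normal operand encoding.  A standalone C++ sketch of that fix-up step
// (field positions taken from the classes above; the function here is an
// illustration, not LLVM's post-encoder itself):
#include <cstdint>

template <bool HasRs, bool HasRt2>
static uint32_t fixExclusiveEncoding(uint32_t EncodedInsn) {
  if (!HasRs)
    EncodedInsn |= 0x1fu << 16;   // Rs field, bits 20-16, reads as 0b11111
  if (!HasRt2)
    EncodedInsn |= 0x1fu << 10;   // Rt2 field, bits 14-10, reads as 0b11111
  return EncodedInsn;
}
// A plain load-exclusive would use fixExclusiveEncoding<false, false>,
// matching the PostEncoderMethod string "fixLoadStoreExclusive<0,0>" above.
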
let mayLoad = 0, mayStore = 1 in class StoreRelease sz, bit o2, bit L, bit o1, bit o0, RegisterClass regtype, string asm> : LoadStoreExclusiveSimple, Sched<[WriteST]>; let mayLoad = 1, mayStore = 1 in class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, RegisterClass regtype, string asm> : BaseLoadStoreExclusive, Sched<[WriteSTX]> { bits<5> Ws; bits<5> Rt; bits<5> Rn; let Inst{20-16} = Ws; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let Constraints = "@earlyclobber $Ws"; let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; } class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, RegisterClass regtype, string asm> : BaseLoadStoreExclusive, Sched<[WriteSTX]> { bits<5> Ws; bits<5> Rt; bits<5> Rt2; bits<5> Rn; let Inst{20-16} = Ws; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let Constraints = "@earlyclobber $Ws"; } //--- // Exception generation //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class ExceptionGeneration op1, bits<2> ll, string asm> : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, Sched<[WriteSys]> { bits<16> imm; let Inst{31-24} = 0b11010100; let Inst{23-21} = op1; let Inst{20-5} = imm; let Inst{4-2} = 0b000; let Inst{1-0} = ll; } let Predicates = [HasFPARMv8] in { //--- // Floating point to integer conversion //--- class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, string asm, list pattern> : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; let Inst{30-29} = 0b00; let Inst{28-24} = 0b11110; let Inst{23-22} = type; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; let Inst{15-10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, Operand immType, string asm, list pattern> : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), asm, "\t$Rd, $Rn, $scale", "", pattern>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; bits<6> scale; let Inst{30-29} = 0b00; let Inst{28-24} = 0b11110; let Inst{23-22} = type; let Inst{21} = 0; let Inst{20-19} = rmode; let Inst{18-16} = opcode; let Inst{15-10} = scale; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { // Unscaled half-precision to 32-bit def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { let Inst{31} = 0; // 32-bit GPR flag let Predicates = [HasFullFP16]; } // Unscaled half-precision to 64-bit def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; } // Unscaled single-precision to 32-bit def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { let Inst{31} = 0; // 32-bit GPR flag } // Unscaled single-precision to 64-bit def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { let Inst{31} = 1; // 64-bit GPR flag } // Unscaled double-precision to 32-bit def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { let Inst{31} = 0; // 32-bit GPR flag } // Unscaled double-precision to 64-bit def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, 
asm, [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { let Inst{31} = 1; // 64-bit GPR flag } } multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { // Scaled half-precision to 32-bit def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, fixedpoint_f16_i32, asm, [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, fixedpoint_f16_i32:$scale)))]> { let Inst{31} = 0; // 32-bit GPR flag let scale{5} = 1; let Predicates = [HasFullFP16]; } // Scaled half-precision to 64-bit def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, fixedpoint_f16_i64, asm, [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, fixedpoint_f16_i64:$scale)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; } // Scaled single-precision to 32-bit def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, fixedpoint_f32_i32, asm, [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, fixedpoint_f32_i32:$scale)))]> { let Inst{31} = 0; // 32-bit GPR flag let scale{5} = 1; } // Scaled single-precision to 64-bit def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, fixedpoint_f32_i64, asm, [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, fixedpoint_f32_i64:$scale)))]> { let Inst{31} = 1; // 64-bit GPR flag } // Scaled double-precision to 32-bit def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, fixedpoint_f64_i32, asm, [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, fixedpoint_f64_i32:$scale)))]> { let Inst{31} = 0; // 32-bit GPR flag let scale{5} = 1; } // Scaled double-precision to 64-bit def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, fixedpoint_f64_i64, asm, [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, fixedpoint_f64_i64:$scale)))]> { let Inst{31} = 1; // 64-bit GPR flag } } //--- // Integer to floating point conversion //--- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseIntegerToFP pattern> : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), asm, "\t$Rd, $Rn, $scale", "", pattern>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; bits<6> scale; let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b00001; let Inst{16} = isUnsigned; let Inst{15-10} = scale; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class BaseIntegerToFPUnscaled : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; bits<6> scale; let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b10001; let Inst{16} = isUnsigned; let Inst{15-10} = 0b000000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass IntegerToFP { // Unscaled def UWHri: BaseIntegerToFPUnscaled { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def UWSri: BaseIntegerToFPUnscaled { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b00; // 32-bit FPR flag } def UWDri: BaseIntegerToFPUnscaled { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b01; // 64-bit FPR flag } def UXHri: BaseIntegerToFPUnscaled { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def UXSri: BaseIntegerToFPUnscaled { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b00; // 32-bit FPR flag } def UXDri: BaseIntegerToFPUnscaled { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b01; // 64-bit FPR flag } // Scaled def SWHri: BaseIntegerToFP { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let scale{5} = 1; let Predicates = [HasFullFP16]; } def SWSri: BaseIntegerToFP { let Inst{31} = 0; // 
32-bit GPR flag let Inst{23-22} = 0b00; // 32-bit FPR flag let scale{5} = 1; } def SWDri: BaseIntegerToFP { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b01; // 64-bit FPR flag let scale{5} = 1; } def SXHri: BaseIntegerToFP { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def SXSri: BaseIntegerToFP { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b00; // 32-bit FPR flag } def SXDri: BaseIntegerToFP { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b01; // 64-bit FPR flag } } //--- // Unscaled integer <-> floating point conversion (i.e. FMOV) //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseUnscaledConversion rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, string asm> : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", // We use COPY_TO_REGCLASS for these bitconvert operations. // copyPhysReg() expands the resultant COPY instructions after // regalloc is done. This gives greater freedom for the allocator // and related passes (coalescing, copy propagation, et. al.) to // be more effective. [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; let Inst{30-24} = 0b0011110; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; let Inst{15-10} = 0b000000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseUnscaledConversionToHigh rmode, bits<3> opcode, RegisterClass srcType, RegisterOperand dstType, string asm, string kind> : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; let Inst{30-23} = 0b00111101; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; let Inst{15-10} = 0b000000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let DecoderMethod = "DecodeFMOVLaneInstruction"; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, RegisterOperand srcType, RegisterClass dstType, string asm, string kind> : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; let Inst{30-23} = 0b00111101; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; let Inst{15-10} = 0b000000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; let DecoderMethod = "DecodeFMOVLaneInstruction"; } multiclass UnscaledConversion { def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b00; // 32-bit FPR flag } def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b01; // 64-bit FPR flag } def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> { let Inst{31} = 1; // 64-bit 
GPR flag let Inst{23-22} = 0b11; // 16-bit FPR flag let Predicates = [HasFullFP16]; } def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag let Inst{23-22} = 0b00; // 32-bit FPR flag } def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag let Inst{23-22} = 0b01; // 64-bit FPR flag } def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, asm, ".d"> { let Inst{31} = 1; let Inst{22} = 0; } def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, asm, ".d"> { let Inst{31} = 1; let Inst{22} = 0; } } //--- // Floating point conversion //--- class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, RegisterClass srcType, string asm, list pattern> : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; let Inst{31-24} = 0b00011110; let Inst{23-22} = type; let Inst{21-17} = 0b10001; let Inst{16-15} = opcode; let Inst{14-10} = 0b10000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass FPConversion { // Double-precision to Half-precision def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, [(set FPR16:$Rd, (fpround FPR64:$Rn))]>; // Double-precision to Single-precision def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, [(set FPR32:$Rd, (fpround FPR64:$Rn))]>; // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>; // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, [(set FPR16:$Rd, (fpround FPR32:$Rn))]>; } //--- // Single operand floating point data processing //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSingleOperandFPData opcode, RegisterClass regtype, ValueType vt, string asm, SDPatternOperator node> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; let Inst{31-24} = 0b00011110; let Inst{21-19} = 0b100; let Inst{18-15} = opcode; let Inst{14-10} = 0b10000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SingleOperandFPData opcode, string asm, SDPatternOperator node = null_frag> { def Hr : BaseSingleOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } def Sr : BaseSingleOperandFPData { let Inst{23-22} = 0b00; // 32-bit size flag } def Dr : BaseSingleOperandFPData { let Inst{23-22} = 0b01; // 64-bit size flag } } //--- // Two operand floating point data processing //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseTwoOperandFPData opcode, RegisterClass regtype, string asm, list pat> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", pat>, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass TwoOperandFPData opcode, string asm, SDPatternOperator node = null_frag> { def Hrr : BaseTwoOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } def Srr : 
BaseTwoOperandFPData { let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData { let Inst{23-22} = 0b01; // 64-bit size flag } } multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { def Hrr : BaseTwoOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } def Srr : BaseTwoOperandFPData { let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData { let Inst{23-22} = 0b01; // 64-bit size flag } } //--- // Three operand floating point data processing //--- class BaseThreeOperandFPData pat> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, Sched<[WriteFMul]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<5> Ra; let Inst{31-24} = 0b00011111; let Inst{21} = isNegated; let Inst{20-16} = Rm; let Inst{15} = isSub; let Inst{14-10} = Ra; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass ThreeOperandFPData { def Hrrr : BaseThreeOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } def Srrr : BaseThreeOperandFPData { let Inst{23-22} = 0b00; // 32-bit size flag } def Drrr : BaseThreeOperandFPData { let Inst{23-22} = 0b01; // 64-bit size flag } } //--- // Floating point data comparisons //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseOneOperandFPComparison pat> : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, Sched<[WriteFCmp]> { bits<5> Rn; let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{15-10} = 0b001000; let Inst{9-5} = Rn; let Inst{4} = signalAllNans; let Inst{3-0} = 0b1000; // Rm should be 0b00000 canonically, but we need to accept any value. let PostEncoderMethod = "fixOneOperandFPComparison"; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseTwoOperandFPComparison pat> : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, Sched<[WriteFCmp]> { bits<5> Rm; bits<5> Rn; let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-10} = 0b001000; let Inst{9-5} = Rn; let Inst{4} = signalAllNans; let Inst{3-0} = 0b0000; } multiclass FPComparison { let Defs = [NZCV] in { def Hrr : BaseTwoOperandFPComparison { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Hri : BaseOneOperandFPComparison { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Srr : BaseTwoOperandFPComparison { let Inst{23-22} = 0b00; } def Sri : BaseOneOperandFPComparison { let Inst{23-22} = 0b00; } def Drr : BaseTwoOperandFPComparison { let Inst{23-22} = 0b01; } def Dri : BaseOneOperandFPComparison { let Inst{23-22} = 0b01; } } // Defs = [NZCV] } //--- // Floating point conditional comparisons //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseFPCondComparison pat> : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, Sched<[WriteFCmp]> { let Uses = [NZCV]; let Defs = [NZCV]; bits<5> Rn; bits<5> Rm; bits<4> nzcv; bits<4> cond; let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; let Inst{11-10} = 0b01; let Inst{9-5} = Rn; let Inst{4} = signalAllNans; let Inst{3-0} = nzcv; } multiclass FPCondComparison { - def Hrr : BaseFPCondComparison { + def Hrr : BaseFPCondComparison { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Srr : BaseFPCondComparison { let Inst{23-22} = 0b00; } def Drr : BaseFPCondComparison { let Inst{23-22} = 0b01; } } //--- // Floating point conditional select //--- class 
BaseFPCondSelect : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), asm, "\t$Rd, $Rn, $Rm, $cond", "", [(set regtype:$Rd, (AArch64csel (vt regtype:$Rn), regtype:$Rm, (i32 imm:$cond), NZCV))]>, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<4> cond; let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; let Inst{11-10} = 0b11; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass FPCondSelect { let Uses = [NZCV] in { def Hrrr : BaseFPCondSelect { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Srrr : BaseFPCondSelect { let Inst{23-22} = 0b00; } def Drrr : BaseFPCondSelect { let Inst{23-22} = 0b01; } } // Uses = [NZCV] } //--- // Floating move immediate //--- class BaseFPMoveImmediate : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", [(set regtype:$Rd, fpimmtype:$imm)]>, Sched<[WriteFImm]> { bits<5> Rd; bits<8> imm; let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-13} = imm; let Inst{12-5} = 0b10000000; let Inst{4-0} = Rd; } multiclass FPMoveImmediate { def Hi : BaseFPMoveImmediate { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Si : BaseFPMoveImmediate { let Inst{23-22} = 0b00; } def Di : BaseFPMoveImmediate { let Inst{23-22} = 0b01; } } } // end of 'let Predicates = [HasFPARMv8]' //---------------------------------------------------------------------------- // AdvSIMD //---------------------------------------------------------------------------- let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- // AdvSIMD three register vector instructions //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDThreeSameVector size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDThreeSameVectorTied size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class BaseSIMDThreeSameVectorDot : BaseSIMDThreeSameVectorTied { let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); } multiclass SIMDThreeSameVectorDot { def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } // All operand sizes distinguished in the encoding. 
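
// BaseFPMoveImmediate above carries an 8-bit floating-point immediate.
// Architecturally, FMOV (immediate) can only materialize values of the form
// +/- n/16 * 2^r with 16 <= n <= 31 and -3 <= r <= 4; that rule is taken
// from the architecture reference rather than from this file, and the
// standalone C++ check below simply enumerates the 128 positive candidates
// instead of reproducing the exact imm8 bit packing.
#include <cmath>

static bool isEncodableFPImm(double V) {
  if (V == 0.0 || std::isnan(V) || std::isinf(V))
    return false;                        // 0.0 is materialized another way
  double Mag = std::fabs(V);
  for (int N = 16; N <= 31; ++N)
    for (int R = -3; R <= 4; ++R)
      if (Mag == (double)N / 16.0 * std::ldexp(1.0, R))
        return true;
  return false;
}
// 1.0 (= 16/16 * 2^0) and -0.125 (= 16/16 * 2^-3) are encodable; 0.1 is not.
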
multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } // As above, but D sized elements unsupported. multiclass SIMDThreeSameVectorBHS opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } // As above, but only B sized elements supported. 
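
// The integer three-same multiclasses above pick the Q bit and the three
// size bits (Inst{23-21}) from the vector arrangement.  The standalone C++
// sketch below mirrors the selections visible in those defs (.8b/.16b,
// .4h/.8h, .2s/.4s, .2d); it is only a restatement of that mapping for
// illustration, not an encoder, and the names are invented for the example.
#include <cstdint>

struct ThreeSameSel {
  uint8_t Q;        // 0 = 64-bit vector (V64), 1 = 128-bit vector (V128)
  uint8_t SizeBits; // value placed in Inst{23-21}
};

static ThreeSameSel selectThreeSame(unsigned EltBits, bool FullWidth) {
  uint8_t Q = FullWidth ? 1 : 0;
  switch (EltBits) {
  case 8:  return {Q, 0b001};   // .8b  / .16b
  case 16: return {Q, 0b011};   // .4h  / .8h
  case 32: return {Q, 0b101};   // .2s  / .4s
  default: return {1, 0b111};   // .2d: 64-bit elements only exist on V128
  }
}
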
multiclass SIMDThreeSameVectorB opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } // As above, but only floating point elements supported. multiclass SIMDThreeSameVectorFP opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, asm, ".4h", [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, asm, ".8h", [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; } // Predicates = [HasNEON, HasFullFP16] def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } multiclass SIMDThreeSameVectorFPCmp opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; } // Predicates = [HasNEON, HasFullFP16] def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } multiclass SIMDThreeSameVectorFPTied opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; } // Predicates = [HasNEON, HasFullFP16] def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } // As above, but D and B sized elements unsupported. 
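// The H/S-only and logical multiclasses that follow are used roughly like
// this (sketch; encodings indicative):
//   defm SQDMULH : SIMDThreeSameVectorHS<0, 0b10110, "sqdmulh",
//                                        int_aarch64_neon_sqdmulh>;
//   defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
//   defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;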
multiclass SIMDThreeSameVectorHS opc, string asm, SDPatternOperator OpNode> { def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } // Logical three vector ops share opcode bits, and only use B sized elements. multiclass SIMDLogicalThreeVector size, string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; } multiclass SIMDLogicalThreeVectorTied size, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), (v4i16 V64:$RHS))), (!cast(NAME#"v8i8") V64:$LHS, V64:$MHS, V64:$RHS)>; def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), (v2i32 V64:$RHS))), (!cast(NAME#"v8i8") V64:$LHS, V64:$MHS, V64:$RHS)>; def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), (v1i64 V64:$RHS))), (!cast(NAME#"v8i8") V64:$LHS, V64:$MHS, V64:$RHS)>; def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), (v8i16 V128:$RHS))), (!cast(NAME#"v16i8") V128:$LHS, V128:$MHS, V128:$RHS)>; def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), (v4i32 V128:$RHS))), (!cast(NAME#"v16i8") V128:$LHS, V128:$MHS, V128:$RHS)>; def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), (v2i64 V128:$RHS))), (!cast(NAME#"v16i8") V128:$LHS, V128:$MHS, V128:$RHS)>; } //---------------------------------------------------------------------------- // AdvSIMD two register vector instructions. 
//---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector size, bits<5> opcode, bits<2> size2, RegisterOperand regtype, string asm, string dstkind, string srckind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0b1; let Inst{20-19} = size2; let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied size, bits<5> opcode, bits<2> size2, RegisterOperand regtype, string asm, string dstkind, string srckind, list pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0b1; let Inst{20-19} = size2; let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } // Supports B, H, and S element sizes. multiclass SIMDTwoVectorBHS opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } class BaseSIMDVectorLShiftLongBySize size, RegisterOperand regtype, string asm, string dstkind, string srckind, string amount> : I<(outs V128:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29-24} = 0b101110; let Inst{23-22} = size; let Inst{21-10} = 0b100001001110; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDVectorLShiftLongBySizeBHS { let hasSideEffects = 0 in { def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, "shll", ".8h", ".8b", "8">; def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, "shll2", ".8h", ".16b", "8">; def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, "shll", ".4s", ".4h", "16">; def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, "shll2", ".4s", ".8h", "16">; def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, "shll", ".2d", ".2s", "32">; def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, "shll2", ".2d", ".4s", "32">; } } // Supports all element sizes. 
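// The pairwise-long multiclasses below back instructions such as saddlp and
// sadalp; a sketch of the instantiation (encodings indicative):
//   defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp",
//                                   int_aarch64_neon_saddlp>;
//   defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
//                                       int_aarch64_neon_sadalp>;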
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
                             SDPatternOperator OpNode> {
  def v8i8_v4i16  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                        asm, ".4h", ".8b",
                        [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
  def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                        asm, ".8h", ".16b",
                        [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
  def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
                        asm, ".2s", ".4h",
                        [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
  def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                        asm, ".4s", ".8h",
                        [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
  def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
                        asm, ".1d", ".2s",
                        [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
  def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
                        asm, ".2d", ".4s",
                        [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}

multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
                                 SDPatternOperator OpNode> {
  def v8i8_v4i16  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
                        asm, ".4h", ".8b",
                        [(set (v4i16 V64:$dst),
                              (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>;
  def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
                        asm, ".8h", ".16b",
                        [(set (v8i16 V128:$dst),
                              (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>;
  def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
                        asm, ".2s", ".4h",
                        [(set (v2i32 V64:$dst),
                              (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>;
  def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
                        asm, ".4s", ".8h",
                        [(set (v4i32 V128:$dst),
                              (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>;
  def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
                        asm, ".1d", ".2s",
                        [(set (v1i64 V64:$dst),
                              (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>;
  def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
                        asm, ".2d", ".4s",
                        [(set (v2i64 V128:$dst),
                              (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>;
}

// Supports all element sizes, except 1xD.
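// The tied form below is for accumulating operations such as suqadd, where
// $Rd is both source and destination (sketch; encoding indicative):
//   defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",
//                                       int_aarch64_neon_suqadd>;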
multiclass SIMDTwoVectorBHSDTied opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } // Supports only B element sizes. multiclass SIMDTwoVectorB size, bits<5> opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; } // Supports only B and H element sizes. multiclass SIMDTwoVectorBH opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } // Supports only S and D element sizes, uses high bit of the size field // as an extra opcode bit. 
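// Illustrative uses of the FP two-vector multiclass that follows (the S bit
// and opcode values are indicative):
//   defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
//   defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;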
multiclass SIMDTwoVectorFP opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, asm, ".4h", ".4h", [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, asm, ".8h", ".8h", [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; } // Predicates = [HasNEON, HasFullFP16] def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } // Supports only S element size. multiclass SIMDTwoVectorS opc, string asm, SDPatternOperator OpNode> { def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDTwoVectorFPToInt opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; } // Predicates = [HasNEON, HasFullFP16] def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, asm, ".4h", ".4h", [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, asm, ".8h", ".8h", [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; } // Predicates = [HasNEON, HasFullFP16] def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } class BaseSIMDMixedTwoVector size, bits<5> opcode, RegisterOperand inreg, RegisterOperand outreg, string asm, string outkind, string inkind, list pattern> : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, "{\t$Rd" # outkind # ", $Rn" # inkind # "|" # outkind # "\t$Rd, $Rn}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21-17} = 0b10000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, RegisterOperand inreg, RegisterOperand 
outreg, string asm, string outkind, string inkind, list pattern> : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, "{\t$Rd" # outkind # ", $Rn" # inkind # "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21-17} = 0b10000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDMixedTwoVector opc, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, asm, ".8b", ".8h", [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, asm#"2", ".16b", ".8h", []>; def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, asm, ".4h", ".4s", [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, asm#"2", ".8h", ".4s", []>; def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, asm, ".2s", ".2d", [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, asm#"2", ".4s", ".2d", []>; def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), (!cast(NAME # "v16i8") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), (!cast(NAME # "v8i16") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), (!cast(NAME # "v4i32") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } class BaseSIMDCmpTwoVector size, bits<2> size2, bits<5> opcode, RegisterOperand regtype, string asm, string kind, string zero, ValueType dty, ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0b1; let Inst{20-19} = size2; let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector opc, string asm, SDNode OpNode> { def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } // FP Comparisons support only S and D element sizes (and H for v8.2a). 
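// Compare-against-zero multiclasses (the integer one above, the FP one below)
// are instantiated roughly as follows (sketch; encodings indicative):
//   defm CMEQ  : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
//   defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;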
multiclass SIMDFPCmpTwoVector opc, string asm, SDNode OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, asm, ".4h", "0.0", v4i16, v4f16, OpNode>; def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, asm, ".8h", "0.0", v8i16, v8f16, OpNode>; } // Predicates = [HasNEON, HasFullFP16] def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; let Predicates = [HasNEON, HasFullFP16] in { def : InstAlias(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; def : InstAlias(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; } def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; let Predicates = [HasNEON, HasFullFP16] in { def : InstAlias(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; def : InstAlias(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; } def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDFPCvtTwoVector size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, list pattern> : I<(outs outtype:$Rd), (ins intype:$Rn), asm, !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21-17} = 0b10000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, list pattern> : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21-17} = 0b10000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDFPWidenTwoVector opc, string asm> { def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, asm, ".4s", ".4h", []>; def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, asm#"2", ".4s", ".8h", []>; def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, asm, ".2d", ".2s", []>; def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, asm#"2", ".2d", ".4s", []>; } multiclass SIMDFPNarrowTwoVector opc, string asm> { def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, asm, ".4h", ".4s", []>; def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, asm#"2", ".8h", ".4s", []>; def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, asm, ".2s", ".2d", []>; def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, asm#"2", ".4s", ".2d", []>; } multiclass SIMDFPInexactCvtTwoVector opc, string asm, Intrinsic OpNode> { def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, asm, ".2s", ".2d", [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, 
U, {S,1}, opc, V128, V128, asm#"2", ".4s", ".2d", []>; def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), (!cast(NAME # "v4f32") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } //---------------------------------------------------------------------------- // AdvSIMD three register different-size vector instructions. //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDDifferentThreeVector size, bits<4> opcode, RegisterOperand outtype, RegisterOperand intype1, RegisterOperand intype2, string asm, string outkind, string inkind1, string inkind2, list pattern> : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = size{0}; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size{2-1}; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; let Inst{11-10} = 0b00; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, RegisterOperand outtype, RegisterOperand intype1, RegisterOperand intype2, string asm, string outkind, string inkind1, string inkind2, list pattern> : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = size{0}; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size{2-1}; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; let Inst{11-10} = 0b00; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } // FIXME: TableGen doesn't know how to deal with expanded types that also // change the element count (in this case, placing the results in // the high elements of the result register rather than the low // elements). Until that's fixed, we can't code-gen those. multiclass SIMDNarrowThreeVectorBHS opc, string asm, Intrinsic IntOp> { def v8i16_v8i8 : BaseSIMDDifferentThreeVector; def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; def v4i32_v4i16 : BaseSIMDDifferentThreeVector; def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; def v2i64_v2i32 : BaseSIMDDifferentThreeVector; def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in // a version attached to an instruction. 
def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm))), (!cast(NAME # "v8i16_v16i8") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm))), (!cast(NAME # "v4i32_v8i16") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm))), (!cast(NAME # "v2i64_v4i32") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; } multiclass SIMDDifferentThreeVectorBD opc, string asm, Intrinsic IntOp> { def v8i8 : BaseSIMDDifferentThreeVector; def v16i8 : BaseSIMDDifferentThreeVector; let Predicates = [HasAES] in { def v1i64 : BaseSIMDDifferentThreeVector; def v2i64 : BaseSIMDDifferentThreeVector; } def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), (v8i8 (extract_high_v16i8 V128:$Rm)))), (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } multiclass SIMDLongThreeVectorHS opc, string asm, SDPatternOperator OpNode> { def v4i16_v4i32 : BaseSIMDDifferentThreeVector; def v8i16_v4i32 : BaseSIMDDifferentThreeVector; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; def v4i32_v2i64 : BaseSIMDDifferentThreeVector; } multiclass SIMDLongThreeVectorBHSabdl opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector; def v16i8_v8i16 : BaseSIMDDifferentThreeVector; def v4i16_v4i32 : BaseSIMDDifferentThreeVector; def v8i16_v4i32 : BaseSIMDDifferentThreeVector; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; def v4i32_v2i64 : BaseSIMDDifferentThreeVector; } multiclass SIMDLongThreeVectorTiedBHSabal opc, string asm, SDPatternOperator OpNode> { def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; } multiclass SIMDLongThreeVectorBHS opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector; def v16i8_v8i16 : BaseSIMDDifferentThreeVector; def v4i16_v4i32 : BaseSIMDDifferentThreeVector; def v8i16_v4i32 : BaseSIMDDifferentThreeVector; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; def v4i32_v2i64 : BaseSIMDDifferentThreeVector; } multiclass SIMDLongThreeVectorTiedBHS opc, string asm, SDPatternOperator OpNode> { def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; } multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, SDPatternOperator Accum> { def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; } multiclass SIMDWideThreeVectorBHS opc, string asm, SDPatternOperator OpNode> { def v8i8_v8i16 : BaseSIMDDifferentThreeVector; def v16i8_v8i16 : BaseSIMDDifferentThreeVector; def v4i16_v4i32 : BaseSIMDDifferentThreeVector; def v8i16_v4i32 : BaseSIMDDifferentThreeVector; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; def v4i32_v2i64 : BaseSIMDDifferentThreeVector; } 
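// Representative instantiations of the different-size three-vector
// multiclasses above (sketch; opcode fields indicative):
//   defm RADDHN : SIMDNarrowThreeVectorBHS<1, 0b0100, "raddhn",
//                                          int_aarch64_neon_raddhn>;
//   defm PMULL  : SIMDDifferentThreeVectorBD<0, 0b1110, "pmull",
//                                            int_aarch64_neon_pmull>;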
//---------------------------------------------------------------------------- // AdvSIMD bitwise extract from vector //---------------------------------------------------------------------------- class BaseSIMDBitwiseExtract : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", [(set (vty regtype:$Rd), (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<4> imm; let Inst{31} = 0; let Inst{30} = size; let Inst{29-21} = 0b101110000; let Inst{20-16} = Rm; let Inst{15} = 0; let Inst{14-11} = imm; let Inst{10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDBitwiseExtract { def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { let imm{3} = 0; } def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; } //---------------------------------------------------------------------------- // AdvSIMD zip vector //---------------------------------------------------------------------------- class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, string asm, string kind, SDNode OpNode, ValueType valty> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # "|" # kind # "\t$Rd, $Rn, $Rm}", "", [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = size{0}; let Inst{29-24} = 0b001110; let Inst{23-22} = size{2-1}; let Inst{21} = 0; let Inst{20-16} = Rm; let Inst{15} = 0; let Inst{14-12} = opc; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDZipVectoropc, string asm, SDNode OpNode> { def v8i8 : BaseSIMDZipVector<0b000, opc, V64, asm, ".8b", OpNode, v8i8>; def v16i8 : BaseSIMDZipVector<0b001, opc, V128, asm, ".16b", OpNode, v16i8>; def v4i16 : BaseSIMDZipVector<0b010, opc, V64, asm, ".4h", OpNode, v4i16>; def v8i16 : BaseSIMDZipVector<0b011, opc, V128, asm, ".8h", OpNode, v8i16>; def v2i32 : BaseSIMDZipVector<0b100, opc, V64, asm, ".2s", OpNode, v2i32>; def v4i32 : BaseSIMDZipVector<0b101, opc, V128, asm, ".4s", OpNode, v4i32>; def v2i64 : BaseSIMDZipVector<0b111, opc, V128, asm, ".2d", OpNode, v2i64>; def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)), (!cast(NAME#"v4i16") V64:$Rn, V64:$Rm)>; def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)), (!cast(NAME#"v8i16") V128:$Rn, V128:$Rm)>; def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; } //---------------------------------------------------------------------------- // AdvSIMD three register scalar instructions //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDThreeScalar size, bits<5> opcode, RegisterClass regtype, string asm, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class 
BaseSIMDThreeScalarTied size, bit R, bits<5> opcode, dag oops, dag iops, string asm, list pattern> : I, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21} = R; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDThreeScalarD opc, string asm, SDPatternOperator OpNode> { def v1i64 : BaseSIMDThreeScalar; } multiclass SIMDThreeScalarBHSD opc, string asm, SDPatternOperator OpNode> { def v1i64 : BaseSIMDThreeScalar; def v1i32 : BaseSIMDThreeScalar; def v1i16 : BaseSIMDThreeScalar; def v1i8 : BaseSIMDThreeScalar; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; } multiclass SIMDThreeScalarHS opc, string asm, SDPatternOperator OpNode> { def v1i32 : BaseSIMDThreeScalar; def v1i16 : BaseSIMDThreeScalar; } multiclass SIMDThreeScalarHSTied opc, string asm, SDPatternOperator OpNode = null_frag> { def v1i32: BaseSIMDThreeScalarTied; def v1i16: BaseSIMDThreeScalarTied; } multiclass SIMDFPThreeScalar opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def #NAME#64 : BaseSIMDThreeScalar; def #NAME#32 : BaseSIMDThreeScalar; let Predicates = [HasNEON, HasFullFP16] in { def #NAME#16 : BaseSIMDThreeScalar; } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } multiclass SIMDThreeScalarFPCmp opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def #NAME#64 : BaseSIMDThreeScalar; def #NAME#32 : BaseSIMDThreeScalar; let Predicates = [HasNEON, HasFullFP16] in { def #NAME#16 : BaseSIMDThreeScalar; } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } class BaseSIMDThreeScalarMixed size, bits<5> opcode, dag oops, dag iops, string asm, string cstr, list pat> : I, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDThreeScalarMixedHS opc, string asm, SDPatternOperator OpNode = null_frag> { def i16 : BaseSIMDThreeScalarMixed; def i32 : BaseSIMDThreeScalarMixed; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDThreeScalarMixedTiedHS opc, string asm, SDPatternOperator OpNode = null_frag> { def i16 : BaseSIMDThreeScalarMixed; def i32 : BaseSIMDThreeScalarMixed; } //---------------------------------------------------------------------------- // AdvSIMD two register scalar instructions //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, "\t$Rd, $Rn", "", pat>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21} = 0b1; let 
Inst{20-19} = size2; let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoScalarTied size, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list pat> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, "\t$Rd, $Rn", "$Rd = $dst", pat>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21-17} = 0b10000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDCmpTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21} = 0b1; let Inst{20-19} = size2; let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class SIMDInexactCvtTwoScalar opcode, string asm> : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31-17} = 0b011111100110000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDCmpTwoScalarD opc, string asm, SDPatternOperator OpNode> { def v1i64rz : BaseSIMDCmpTwoScalar; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast(NAME # v1i64rz) FPR64:$Rn)>; } multiclass SIMDFPCmpTwoScalar opc, string asm, SDPatternOperator OpNode> { def v1i64rz : BaseSIMDCmpTwoScalar; def v1i32rz : BaseSIMDCmpTwoScalar; let Predicates = [HasNEON, HasFullFP16] in { def v1i16rz : BaseSIMDCmpTwoScalar; } def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; let Predicates = [HasNEON, HasFullFP16] in { def : InstAlias(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast(NAME # v1i64rz) FPR64:$Rn)>; } multiclass SIMDTwoScalarD opc, string asm, SDPatternOperator OpNode = null_frag> { def v1i64 : BaseSIMDTwoScalar; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), (!cast(NAME # "v1i64") FPR64:$Rn)>; } multiclass SIMDFPTwoScalar opc, string asm> { def v1i64 : BaseSIMDTwoScalar; def v1i32 : BaseSIMDTwoScalar; let Predicates = [HasNEON, HasFullFP16] in { def v1f16 : BaseSIMDTwoScalar; } } multiclass SIMDFPTwoScalarCVT opc, string asm, SDPatternOperator OpNode> { def v1i64 : BaseSIMDTwoScalar; def v1i32 : BaseSIMDTwoScalar; let Predicates = [HasNEON, HasFullFP16] in { def v1i16 : BaseSIMDTwoScalar; } } multiclass SIMDTwoScalarBHSD opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def v1i64 : BaseSIMDTwoScalar; def v1i32 : BaseSIMDTwoScalar; def v1i16 : BaseSIMDTwoScalar; def v1i8 : BaseSIMDTwoScalar; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), (!cast(NAME # v1i64) FPR64:$Rn)>; } multiclass SIMDTwoScalarBHSDTied opc, string asm, Intrinsic OpNode> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def v1i64 : BaseSIMDTwoScalarTied; def v1i32 : BaseSIMDTwoScalarTied; def v1i16 : BaseSIMDTwoScalarTied; def v1i8 : BaseSIMDTwoScalarTied; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 
FPR64:$Rn))), (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS opc, string asm, SDPatternOperator OpNode = null_frag> { def v1i32 : BaseSIMDTwoScalar; def v1i16 : BaseSIMDTwoScalar; def v1i8 : BaseSIMDTwoScalar; } //---------------------------------------------------------------------------- // AdvSIMD scalar pairwise instructions //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDPairwiseScalar size, bits<5> opcode, RegisterOperand regtype, RegisterOperand vectype, string asm, string kind> : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21-17} = 0b11000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDPairwiseScalarD opc, string asm> { def v2i64p : BaseSIMDPairwiseScalar; } multiclass SIMDFPPairwiseScalar opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, asm, ".2h">; } def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128, asm, ".2d">; } //---------------------------------------------------------------------------- // AdvSIMD across lanes instructions //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDAcrossLanes size, bits<5> opcode, RegisterClass regtype, RegisterOperand vectype, string asm, string kind, list pattern> : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21-17} = 0b11000; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDAcrossLanesBHS opcode, string asm> { def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, asm, ".8b", []>; def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, asm, ".16b", []>; def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, asm, ".4h", []>; def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, asm, ".8h", []>; def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, asm, ".4s", []>; } multiclass SIMDAcrossLanesHSD opcode, string asm> { def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, asm, ".8b", []>; def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, asm, ".16b", []>; def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, asm, ".4h", []>; def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, asm, ".8h", []>; def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, asm, ".4s", []>; } multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, Intrinsic intOp> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, asm, ".4h", [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, asm, ".8h", [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; } // Predicates = 
[HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; } //---------------------------------------------------------------------------- // AdvSIMD INS/DUP instructions //---------------------------------------------------------------------------- // FIXME: There has got to be a better way to factor these. ugh. class BaseSIMDInsDup pattern> : I, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = op; let Inst{28-21} = 0b01110000; let Inst{15} = 0; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class SIMDDupFromMain imm5, string size, ValueType vectype, RegisterOperand vecreg, RegisterClass regtype> : BaseSIMDInsDup { let Inst{20-16} = imm5; let Inst{14-11} = 0b0001; } class SIMDDupFromElement : BaseSIMDInsDup { let Inst{14-11} = 0b0000; } class SIMDDup64FromElement : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, VectorIndexD, i64, AArch64duplane64> { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; } class SIMDDup32FromElement : SIMDDupFromElement { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } class SIMDDup16FromElement : SIMDDupFromElement { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } class SIMDDup8FromElement : SIMDDupFromElement { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } class BaseSIMDMov imm4, RegisterClass regtype, Operand idxtype, string asm, list pattern> : BaseSIMDInsDup { let Inst{14-11} = imm4; } class SIMDSMov : BaseSIMDMov; class SIMDUMov : BaseSIMDMov; class SIMDMovAlias : InstAlias; multiclass SMov { def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } } multiclass UMov { def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; } def : SIMDMovAlias<"mov", ".s", !cast(NAME#"vi32"), GPR32, VectorIndexS>; def : SIMDMovAlias<"mov", ".d", !cast(NAME#"vi64"), GPR64, VectorIndexD>; } class SIMDInsFromMain : BaseSIMDInsDup<1, 0, (outs V128:$dst), (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", "{\t$Rd" # size # "$idx, $Rn" # "|" # size # "\t$Rd$idx, $Rn}", "$Rd = $dst", [(set V128:$dst, (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { let Inst{14-11} = 0b0011; } class SIMDInsFromElement : BaseSIMDInsDup<1, 1, (outs V128:$dst), (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # "|" # size # "\t$Rd$idx, $Rn$idx2}", "$Rd = $dst", [(set V128:$dst, (vector_insert (vectype V128:$Rd), (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), idxtype:$idx))]>; class 
SIMDInsMainMovAlias : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # "|" # size #"\t$dst$idx, $src}", (inst V128:$dst, idxtype:$idx, regtype:$src)>; class SIMDInsElementMovAlias : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; multiclass SIMDIns { def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; } def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { bits<4> idx; bits<4> idx2; let Inst{20-17} = idx; let Inst{16} = 1; let Inst{14-11} = idx2; } def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { bits<3> idx; bits<3> idx2; let Inst{20-18} = idx; let Inst{17-16} = 0b10; let Inst{14-12} = idx2; let Inst{11} = {?}; } def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { bits<2> idx; bits<2> idx2; let Inst{20-19} = idx; let Inst{18-16} = 0b100; let Inst{14-13} = idx2; let Inst{12-11} = {?,?}; } def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { bits<1> idx; bits<1> idx2; let Inst{20} = idx; let Inst{19-16} = 0b1000; let Inst{14} = idx2; let Inst{13-11} = {?,?,?}; } // For all forms of the INS instruction, the "mov" mnemonic is the // preferred alias. Why they didn't just call the instruction "mov" in // the first place is a very good question indeed... 
def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), GPR32, VectorIndexB>; def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), GPR32, VectorIndexH>; def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), GPR32, VectorIndexS>; def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), GPR64, VectorIndexD>; def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), VectorIndexB>; def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), VectorIndexH>; def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), VectorIndexS>; def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), VectorIndexD>; } //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, RegisterOperand listtype, string asm, string kind> : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, Sched<[WriteV]> { bits<5> Vd; bits<5> Vn; bits<5> Vm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29-21} = 0b001110000; let Inst{20-16} = Vm; let Inst{15} = 0; let Inst{14-13} = len; let Inst{12} = op; let Inst{11-10} = 0b00; let Inst{9-5} = Vn; let Inst{4-0} = Vd; } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, RegisterOperand listtype, string asm, string kind> : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, Sched<[WriteV]> { bits<5> Vd; bits<5> Vn; bits<5> Vm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29-21} = 0b001110000; let Inst{20-16} = Vm; let Inst{15} = 0; let Inst{14-13} = len; let Inst{12} = op; let Inst{11-10} = 0b00; let Inst{9-5} = Vn; let Inst{4-0} = Vd; } class SIMDTableLookupAlias : InstAlias; multiclass SIMDTableLookup { def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, asm, ".8b">; def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, asm, ".8b">; def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, asm, ".8b">; def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, asm, ".8b">; def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, asm, ".16b">; def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, asm, ".16b">; def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, asm, ".16b">; def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, asm, ".16b">; def : SIMDTableLookupAlias(NAME#"v8i8One"), V64, VecListOne128>; def : SIMDTableLookupAlias(NAME#"v8i8Two"), V64, VecListTwo128>; def : SIMDTableLookupAlias(NAME#"v8i8Three"), V64, VecListThree128>; def : SIMDTableLookupAlias(NAME#"v8i8Four"), V64, VecListFour128>; def : SIMDTableLookupAlias(NAME#"v16i8One"), V128, VecListOne128>; def : SIMDTableLookupAlias(NAME#"v16i8Two"), V128, VecListTwo128>; def : SIMDTableLookupAlias(NAME#"v16i8Three"), V128, VecListThree128>; def : SIMDTableLookupAlias(NAME#"v16i8Four"), V128, VecListFour128>; } multiclass SIMDTableLookupTied { def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, asm, ".8b">; def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b, asm, ".8b">; def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, asm, ".8b">; def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, asm, ".8b">; def 
v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, asm, ".16b">; def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, asm, ".16b">; def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, asm, ".16b">; def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, asm, ".16b">; def : SIMDTableLookupAlias(NAME#"v8i8One"), V64, VecListOne128>; def : SIMDTableLookupAlias(NAME#"v8i8Two"), V64, VecListTwo128>; def : SIMDTableLookupAlias(NAME#"v8i8Three"), V64, VecListThree128>; def : SIMDTableLookupAlias(NAME#"v8i8Four"), V64, VecListFour128>; def : SIMDTableLookupAlias(NAME#"v16i8One"), V128, VecListOne128>; def : SIMDTableLookupAlias(NAME#"v16i8Two"), V128, VecListTwo128>; def : SIMDTableLookupAlias(NAME#"v16i8Three"), V128, VecListThree128>; def : SIMDTableLookupAlias(NAME#"v16i8Four"), V128, VecListFour128>; } //---------------------------------------------------------------------------- // AdvSIMD scalar CPY //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDScalarCPY : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", "{\t$dst, $src" # kind # "$idx" # "|\t$dst, $src$idx}", "", []>, Sched<[WriteV]> { bits<5> dst; bits<5> src; let Inst{31-21} = 0b01011110000; let Inst{15-10} = 0b000001; let Inst{9-5} = src; let Inst{4-0} = dst; } class SIMDScalarCPYAlias : InstAlias; multiclass SIMDScalarCPY { def i8 : BaseSIMDScalarCPY { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } def i16 : BaseSIMDScalarCPY { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } def i32 : BaseSIMDScalarCPY { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } def i64 : BaseSIMDScalarCPY { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; } def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), VectorIndexD:$idx)))), (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; // 'DUP' mnemonic aliases. 
def : SIMDScalarCPYAlias<"dup", ".b", !cast(NAME#"i8"), FPR8, V128, VectorIndexB>; def : SIMDScalarCPYAlias<"dup", ".h", !cast(NAME#"i16"), FPR16, V128, VectorIndexH>; def : SIMDScalarCPYAlias<"dup", ".s", !cast(NAME#"i32"), FPR32, V128, VectorIndexS>; def : SIMDScalarCPYAlias<"dup", ".d", !cast(NAME#"i64"), FPR64, V128, VectorIndexD>; } //---------------------------------------------------------------------------- // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- class BaseSIMDModifiedImm pattern> : I, Sched<[WriteV]> { bits<5> Rd; bits<8> imm8; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = op; let Inst{28-19} = 0b0111100000; let Inst{18-16} = imm8{7-5}; let Inst{11} = op2; let Inst{10} = 1; let Inst{9-5} = imm8{4-0}; let Inst{4-0} = Rd; } class BaseSIMDModifiedImmVector pattern> : BaseSIMDModifiedImm { let DecoderMethod = "DecodeModImmInstruction"; } class BaseSIMDModifiedImmVectorTied pattern> : BaseSIMDModifiedImm { let DecoderMethod = "DecodeModImmTiedInstruction"; } class BaseSIMDModifiedImmVectorShift b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> : BaseSIMDModifiedImmVector { bits<2> shift; let Inst{15} = b15_b12{1}; let Inst{14-13} = shift; let Inst{12} = b15_b12{0}; } class BaseSIMDModifiedImmVectorShiftTied b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> : BaseSIMDModifiedImmVectorTied { bits<2> shift; let Inst{15} = b15_b12{1}; let Inst{14-13} = shift; let Inst{12} = b15_b12{0}; } class BaseSIMDModifiedImmVectorShiftHalf b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> : BaseSIMDModifiedImmVector { bits<2> shift; let Inst{15} = b15_b12{1}; let Inst{14} = 0; let Inst{13} = shift{0}; let Inst{12} = b15_b12{0}; } class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> : BaseSIMDModifiedImmVectorTied { bits<2> shift; let Inst{15} = b15_b12{1}; let Inst{14} = 0; let Inst{13} = shift{0}; let Inst{12} = b15_b12{0}; } multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, string asm> { def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, asm, ".4h", []>; def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, asm, ".8h", []>; def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, asm, ".2s", []>; def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, asm, ".4s", []>; } multiclass SIMDModifiedImmVectorShiftTied hw_cmode, bits<2> w_cmode, string asm, SDNode OpNode> { def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode V64:$Rd, imm0_255:$imm8, (i32 imm:$shift)))]>; def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode V128:$Rd, imm0_255:$imm8, (i32 imm:$shift)))]>; def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode V64:$Rd, imm0_255:$imm8, (i32 imm:$shift)))]>; def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode V128:$Rd, imm0_255:$imm8, (i32 imm:$shift)))]>; } class SIMDModifiedImmMoveMSL cmode, RegisterOperand vectype, string asm, string kind, list pattern> : BaseSIMDModifiedImmVector { bits<1> shift; let Inst{15-13} = cmode{3-1}; let Inst{12} = shift; } class SIMDModifiedImmVectorNoShift cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list 
pattern> : BaseSIMDModifiedImmVector { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift cmode, string asm, list pattern> : BaseSIMDModifiedImm { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; } //---------------------------------------------------------------------------- // AdvSIMD indexed element //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDIndexed size, bits<4> opc, RegisterOperand dst_reg, RegisterOperand lhs_reg, RegisterOperand rhs_reg, Operand vec_idx, string asm, string apple_kind, string dst_kind, string lhs_kind, string rhs_kind, list pattern> : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28} = Scalar; let Inst{27-24} = 0b1111; let Inst{23-22} = size; // Bit 21 must be set by the derived class. let Inst{20-16} = Rm; let Inst{15-12} = opc; // Bit 11 must be set by the derived class. let Inst{10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDIndexedTied size, bits<4> opc, RegisterOperand dst_reg, RegisterOperand lhs_reg, RegisterOperand rhs_reg, Operand vec_idx, string asm, string apple_kind, string dst_kind, string lhs_kind, string rhs_kind, list pattern> : I<(outs dst_reg:$dst), (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28} = Scalar; let Inst{27-24} = 0b1111; let Inst{23-22} = size; // Bit 21 must be set by the derived class. let Inst{20-16} = Rm; let Inst{15-12} = opc; // Bit 11 must be set by the derived class. 
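// As in the non-tied class above, the derived definitions spread the lane
// index across bits {11}, {21} and {20}: .h lanes use all three
// (idx{2}->Inst{11}, idx{1}->Inst{21}, idx{0}->Inst{20}), .s lanes use
// Inst{11} and Inst{21}, and .d lanes use Inst{11} alone with Inst{21}
// cleared.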
let Inst{10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } // ARMv8.2 Index Dot product instructions class BaseSIMDThreeSameVectorDotIndex : BaseSIMDIndexedTied { bits<2> idx; let Inst{21} = idx{0}; // L let Inst{11} = idx{1}; // H } multiclass SIMDThreeSameVectorDotIndex { def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } multiclass SIMDFPIndexed opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } } // Predicates = [HasNEON, HasFullFP16] def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, V128, V128, V128, VectorIndexD, asm, ".2d", ".2d", ".2d", ".d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { bits<1> idx; let Inst{11} = idx{0}; let Inst{21} = 0; } let Predicates = [HasNEON, HasFullFP16] in { def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", [(set (f16 FPR16Op:$Rd), (OpNode (f16 FPR16Op:$Rn), (f16 (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (f32 FPR32Op:$Rd), (OpNode (f32 FPR32Op:$Rn), (f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, FPR64Op, FPR64Op, V128, VectorIndexD, asm, ".d", "", "", ".d", [(set (f64 FPR64Op:$Rd), (OpNode (f64 FPR64Op:$Rn), (f64 (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { bits<1> idx; let Inst{11} = idx{0}; let Inst{21} = 0; } } multiclass SIMDFPIndexedTiedPatterns { // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. 
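// The duplane pattern takes the multiplicand lane straight from a 128-bit
// vector with an explicit index; the dup-scalar pattern matches a broadcast
// FP register and is selected to the same indexed instruction by moving the
// scalar into lane 0 with SUBREG_TO_REG and using index 0.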
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast(INST # v2i32_indexed) V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64dup (f32 FPR32Op:$Rm)))), (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (AArch64dup (f32 FPR32Op:$Rm)))), (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))), (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (AArch64dup (f64 FPR64Op:$Rm)))), (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; // 2 variants for 32-bit scalar version: extract from .2s or from .4s def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; // 1 variant for 64-bit scalar version: extract from .1d or from .2d def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, V128:$Rm, VectorIndexD:$idx)>; } multiclass SIMDFPIndexedTied opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } } // Predicates = [HasNEON, HasFullFP16] def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", []> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, V128, V128, V128, VectorIndexD, asm, ".2d", ".2d", ".2d", ".d", []> { bits<1> idx; let Inst{11} = idx{0}; let Inst{21} = 0; } let Predicates = [HasNEON, HasFullFP16] in { def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : 
BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", []> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, FPR64Op, FPR64Op, V128, VectorIndexD, asm, ".d", "", "", ".d", []> { bits<1> idx; let Inst{11} = idx{0}; let Inst{21} = 0; } } multiclass SIMDIndexedHS opc, string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (i32 FPR32Op:$Rd), (OpNode FPR32Op:$Rn, (i32 (vector_extract (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } multiclass SIMDVectorIndexedHS opc, string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } multiclass SIMDVectorIndexedHSTied opc, 
string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } multiclass SIMDIndexedLongSD opc, string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V128, V64, V128_lo, VectorIndexH, asm, ".4s", ".4s", ".4h", ".h", [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V128, V64, V128, VectorIndexS, asm, ".2d", ".2d", ".2s", ".s", [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, FPR32Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR64Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", []> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, SDPatternOperator Accum> { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V128, V64, V128_lo, VectorIndexH, asm, ".4s", ".4s", ".4h", ".h", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))))]> { 
bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an // intermediate EXTRACT_SUBREG would be untyped. def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), (i32 (vector_extract (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))), (i64 0))))), (EXTRACT_SUBREG (!cast(NAME # v4i16_indexed) (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx), ssub)>; def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn), (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V128, V64, V128, VectorIndexS, asm, ".2d", ".2d", ".2s", ".s", [(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn), (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, FPR32Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR64Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (i64 FPR64Op:$dst), (Accum (i64 FPR64Op:$Rd), (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32Op:$Rn), (i32 (vector_extract (v4i32 V128:$Rm), VectorIndexS:$idx))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } multiclass SIMDVectorIndexedLongSD opc, string asm, SDPatternOperator OpNode> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V128, V64, V128_lo, VectorIndexH, asm, ".4s", ".4s", ".4h", ".h", [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V128, V64, V128, VectorIndexS, asm, ".2d", ".2d", ".2s", ".s", [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), 
(OpNode (extract_high_v4i32 V128:$Rn), (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } } multiclass SIMDVectorIndexedLongSDTied opc, string asm, SDPatternOperator OpNode> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V128, V64, V128_lo, VectorIndexH, asm, ".4s", ".4s", ".4h", ".h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (extract_high_v8i16 V128:$Rn), (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V128, V64, V128, VectorIndexS, asm, ".2d", ".2d", ".2s", ".s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (extract_high_v4i32 V128:$Rn), (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } } //---------------------------------------------------------------------------- // AdvSIMD scalar shift by immediate //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDScalarShift opc, bits<7> fixed_imm, RegisterClass regtype1, RegisterClass regtype2, Operand immtype, string asm, list pattern> : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), asm, "\t$Rd, $Rn, $imm", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<7> imm; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-23} = 0b111110; let Inst{22-16} = fixed_imm; let Inst{15-11} = opc; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, RegisterClass regtype1, RegisterClass regtype2, Operand immtype, string asm, list pattern> : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<7> imm; let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-23} = 0b111110; let Inst{22-16} = fixed_imm; let Inst{15-11} = opc; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDFPScalarRShift opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def h : BaseSIMDScalarShift { let Inst{19-16} = imm{3-0}; } } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; } def d : BaseSIMDScalarShift { let Inst{21-16} = imm{5-0}; } } multiclass SIMDScalarRShiftD opc, string asm, SDPatternOperator OpNode> { def d : BaseSIMDScalarShift { let Inst{21-16} = imm{5-0}; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (!cast(NAME # 
"d") FPR64:$Rn, vecshiftR64:$imm)>; } multiclass SIMDScalarRShiftDTied opc, string asm, SDPatternOperator OpNode = null_frag> { def d : BaseSIMDScalarShiftTied { let Inst{21-16} = imm{5-0}; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; } multiclass SIMDScalarLShiftD opc, string asm, SDPatternOperator OpNode> { def d : BaseSIMDScalarShift { let Inst{21-16} = imm{5-0}; } } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in multiclass SIMDScalarLShiftDTied opc, string asm> { def d : BaseSIMDScalarShiftTied { let Inst{21-16} = imm{5-0}; } } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in multiclass SIMDScalarRShiftBHS opc, string asm, SDPatternOperator OpNode = null_frag> { def b : BaseSIMDScalarShift { let Inst{18-16} = imm{2-0}; } def h : BaseSIMDScalarShift { let Inst{19-16} = imm{3-0}; } def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; } } multiclass SIMDScalarLShiftBHSD opc, string asm, SDPatternOperator OpNode> { def b : BaseSIMDScalarShift { let Inst{18-16} = imm{2-0}; } def h : BaseSIMDScalarShift { let Inst{19-16} = imm{3-0}; } def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; } def d : BaseSIMDScalarShift { let Inst{21-16} = imm{5-0}; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; } multiclass SIMDScalarRShiftBHSD opc, string asm> { def b : BaseSIMDScalarShift { let Inst{18-16} = imm{2-0}; } def h : BaseSIMDScalarShift { let Inst{19-16} = imm{3-0}; } def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; } def d : BaseSIMDScalarShift { let Inst{21-16} = imm{5-0}; } } //---------------------------------------------------------------------------- // AdvSIMD vector x indexed element //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDVectorShift opc, bits<7> fixed_imm, RegisterOperand dst_reg, RegisterOperand src_reg, Operand immtype, string asm, string dst_kind, string src_kind, list pattern> : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-23} = 0b011110; let Inst{22-16} = fixed_imm; let Inst{15-11} = opc; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, RegisterOperand vectype1, RegisterOperand vectype2, Operand immtype, string asm, string dst_kind, string src_kind, list pattern> : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-23} = 0b011110; let Inst{22-16} = fixed_imm; let Inst{15-11} = opc; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDVectorRShiftSD opc, string asm, Intrinsic OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, V64, V64, vecshiftR16, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : 
BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftR16, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftR32, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, V128, V128, vecshiftR64, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { bits<6> imm; let Inst{21-16} = imm; } } multiclass SIMDVectorRShiftToFP opc, string asm, Intrinsic OpNode> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, V64, V64, vecshiftR16, asm, ".4h", ".4h", [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftR16, asm, ".8h", ".8h", [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftR32, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, V128, V128, vecshiftR64, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { bits<6> imm; let Inst{21-16} = imm; } } multiclass SIMDVectorRShiftNarrowBHS opc, string asm, SDPatternOperator OpNode> { def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, V64, V128, vecshiftR16Narrow, asm, ".8b", ".8h", [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { bits<3> imm; let Inst{18-16} = imm; } def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, V128, V128, vecshiftR16Narrow, asm#"2", ".16b", ".8h", []> { bits<3> imm; let Inst{18-16} = imm; let hasSideEffects = 0; } def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, V64, V128, vecshiftR32Narrow, asm, ".4h", ".4s", [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftR32Narrow, asm#"2", ".8h", ".4s", []> { bits<4> imm; let Inst{19-16} = imm; let hasSideEffects = 0; } def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V128, vecshiftR64Narrow, asm, ".2s", ".2d", [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftR64Narrow, asm#"2", ".4s", ".2d", []> { bits<5> imm; let Inst{20-16} = imm; let hasSideEffects = 0; } // TableGen doesn't like patters w/ INSERT_SUBREG on the 
instructions // themselves, so put them here instead. // Patterns involving what's effectively an insert high and a normal // intrinsic, represented by CONCAT_VECTORS. def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm)), (!cast(NAME # "v16i8_shift") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, vecshiftR16Narrow:$imm)>; def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm)), (!cast(NAME # "v8i16_shift") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, vecshiftR32Narrow:$imm)>; def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm)), (!cast(NAME # "v4i32_shift") (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, vecshiftR64Narrow:$imm)>; } multiclass SIMDVectorLShiftBHSD opc, string asm, SDPatternOperator OpNode> { def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, V64, V64, vecshiftL8, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (i32 vecshiftL8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, V128, V128, vecshiftL8, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (i32 vecshiftL8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, V64, V64, vecshiftL16, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 vecshiftL16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftL16, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 vecshiftL16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftL32, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 vecshiftL32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftL32, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 vecshiftL32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, V128, V128, vecshiftL64, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 vecshiftL64:$imm)))]> { bits<6> imm; let Inst{21-16} = imm; } } multiclass SIMDVectorRShiftBHSD opc, string asm, SDPatternOperator OpNode> { def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, V64, V64, vecshiftR8, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (i32 vecshiftR8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, V128, V128, vecshiftR8, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (i32 vecshiftR8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, V64, V64, vecshiftR16, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 vecshiftR16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftR16, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 vecshiftR16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode 
(v2i32 V64:$Rn), (i32 vecshiftR32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftR32, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 vecshiftR32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, V128, V128, vecshiftR64, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 vecshiftR64:$imm)))]> { bits<6> imm; let Inst{21-16} = imm; } } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDVectorRShiftBHSDTied opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, V64, V64, vecshiftR8, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (i32 vecshiftR8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, V128, V128, vecshiftR8, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (i32 vecshiftR8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, V64, V64, vecshiftR16, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (i32 vecshiftR16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftR16, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (i32 vecshiftR16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (i32 vecshiftR32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftR32, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (i32 vecshiftR32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, V128, V128, vecshiftR64, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), (i32 vecshiftR64:$imm)))]> { bits<6> imm; let Inst{21-16} = imm; } } multiclass SIMDVectorLShiftBHSDTied opc, string asm, SDPatternOperator OpNode = null_frag> { def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, V64, V64, vecshiftL8, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (i32 vecshiftL8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, V128, V128, vecshiftL8, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (i32 vecshiftL8:$imm)))]> { bits<3> imm; let Inst{18-16} = imm; } def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, V64, V64, vecshiftL16, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (i32 vecshiftL16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftL16, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (i32 vecshiftL16:$imm)))]> { bits<4> imm; let Inst{19-16} = imm; } def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, 
{0,1,?,?,?,?,?}, V64, V64, vecshiftL32, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (i32 vecshiftL32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftL32, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (i32 vecshiftL32:$imm)))]> { bits<5> imm; let Inst{20-16} = imm; } def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, V128, V128, vecshiftL64, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), (i32 vecshiftL64:$imm)))]> { bits<6> imm; let Inst{21-16} = imm; } } multiclass SIMDVectorLShiftLongBHSD opc, string asm, SDPatternOperator OpNode> { def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, V128, V64, vecshiftL8, asm, ".8h", ".8b", [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { bits<3> imm; let Inst{18-16} = imm; } def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, V128, V128, vecshiftL8, asm#"2", ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { bits<3> imm; let Inst{18-16} = imm; } def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, V128, V64, vecshiftL16, asm, ".4s", ".4h", [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { bits<4> imm; let Inst{19-16} = imm; } def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, V128, V128, vecshiftL16, asm#"2", ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { bits<4> imm; let Inst{19-16} = imm; } def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V128, V64, vecshiftL32, asm, ".2d", ".2s", [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { bits<5> imm; let Inst{20-16} = imm; } def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, V128, V128, vecshiftL32, asm#"2", ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { bits<5> imm; let Inst{20-16} = imm; } } //--- // Vector load/store //--- // SIMD ldX/stX no-index memory references don't allow the optional // ", #0" constant and handle post-indexing explicitly, so we use // a more specialized parse method for them. Otherwise, it's the same as // the general GPR64sp handling. class BaseSIMDLdSt opcode, bits<2> size, string asm, dag oops, dag iops, list pattern> : I { bits<5> Vt; bits<5> Rn; let Inst{31} = 0; let Inst{30} = Q; let Inst{29-23} = 0b0011000; let Inst{22} = L; let Inst{21-16} = 0b000000; let Inst{15-12} = opcode; let Inst{11-10} = size; let Inst{9-5} = Rn; let Inst{4-0} = Vt; } class BaseSIMDLdStPost opcode, bits<2> size, string asm, dag oops, dag iops> : I { bits<5> Vt; bits<5> Rn; bits<5> Xm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29-23} = 0b0011001; let Inst{22} = L; let Inst{21} = 0; let Inst{20-16} = Xm; let Inst{15-12} = opcode; let Inst{11-10} = size; let Inst{9-5} = Rn; let Inst{4-0} = Vt; } // The immediate form of AdvSIMD post-indexed addressing is encoded with // register post-index addressing from the zero register. multiclass SIMDLdStAliases { // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" // "ld1\t$Vt, [$Rn], #16" // may get mapped to // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) def : InstAlias(BaseName # Count # "v" # layout # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # layout):$Vt, XZR), 1>; // E.g. 
"ld1.8b { v0, v1 }, [x1], #16" // "ld1.8b\t$Vt, [$Rn], #16" // may get mapped to // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) def : InstAlias(BaseName # Count # "v" # layout # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # Size):$Vt, XZR), 0>; // E.g. "ld1.8b { v0, v1 }, [x1]" // "ld1\t$Vt, [$Rn]" // may get mapped to // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) def : InstAlias(BaseName # Count # "v" # layout) !cast("VecList" # Count # Size):$Vt, GPR64sp:$Rn), 0>; // E.g. "ld1.8b { v0, v1 }, [x1], x2" // "ld1\t$Vt, [$Rn], $Xm" // may get mapped to // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) def : InstAlias(BaseName # Count # "v" # layout # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # Size):$Vt, !cast("GPR64pi" # Offset):$Xm), 0>; } multiclass BaseSIMDLdN opcode> { let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, (outs !cast(veclist # "16b"):$Vt), (ins GPR64sp:$Rn), []>; def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, (outs !cast(veclist # "8h"):$Vt), (ins GPR64sp:$Rn), []>; def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, (outs !cast(veclist # "4s"):$Vt), (ins GPR64sp:$Rn), []>; def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, (outs !cast(veclist # "2d"):$Vt), (ins GPR64sp:$Rn), []>; def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, (outs !cast(veclist # "8b"):$Vt), (ins GPR64sp:$Rn), []>; def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, (outs !cast(veclist # "4h"):$Vt), (ins GPR64sp:$Rn), []>; def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, (outs !cast(veclist # "2s"):$Vt), (ins GPR64sp:$Rn), []>; def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, (outs GPR64sp:$wback, !cast(veclist # "16b"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, (outs GPR64sp:$wback, !cast(veclist # "8h"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, (outs GPR64sp:$wback, !cast(veclist # "4s"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, (outs GPR64sp:$wback, !cast(veclist # "2d"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, (outs GPR64sp:$wback, !cast(veclist # "8b"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, (outs GPR64sp:$wback, !cast(veclist # "4h"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, (outs GPR64sp:$wback, !cast(veclist # "2s"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; } defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; } // Only ld1/st1 has a v1d version. 
multiclass BaseSIMDStN opcode> { let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), (ins !cast(veclist # "16b"):$Vt, GPR64sp:$Rn), []>; def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), (ins !cast(veclist # "8h"):$Vt, GPR64sp:$Rn), []>; def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), (ins !cast(veclist # "4s"):$Vt, GPR64sp:$Rn), []>; def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), (ins !cast(veclist # "2d"):$Vt, GPR64sp:$Rn), []>; def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), (ins !cast(veclist # "8b"):$Vt, GPR64sp:$Rn), []>; def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), (ins !cast(veclist # "4h"):$Vt, GPR64sp:$Rn), []>; def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), (ins !cast(veclist # "2s"):$Vt, GPR64sp:$Rn), []>; def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "16b"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "8h"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "4s"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "2d"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset128):$Xm)>; def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "8b"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "4h"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "2s"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; } defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; defm : SIMDLdStAliases; } multiclass BaseSIMDLd1 opcode> : BaseSIMDLdN { // LD1 instructions have extra "1d" variants. let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, (outs !cast(veclist # "1d"):$Vt), (ins GPR64sp:$Rn), []>; def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, (outs GPR64sp:$wback, !cast(veclist # "1d"):$Vt), (ins GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; } defm : SIMDLdStAliases; } multiclass BaseSIMDSt1 opcode> : BaseSIMDStN { // ST1 instructions have extra "1d" variants. 
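// As with the loads, the immediate post-indexed form is really the XZR
// register post-index encoding, e.g. (illustrative):
//    "st1 { v0.1d }, [x0], #8"   // one 64-bit element, so the offset is #8
// is handled as a register post-index from the zero register.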
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), (ins !cast(veclist # "1d"):$Vt, GPR64sp:$Rn), []>; def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, (outs GPR64sp:$wback), (ins !cast(veclist # "1d"):$Vt, GPR64sp:$Rn, !cast("GPR64pi" # Offset64):$Xm)>; } defm : SIMDLdStAliases; } multiclass SIMDLd1Multiple { defm One : BaseSIMDLd1; defm Two : BaseSIMDLd1; defm Three : BaseSIMDLd1; defm Four : BaseSIMDLd1; } multiclass SIMDSt1Multiple { defm One : BaseSIMDSt1; defm Two : BaseSIMDSt1; defm Three : BaseSIMDSt1; defm Four : BaseSIMDSt1; } multiclass SIMDLd2Multiple { defm Two : BaseSIMDLdN; } multiclass SIMDSt2Multiple { defm Two : BaseSIMDStN; } multiclass SIMDLd3Multiple { defm Three : BaseSIMDLdN; } multiclass SIMDSt3Multiple { defm Three : BaseSIMDStN; } multiclass SIMDLd4Multiple { defm Four : BaseSIMDLdN; } multiclass SIMDSt4Multiple { defm Four : BaseSIMDStN; } //--- // AdvSIMD Load/store single-element //--- class BaseSIMDLdStSingle opcode, string asm, string operands, string cst, dag oops, dag iops, list pattern> : I { bits<5> Vt; bits<5> Rn; let Inst{31} = 0; let Inst{29-24} = 0b001101; let Inst{22} = L; let Inst{21} = R; let Inst{15-13} = opcode; let Inst{9-5} = Rn; let Inst{4-0} = Vt; } class BaseSIMDLdStSingleTied opcode, string asm, string operands, string cst, dag oops, dag iops, list pattern> : I { bits<5> Vt; bits<5> Rn; let Inst{31} = 0; let Inst{29-24} = 0b001101; let Inst{22} = L; let Inst{21} = R; let Inst{15-13} = opcode; let Inst{9-5} = Rn; let Inst{4-0} = Vt; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, DAGOperand listtype> : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", (outs listtype:$Vt), (ins GPR64sp:$Rn), []> { let Inst{30} = Q; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = S; let Inst{11-10} = size; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in class BaseSIMDLdRPost opcode, bit S, bits<2> size, string asm, DAGOperand listtype, DAGOperand GPR64pi> : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", (outs GPR64sp:$wback, listtype:$Vt), (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { bits<5> Xm; let Inst{30} = Q; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = S; let Inst{11-10} = size; } multiclass SIMDLdrAliases { // E.g. "ld1r { v0.8b }, [x1], #1" // "ld1r.8b\t$Vt, [$Rn], #1" // may get mapped to // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) def : InstAlias(BaseName # "v" # layout # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # layout):$Vt, XZR), 1>; // E.g. "ld1r.8b { v0 }, [x1], #1" // "ld1r.8b\t$Vt, [$Rn], #1" // may get mapped to // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) def : InstAlias(BaseName # "v" # layout # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # Size):$Vt, XZR), 0>; // E.g. "ld1r.8b { v0 }, [x1]" // "ld1r.8b\t$Vt, [$Rn]" // may get mapped to // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) def : InstAlias(BaseName # "v" # layout) !cast("VecList" # Count # Size):$Vt, GPR64sp:$Rn), 0>; // E.g. 
"ld1r.8b { v0 }, [x1], x2" // "ld1r.8b\t$Vt, [$Rn], $Xm" // may get mapped to // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) def : InstAlias(BaseName # "v" # layout # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # Size):$Vt, !cast("GPR64pi" # Offset):$Xm), 0>; } multiclass SIMDLdR opcode, bit S, string asm, string Count, int Offset1, int Offset2, int Offset4, int Offset8> { def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, !cast("VecList" # Count # "8b")>; def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, !cast("VecList" # Count #"16b")>; def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, !cast("VecList" # Count #"4h")>; def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, !cast("VecList" # Count #"8h")>; def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, !cast("VecList" # Count #"2s")>; def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, !cast("VecList" # Count #"4s")>; def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, !cast("VecList" # Count #"1d")>; def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, !cast("VecList" # Count #"2d")>; def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, !cast("VecList" # Count # "8b"), !cast("GPR64pi" # Offset1)>; def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, !cast("VecList" # Count # "16b"), !cast("GPR64pi" # Offset1)>; def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, !cast("VecList" # Count # "4h"), !cast("GPR64pi" # Offset2)>; def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, !cast("VecList" # Count # "8h"), !cast("GPR64pi" # Offset2)>; def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, !cast("VecList" # Count # "2s"), !cast("GPR64pi" # Offset4)>; def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, !cast("VecList" # Count # "4s"), !cast("GPR64pi" # Offset4)>; def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, !cast("VecList" # Count # "1d"), !cast("GPR64pi" # Offset8)>; def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, !cast("VecList" # Count # "2d"), !cast("GPR64pi" # Offset8)>; defm : SIMDLdrAliases; defm : SIMDLdrAliases; defm : SIMDLdrAliases; defm : SIMDLdrAliases; defm : SIMDLdrAliases; defm : SIMDLdrAliases; defm : SIMDLdrAliases; defm : SIMDLdrAliases; } class SIMDLdStSingleB opcode, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingle { // idx encoded in Q:S:size fields. bits<4> idx; let Inst{30} = idx{3}; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = idx{2}; let Inst{11-10} = idx{1-0}; } class SIMDLdStSingleBTied opcode, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingleTied { // idx encoded in Q:S:size fields. bits<4> idx; let Inst{30} = idx{3}; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = idx{2}; let Inst{11-10} = idx{1-0}; } class SIMDLdStSingleBPost opcode, string asm, dag oops, dag iops> : BaseSIMDLdStSingle { // idx encoded in Q:S:size fields. bits<4> idx; bits<5> Xm; let Inst{30} = idx{3}; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = idx{2}; let Inst{11-10} = idx{1-0}; } class SIMDLdStSingleBTiedPost opcode, string asm, dag oops, dag iops> : BaseSIMDLdStSingleTied { // idx encoded in Q:S:size fields. bits<4> idx; bits<5> Xm; let Inst{30} = idx{3}; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = idx{2}; let Inst{11-10} = idx{1-0}; } class SIMDLdStSingleH opcode, bit size, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingle { // idx encoded in Q:S:size<1> fields. 
bits<3> idx; let Inst{30} = idx{2}; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = idx{1}; let Inst{11} = idx{0}; let Inst{10} = size; } class SIMDLdStSingleHTied opcode, bit size, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingleTied { // idx encoded in Q:S:size<1> fields. bits<3> idx; let Inst{30} = idx{2}; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = idx{1}; let Inst{11} = idx{0}; let Inst{10} = size; } class SIMDLdStSingleHPost opcode, bit size, string asm, dag oops, dag iops> : BaseSIMDLdStSingle { // idx encoded in Q:S:size<1> fields. bits<3> idx; bits<5> Xm; let Inst{30} = idx{2}; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = idx{1}; let Inst{11} = idx{0}; let Inst{10} = size; } class SIMDLdStSingleHTiedPost opcode, bit size, string asm, dag oops, dag iops> : BaseSIMDLdStSingleTied { // idx encoded in Q:S:size<1> fields. bits<3> idx; bits<5> Xm; let Inst{30} = idx{2}; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = idx{1}; let Inst{11} = idx{0}; let Inst{10} = size; } class SIMDLdStSingleS opcode, bits<2> size, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingle { // idx encoded in Q:S fields. bits<2> idx; let Inst{30} = idx{1}; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = idx{0}; let Inst{11-10} = size; } class SIMDLdStSingleSTied opcode, bits<2> size, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingleTied { // idx encoded in Q:S fields. bits<2> idx; let Inst{30} = idx{1}; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = idx{0}; let Inst{11-10} = size; } class SIMDLdStSingleSPost opcode, bits<2> size, string asm, dag oops, dag iops> : BaseSIMDLdStSingle { // idx encoded in Q:S fields. bits<2> idx; bits<5> Xm; let Inst{30} = idx{1}; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = idx{0}; let Inst{11-10} = size; } class SIMDLdStSingleSTiedPost opcode, bits<2> size, string asm, dag oops, dag iops> : BaseSIMDLdStSingleTied { // idx encoded in Q:S fields. bits<2> idx; bits<5> Xm; let Inst{30} = idx{1}; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = idx{0}; let Inst{11-10} = size; } class SIMDLdStSingleD opcode, bits<2> size, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingle { // idx encoded in Q field. bits<1> idx; let Inst{30} = idx; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = 0; let Inst{11-10} = size; } class SIMDLdStSingleDTied opcode, bits<2> size, string asm, dag oops, dag iops, list pattern> : BaseSIMDLdStSingleTied { // idx encoded in Q field. bits<1> idx; let Inst{30} = idx; let Inst{23} = 0; let Inst{20-16} = 0b00000; let Inst{12} = 0; let Inst{11-10} = size; } class SIMDLdStSingleDPost opcode, bits<2> size, string asm, dag oops, dag iops> : BaseSIMDLdStSingle { // idx encoded in Q field. bits<1> idx; bits<5> Xm; let Inst{30} = idx; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = 0; let Inst{11-10} = size; } class SIMDLdStSingleDTiedPost opcode, bits<2> size, string asm, dag oops, dag iops> : BaseSIMDLdStSingleTied { // idx encoded in Q field. 
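// A .d vector only has lanes 0 and 1, so the single index bit sits in Q
// (Inst{30}); S (Inst{12}) is hard-wired to 0 and Inst{11-10} carry the
// 'size' class parameter.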
bits<1> idx; bits<5> Xm; let Inst{30} = idx; let Inst{23} = 1; let Inst{20-16} = Xm; let Inst{12} = 0; let Inst{11-10} = size; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in multiclass SIMDLdSingleBTied opcode, string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, (outs listtype:$dst), (ins listtype:$Vt, VectorIndexB:$idx, GPR64sp:$Rn), []>; def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, (outs GPR64sp:$wback, listtype:$dst), (ins listtype:$Vt, VectorIndexB:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in multiclass SIMDLdSingleHTied opcode, bit size, string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, (outs listtype:$dst), (ins listtype:$Vt, VectorIndexH:$idx, GPR64sp:$Rn), []>; def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, (outs GPR64sp:$wback, listtype:$dst), (ins listtype:$Vt, VectorIndexH:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, (outs listtype:$dst), (ins listtype:$Vt, VectorIndexS:$idx, GPR64sp:$Rn), []>; def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, (outs GPR64sp:$wback, listtype:$dst), (ins listtype:$Vt, VectorIndexS:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, (outs listtype:$dst), (ins listtype:$Vt, VectorIndexD:$idx, GPR64sp:$Rn), []>; def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, (outs GPR64sp:$wback, listtype:$dst), (ins listtype:$Vt, VectorIndexD:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in multiclass SIMDStSingleB opcode, string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i8 : SIMDLdStSingleB<0, R, opcode, asm, (outs), (ins listtype:$Vt, VectorIndexB:$idx, GPR64sp:$Rn), []>; def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm, (outs GPR64sp:$wback), (ins listtype:$Vt, VectorIndexB:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in multiclass SIMDStSingleH opcode, bit size, string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, (outs), (ins listtype:$Vt, VectorIndexH:$idx, GPR64sp:$Rn), []>; def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, (outs GPR64sp:$wback), (ins listtype:$Vt, VectorIndexH:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in multiclass SIMDStSingleS opcode, bits<2> size,string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, (outs), (ins listtype:$Vt, VectorIndexS:$idx, GPR64sp:$Rn), []>; def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, (outs GPR64sp:$wback), (ins listtype:$Vt, VectorIndexS:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in multiclass SIMDStSingleD opcode, bits<2> size, string asm, RegisterOperand listtype, RegisterOperand GPR64pi> { def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, (outs), (ins listtype:$Vt, VectorIndexD:$idx, GPR64sp:$Rn), []>; def i64_POST : 
SIMDLdStSingleDPost<0, R, opcode, size, asm, (outs GPR64sp:$wback), (ins listtype:$Vt, VectorIndexD:$idx, GPR64sp:$Rn, GPR64pi:$Xm)>; } multiclass SIMDLdStSingleAliases { // E.g. "ld1 { v0.8b }[0], [x1], #1" // "ld1\t$Vt, [$Rn], #1" // may get mapped to // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) def : InstAlias(NAME # Type # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # layout):$Vt, idxtype:$idx, XZR), 1>; // E.g. "ld1.8b { v0 }[0], [x1], #1" // "ld1.8b\t$Vt, [$Rn], #1" // may get mapped to // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) def : InstAlias(NAME # Type # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # "128"):$Vt, idxtype:$idx, XZR), 0>; // E.g. "ld1.8b { v0 }[0], [x1]" // "ld1.8b\t$Vt, [$Rn]" // may get mapped to // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) def : InstAlias(NAME # Type) !cast("VecList" # Count # "128"):$Vt, idxtype:$idx, GPR64sp:$Rn), 0>; // E.g. "ld1.8b { v0 }[0], [x1], x2" // "ld1.8b\t$Vt, [$Rn], $Xm" // may get mapped to // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) def : InstAlias(NAME # Type # "_POST") GPR64sp:$Rn, !cast("VecList" # Count # "128"):$Vt, idxtype:$idx, !cast("GPR64pi" # Offset):$Xm), 0>; } multiclass SIMDLdSt1SingleAliases { defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; } multiclass SIMDLdSt2SingleAliases { defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; } multiclass SIMDLdSt3SingleAliases { defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; } multiclass SIMDLdSt4SingleAliases { defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; defm "" : SIMDLdStSingleAliases; } } // end of 'let Predicates = [HasNEON]' //---------------------------------------------------------------------------- // AdvSIMD v8.1 Rounding Double Multiply Add/Subtract //---------------------------------------------------------------------------- let Predicates = [HasNEON, HasRDM] in { class BaseSIMDThreeSameVectorTiedR0 size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : BaseSIMDThreeSameVectorTied { } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS opc, string asm, SDPatternOperator Accum> { def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (Accum (v4i16 V64:$Rd), (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>; def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (Accum (v8i16 V128:$Rd), (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm)))))]>; def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (Accum (v2i32 V64:$Rd), (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>; def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm)))))]>; } multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, SDPatternOperator Accum> { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$dst), (Accum (v4i16 V64:$Rd), (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), (v4i16 (AArch64duplane16 (v8i16 
V128_lo:$Rm), VectorIndexH:$idx))))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8i16 V128:$dst), (Accum (v8i16 V128:$Rd), (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$dst), (Accum (v2i32 V64:$Rd), (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } // FIXME: it would be nice to use the scalar (v1i32) instruction here, but // an intermediate EXTRACT_SUBREG would be untyped. // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), (i32 (vector_extract (v4i32 (insert_subvector (undef), (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx)))), (i32 0))), (i64 0))))), (EXTRACT_SUBREG (v2i32 (!cast(NAME # v2i32_indexed) (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), FPR32Op:$Rd, ssub)), V64:$Rn, V128:$Rm, VectorIndexS:$idx)), ssub)>; def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } // FIXME: it would be nice to use the scalar (v1i32) instruction here, but // an intermediate EXTRACT_SUBREG would be untyped. 
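// As with the v2i32 case above, the pattern below therefore moves the i32
// accumulator into the low 32 bits (ssub) of an IMPLICIT_DEF vector with
// INSERT_SUBREG, uses the vector v4i32_indexed instruction, and recovers the
// scalar result with EXTRACT_SUBREG.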
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), (i32 (vector_extract (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx)))), (i64 0))))), (EXTRACT_SUBREG (v4i32 (!cast(NAME # v4i32_indexed) (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32Op:$Rd, ssub)), V128:$Rn, V128:$Rm, VectorIndexS:$idx)), ssub)>; def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", []> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0}; } def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (i32 FPR32Op:$dst), (Accum (i32 FPR32Op:$Rd), (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32Op:$Rn), (i32 (vector_extract (v4i32 V128:$Rm), VectorIndexS:$idx))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } } // let Predicates = [HasNeon, HasRDM] //---------------------------------------------------------------------------- // ARMv8.3 Complex ADD/MLA instructions //---------------------------------------------------------------------------- class ComplexRotationOperand : AsmOperandClass { let PredicateMethod = "isComplexRotation<" # Angle # ", " # Remainder # ">"; let DiagnosticType = "InvalidComplexRotation" # Type; let Name = "ComplexRotation" # Type; } def complexrotateop : Operand { let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">; let PrintMethod = "printComplexRotationOp<90, 0>"; } def complexrotateopodd : Operand { let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; let PrintMethod = "printComplexRotationOp<180, 90>"; } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDThreeSameVectorComplex size, bits<3> opcode, RegisterOperand regtype, Operand rottype, string asm, string kind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, rottype:$rot), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot" "|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<1> rot; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0; let Inst{20-16} = Rm; let Inst{15-13} = opcode; // Non-tied version (FCADD) only has one rotation bit let Inst{12} = rot; let Inst{11} = 0; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDThreeSameVectorComplexHSD opcode, Operand rottype, string asm, SDPatternOperator OpNode>{ let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm), (rottype i32:$rot)))]>; def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm), (rottype i32:$rot)))]>; } let Predicates = [HasV8_3a, HasNEON] in { def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm), (rottype i32:$rot)))]>; def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm), (rottype i32:$rot)))]>; def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype, asm, ".2d", [(set 
(v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm), (rottype i32:$rot)))]>; } } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDThreeSameVectorTiedComplex size, bits<3> opcode, RegisterOperand regtype, Operand rottype, string asm, string kind, list pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm, rottype:$rot), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot" "|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<2> rot; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0; let Inst{20-16} = Rm; let Inst{15-13} = opcode; let Inst{12-11} = rot; let Inst{10} = 1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } multiclass SIMDThreeSameVectorTiedComplexHSD opcode, Operand rottype, string asm, SDPatternOperator OpNode> { let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64, rottype, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm), (rottype i32:$rot)))]>; def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128, rottype, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm), (rottype i32:$rot)))]>; } let Predicates = [HasV8_3a, HasNEON] in { def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64, rottype, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm), (rottype i32:$rot)))]>; def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, V128, rottype, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm), (rottype i32:$rot)))]>; def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128, rottype, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm), (rottype i32:$rot)))]>; } } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDIndexedTiedComplex size, bit opc1, bit opc2, RegisterOperand dst_reg, RegisterOperand lhs_reg, RegisterOperand rhs_reg, Operand vec_idx, Operand rottype, string asm, string apple_kind, string dst_kind, string lhs_kind, string rhs_kind, list pattern> : I<(outs dst_reg:$dst), (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx, rottype:$rot), asm, "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx, $rot" # "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx, $rot}", "$Rd = $dst", pattern>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; bits<2> rot; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28} = Scalar; let Inst{27-24} = 0b1111; let Inst{23-22} = size; // Bit 21 must be set by the derived class. let Inst{20-16} = Rm; let Inst{15} = opc1; let Inst{14-13} = rot; let Inst{12} = opc2; // Bit 11 must be set by the derived class. let Inst{10} = 0; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } // The complex instructions index by pairs of elements, so the VectorIndexes // don't match the lane types, and the index bits are different to the other // classes. 
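// For example, the .4h variant below covers only two complex (pair) elements
// per 64-bit register, so its index is a single bit and is represented with
// VectorIndexD, whereas an ordinary .h by-element form uses the three-bit
// VectorIndexH.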
multiclass SIMDIndexedTiedComplexHSD { let Predicates = [HasV8_3a,HasNEON,HasFullFP16] in { def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64, V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h", ".4h", ".h", []> { bits<1> idx; let Inst{11} = 0; let Inst{21} = idx{0}; } def v8f16_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b01, opc1, opc2, V128, V128, V128, VectorIndexS, rottype, asm, ".8h", ".8h", ".8h", ".h", []> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } } // Predicates = [HasV8_3a,HasNEON,HasFullFP16] let Predicates = [HasV8_3a,HasNEON] in { def v4f32_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b10, opc1, opc2, V128, V128, V128, VectorIndexD, rottype, asm, ".4s", ".4s", ".4s", ".s", []> { bits<1> idx; let Inst{11} = idx{0}; let Inst{21} = 0; } } // Predicates = [HasV8_3a,HasNEON] } //---------------------------------------------------------------------------- // Crypto extensions //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class AESBase opc, string asm, dag outs, dag ins, string cstr, list pat> : I, Sched<[WriteV]>{ bits<5> Rd; bits<5> Rn; let Inst{31-16} = 0b0100111000101000; let Inst{15-12} = opc; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class AESInst opc, string asm, Intrinsic OpNode> : AESBase; class AESTiedInst opc, string asm, Intrinsic OpNode> : AESBase; let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, dag oops, dag iops, list pat> : I, Sched<[WriteV]>{ bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31-21} = 0b01011110000; let Inst{20-16} = Rm; let Inst{15} = 0; let Inst{14-12} = opc; let Inst{11-10} = 0b00; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class SHATiedInstQSV opc, string asm, Intrinsic OpNode> : SHA3OpTiedInst; class SHATiedInstVVV opc, string asm, Intrinsic OpNode> : SHA3OpTiedInst; class SHATiedInstQQV opc, string asm, Intrinsic OpNode> : SHA3OpTiedInst; let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class SHA2OpInst opc, string asm, string kind, string cstr, dag oops, dag iops, list pat> : I, Sched<[WriteV]>{ bits<5> Rd; bits<5> Rn; let Inst{31-16} = 0b0101111000101000; let Inst{15-12} = opc; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; let Inst{4-0} = Rd; } class SHATiedInstVV opc, string asm, Intrinsic OpNode> : SHA2OpInst; class SHAInstSS opc, string asm, Intrinsic OpNode> : SHA2OpInst; // Armv8.2-A Crypto extensions class BaseCryptoV82 pattern> : I , Sched<[WriteV]> { bits<5> Vd; bits<5> Vn; let Inst{31-25} = 0b1100111; let Inst{9-5} = Vn; let Inst{4-0} = Vd; } class CryptoRRTiedop0, bits<2>op1, string asm, string asmops> : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops, "$Vm = $Vd", []> { let Inst{31-25} = 0b1100111; let Inst{24-21} = 0b0110; let Inst{20-15} = 0b000001; let Inst{14} = op0; let Inst{13-12} = 0b00; let Inst{11-10} = op1; } class CryptoRRTied_2Dop0, bits<2>op1, string asm> : CryptoRRTied; class CryptoRRTied_4Sop0, bits<2>op1, string asm> : CryptoRRTied; class CryptoRRR op0, bits<2>op1, dag oops, dag iops, string asm, string asmops, string cst> : BaseCryptoV82 { bits<5> Vm; let Inst{24-21} = 0b0011; let Inst{20-16} = Vm; let Inst{15} = 0b1; let Inst{14} = op0; let Inst{13-12} = 0b00; let Inst{11-10} = op1; } class CryptoRRR_2D op0, bits<2>op1, string asm> : CryptoRRR; class CryptoRRRTied_2D op0, bits<2>op1, string asm> : CryptoRRR; class CryptoRRR_4S op0, bits<2>op1, string asm> : CryptoRRR; class 
CryptoRRRTied_4S op0, bits<2>op1, string asm> : CryptoRRR; class CryptoRRRTied op0, bits<2>op1, string asm> : CryptoRRR; class CryptoRRRRop0, string asm, string asmops> : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm, asmops, "", []> { bits<5> Vm; bits<5> Va; let Inst{24-23} = 0b00; let Inst{22-21} = op0; let Inst{20-16} = Vm; let Inst{15} = 0b0; let Inst{14-10} = Va; } class CryptoRRRR_16Bop0, string asm> : CryptoRRRR { } class CryptoRRRR_4Sop0, string asm> : CryptoRRRR { } class CryptoRRRi6 : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm, "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> { bits<6> imm; bits<5> Vm; let Inst{24-21} = 0b0100; let Inst{20-16} = Vm; let Inst{15-10} = imm; let Inst{9-5} = Vn; let Inst{4-0} = Vd; } class CryptoRRRi2Tiedop0, bits<2>op1, string asm> : BaseCryptoV82<(outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm), asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = $Vdst", []> { bits<2> imm; bits<5> Vm; let Inst{24-21} = 0b0010; let Inst{20-16} = Vm; let Inst{15} = 0b1; let Inst{14} = op0; let Inst{13-12} = imm; let Inst{11-10} = op1; } //---------------------------------------------------------------------------- // v8.1 atomic instructions extension: // * CAS // * CASP // * SWP // * LDOPregister, and aliases STOPregister // Instruction encodings: // // 31 30|29 24|23|22|21|20 16|15|14 10|9 5|4 0 // CAS SZ |001000|1 |A |1 |Rs |R |11111 |Rn |Rt // CASP 0|SZ|001000|0 |A |1 |Rs |R |11111 |Rn |Rt // SWP SZ |111000|A |R |1 |Rs |1 |OPC|00|Rn |Rt // LD SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |Rt // ST SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |11111 // Instruction syntax: // // CAS{}[] , , [] // CAS{} , , [] // CASP{} , , , , [] // CASP{} , , , , [] // SWP{}[] , , [] // SWP{} , , [] // LD{}[] , , [] // LD{} , , [] // ST{}[] , [] // ST{} , [] let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in class BaseCASEncoding pattern> : I { bits<2> Sz; bit NP; bit Acq; bit Rel; bits<5> Rs; bits<5> Rn; bits<5> Rt; let Inst{31-30} = Sz; let Inst{29-24} = 0b001000; let Inst{23} = NP; let Inst{22} = Acq; let Inst{21} = 0b1; let Inst{20-16} = Rs; let Inst{15} = Rel; let Inst{14-10} = 0b11111; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let Predicates = [HasLSE]; } class BaseCAS : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "cas" # order # size, "\t$Rs, $Rt, [$Rn]", "$out = $Rs",[]>, Sched<[WriteAtomic]> { let NP = 1; } multiclass CompareAndSwap Acq, bits<1> Rel, string order> { let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseCAS; let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseCAS; let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseCAS; let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseCAS; } class BaseCASP : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "casp" # order # size, "\t$Rs, $Rt, [$Rn]", "$out = $Rs",[]>, Sched<[WriteAtomic]> { let NP = 0; } multiclass CompareAndSwapPair Acq, bits<1> Rel, string order> { let Sz = 0b00, Acq = Acq, Rel = Rel in def W : BaseCASP; let Sz = 0b01, Acq = Acq, Rel = Rel in def X : BaseCASP; } let Predicates = [HasLSE] in class BaseSWP : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size, "\t$Rs, $Rt, [$Rn]","",[]>, Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; bits<5> Rs; bits<3> opc = 0b000; bits<5> Rn; bits<5> Rt; let Inst{31-30} = Sz; let Inst{29-24} = 0b111000; let Inst{23} = Acq; let Inst{22} = Rel; let Inst{21} = 0b1; let Inst{20-16} = Rs; let Inst{15} = 0b1; let Inst{14-12} = opc; let Inst{11-10} = 
0b00; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let Predicates = [HasLSE]; } multiclass Swap Acq, bits<1> Rel, string order> { let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseSWP; let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseSWP; let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseSWP; let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseSWP; } let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in class BaseLDOPregister : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size, "\t$Rs, $Rt, [$Rn]","",[]>, Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; bits<5> Rs; bits<3> opc; bits<5> Rn; bits<5> Rt; let Inst{31-30} = Sz; let Inst{29-24} = 0b111000; let Inst{23} = Acq; let Inst{22} = Rel; let Inst{21} = 0b1; let Inst{20-16} = Rs; let Inst{15} = 0b0; let Inst{14-12} = opc; let Inst{11-10} = 0b00; let Inst{9-5} = Rn; let Inst{4-0} = Rt; let Predicates = [HasLSE]; } multiclass LDOPregister opc, string op, bits<1> Acq, bits<1> Rel, string order> { let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in def B : BaseLDOPregister; let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in def H : BaseLDOPregister; let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in def W : BaseLDOPregister; let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in def X : BaseLDOPregister; } // Differing SrcRHS and DstRHS allow you to cover CLR & SUB by giving a more // complex DAG for DstRHS. let Predicates = [HasLSE] in multiclass LDOPregister_patterns_ord_dag { def : Pat<(!cast(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS), (!cast(inst # suffix) DstRHS, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS), (!cast(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS), (!cast(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS), (!cast(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS), (!cast(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>; } multiclass LDOPregister_patterns_ord { defm : LDOPregister_patterns_ord_dag; } multiclass LDOPregister_patterns_ord_mod { defm : LDOPregister_patterns_ord_dag; } multiclass LDOPregister_patterns { defm : LDOPregister_patterns_ord; defm : LDOPregister_patterns_ord; defm : LDOPregister_patterns_ord; defm : LDOPregister_patterns_ord; } multiclass LDOPregister_patterns_mod { defm : LDOPregister_patterns_ord_mod(mod#Xrr) XZR, GPR64:$Rm))>; defm : LDOPregister_patterns_ord_mod(mod#Wrr) WZR, GPR32:$Rm))>; defm : LDOPregister_patterns_ord_mod(mod#Wrr) WZR, GPR32:$Rm))>; defm : LDOPregister_patterns_ord_mod(mod#Wrr) WZR, GPR32:$Rm))>; } let Predicates = [HasLSE] in multiclass CASregister_patterns_ord_dag { def : Pat<(!cast(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW), (!cast(inst # suffix) OLD, NEW, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW), (!cast(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW), (!cast(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW), (!cast(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>; def : Pat<(!cast(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW), (!cast(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>; } multiclass CASregister_patterns_ord { defm : CASregister_patterns_ord_dag; } multiclass CASregister_patterns { defm : CASregister_patterns_ord; defm : CASregister_patterns_ord; defm : 
CASregister_patterns_ord; defm : CASregister_patterns_ord; } let Predicates = [HasLSE] in class BaseSTOPregister : InstAlias; multiclass STOPregister { def : BaseSTOPregister(instr # "LB")>; def : BaseSTOPregister(instr # "LH")>; def : BaseSTOPregister(instr # "LW")>; def : BaseSTOPregister(instr # "LX")>; def : BaseSTOPregister(instr # "B")>; def : BaseSTOPregister(instr # "H")>; def : BaseSTOPregister(instr # "W")>; def : BaseSTOPregister(instr # "X")>; } //---------------------------------------------------------------------------- // Allow the size specifier tokens to be upper case, not just lower. def : TokenAlias<".4B", ".4b">; // Add dot product def : TokenAlias<".8B", ".8b">; def : TokenAlias<".4H", ".4h">; def : TokenAlias<".2S", ".2s">; def : TokenAlias<".1D", ".1d">; def : TokenAlias<".16B", ".16b">; def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; def : TokenAlias<".2H", ".2h">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; def : TokenAlias<".D", ".d">; def : TokenAlias<".Q", ".q">; Index: vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCISelLowering.cpp (revision 337298) +++ vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCISelLowering.cpp (revision 337299) @@ -1,14174 +1,14190 @@ //===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the PPCISelLowering class. 
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include
#include
#include
#include
#include
#include
#include

using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
cl::desc("enable quad precision float support on ppc"), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. bool isPPC64 = Subtarget.isPPC64(); setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. addRegisterClass(MVT::i32, &PPC::GPRCRegClass); if (!useSoftFloat()) { if (hasSPE()) { addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass); addRegisterClass(MVT::f64, &PPC::SPERCRegClass); } else { addRegisterClass(MVT::f32, &PPC::F4RCRegClass); addRegisterClass(MVT::f64, &PPC::F8RCRegClass); } } // Match BITREVERSE to customized fast code sequence in the td file. setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); } setTruncStoreAction(MVT::f64, MVT::f32, Expand); // PowerPC has pre-inc load and store's. setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); if (!Subtarget.hasSPE()) { setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); } // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { setOperationAction(ISD::ADDC, VT, Legal); setOperationAction(ISD::ADDE, VT, Legal); setOperationAction(ISD::SUBC, VT, Legal); setOperationAction(ISD::SUBE, VT, Legal); } if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (isPPC64 || Subtarget.hasFPCVT()) { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); } else { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); } // PowerPC does not support direct load/store of condition registers. 
setOperationAction(ISD::LOAD, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); // FIXME: Remove this once the ANDI glue bug is fixed: if (ANDIGlueBug) setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); setTruncStoreAction(VT, MVT::i1, Expand); } addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); } // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom); // We do not currently implement these libm ops for PowerPC. setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); setOperationAction(ISD::FREM, MVT::ppcf128, Expand); // PowerPC has no SREM/UREM instructions unless we are on P9 // On P9 we may use a hardware instruction to compute the remainder. // The instructions are not legalized directly because in the cases where the // result of both the remainder and the division is required it is more // efficient to compute the remainder from the result of the division rather // than use the remainder instruction. if (Subtarget.isISA3_0()) { setOperationAction(ISD::SREM, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Custom); setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); } else { setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); } if (Subtarget.hasP9Vector()) { setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v16i8, Legal); } // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. 
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); // We don't support sin/cos/sqrt/fmod/pow setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); if (Subtarget.hasSPE()) { setOperationAction(ISD::FMA , MVT::f64, Expand); setOperationAction(ISD::FMA , MVT::f32, Expand); } else { setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FMA , MVT::f32, Legal); } setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); } else { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); } if (Subtarget.hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); } // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd // to speed up scalar BSWAP64. 
// CTPOP or CTTZ were introduced in P8/P9 respectively setOperationAction(ISD::BSWAP, MVT::i32 , Expand); if (Subtarget.isISA3_0()) { setOperationAction(ISD::BSWAP, MVT::i64 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) { setOperationAction(ISD::CTPOP, MVT::i32 , Legal); setOperationAction(ISD::CTPOP, MVT::i64 , Legal); } else { setOperationAction(ISD::CTPOP, MVT::i32 , Expand); setOperationAction(ISD::CTPOP, MVT::i64 , Expand); } // PowerPC does not have ROTR setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); if (!Subtarget.useCRBits()) { // PowerPC does not have Select setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f64, Expand); } // PowerPC wants to turn select_cc of FP into fsel when possible. setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); // PowerPC wants to optimize integer setcc a bit if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); // PowerPC does not have BRCOND which requires SetCC if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); if (Subtarget.hasSPE()) { // SPE has built-in conversions setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); } else { // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); // PowerPC does not have [U|S]INT_TO_FP setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); } if (Subtarget.hasDirectMove() && isPPC64) { setOperationAction(ISD::BITCAST, MVT::f32, Legal); setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); setOperationAction(ISD::BITCAST, MVT::i64, Expand); setOperationAction(ISD::BITCAST, MVT::f64, Expand); } // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // We want to legalize GlobalAddress and ConstantPool nodes into the // appropriate instructions to materialize the address. 
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::JumpTable, MVT::i64, Custom); // TRAP is legal. setOperationAction(ISD::TRAP, MVT::Other, Legal); // TRAMPOLINE is custom lowered. setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); if (Subtarget.isSVR4ABI()) { if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); setOperationAction(ISD::VAARG, MVT::i8, Promote); AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); setOperationAction(ISD::VAARG, MVT::i16, Promote); AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); setOperationAction(ISD::VAARG, MVT::i32, Promote); AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); setOperationAction(ISD::VAARG, MVT::Other, Expand); } else { // VAARG is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::i64, Custom); } } else setOperationAction(ISD::VAARG, MVT::Other, Expand); if (Subtarget.isSVR4ABI() && !isPPC64) // VACOPY is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VACOPY , MVT::Other, Custom); else setOperationAction(ISD::VACOPY , MVT::Other, Expand); // Use the default implementation. setOperationAction(ISD::VAEND , MVT::Other, Expand); setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); // To handle counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); // Comparisons that require checking two conditions. 
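// For example, SETUEQ (unordered or equal) cannot be tested with a single
// condition-register bit; it has to be expanded into an unordered check
// combined with an equality check.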
if (Subtarget.hasSPE()) { setCondCodeAction(ISD::SETO, MVT::f32, Expand); setCondCodeAction(ISD::SETO, MVT::f64, Expand); setCondCodeAction(ISD::SETUO, MVT::f32, Expand); setCondCodeAction(ISD::SETUO, MVT::f64, Expand); } setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); // This is just the low 32 bits of a (signed) fp->i64 conversion. // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. if (Subtarget.hasSPE()) setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); else setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); } // With the instructions enabled under FPCVT, we can do everything. if (Subtarget.hasFPCVT()) { if (Subtarget.has64BitSupport()) { setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); } if (Subtarget.use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // 64-bit PowerPC wants to expand i128 shifts itself. setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); } else { // 32-bit PowerPC wants to expand i64 shifts itself. setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. 
setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { setOperationAction(ISD::CTPOP, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); } else { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); } // Vector instructions introduced in P9 if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128)) setOperationAction(ISD::CTTZ, VT, Legal); else setOperationAction(ISD::CTTZ, VT, Expand); // We promote all shuffles to v16i8. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); // We promote all non-typed operations to v4i32. setOperationAction(ISD::AND , VT, Promote); AddPromotedToType (ISD::AND , VT, MVT::v4i32); setOperationAction(ISD::OR , VT, Promote); AddPromotedToType (ISD::OR , VT, MVT::v4i32); setOperationAction(ISD::XOR , VT, Promote); AddPromotedToType (ISD::XOR , VT, MVT::v4i32); setOperationAction(ISD::LOAD , VT, Promote); AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); setOperationAction(ISD::SELECT_CC, VT, Promote); AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); setOperationAction(ISD::STORE, VT, Promote); AddPromotedToType (ISD::STORE, VT, MVT::v4i32); // No other operations are legal. setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle // with merges, splats, etc. 
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); setOperationAction(ISD::AND , MVT::v4i32, Legal); setOperationAction(ISD::OR , MVT::v4i32, Legal); setOperationAction(ISD::XOR , MVT::v4i32, Legal); setOperationAction(ISD::LOAD , MVT::v4i32, Legal); setOperationAction(ISD::SELECT, MVT::v4i32, Subtarget.useCRBits() ? Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } if (Subtarget.hasP8Altivec()) setOperationAction(ISD::MUL, MVT::v4i32, Legal); else setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); if (Subtarget.hasP8Vector()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); } if (Subtarget.hasDirectMove() && isPPC64) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); setOperationAction(ISD::MUL, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::STORE, MVT::v2f64, Legal); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); if (Subtarget.hasP8Vector()) addRegisterClass(MVT::f32, &PPC::VSSRCRegClass); addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); if (Subtarget.hasP8Altivec()) { setOperationAction(ISD::SHL, MVT::v2i64, Legal); setOperationAction(ISD::SRA, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v2i64, Legal); // 128 bit shifts can be accomplished via 3 instructions for SHL and // SRL, but not for SRA because of the instructions available: // VS{RL} and VS{RL}O. However due to direct move costs, it's not worth // doing setOperationAction(ISD::SHL, MVT::v1i128, Expand); setOperationAction(ISD::SRL, MVT::v1i128, Expand); setOperationAction(ISD::SRA, MVT::v1i128, Expand); setOperationAction(ISD::SETCC, MVT::v2i64, Legal); } else { setOperationAction(ISD::SHL, MVT::v2i64, Expand); setOperationAction(ISD::SRA, MVT::v2i64, Expand); setOperationAction(ISD::SRL, MVT::v2i64, Expand); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); // VSX v2i64 only supports non-arithmetic operations. setOperationAction(ISD::ADD, MVT::v2i64, Expand); setOperationAction(ISD::SUB, MVT::v2i64, Expand); } setOperationAction(ISD::LOAD, MVT::v2i64, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::STORE, MVT::v2i64, Promote); AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); // Vector operation legalization checks the result type of // SIGN_EXTEND_INREG, overall legalization checks the inner type. 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::FNEG, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Legal); setOperationAction(ISD::FABS, MVT::v4f32, Legal); setOperationAction(ISD::FABS, MVT::v2f64, Legal); if (Subtarget.hasDirectMove()) setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); } if (Subtarget.hasP8Altivec()) { addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass); addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass); } if (Subtarget.hasP9Vector()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // 128 bit shifts can be accomplished via 3 instructions for SHL and // SRL, but not for SRA because of the instructions available: // VS{RL} and VS{RL}O. setOperationAction(ISD::SHL, MVT::v1i128, Legal); setOperationAction(ISD::SRL, MVT::v1i128, Legal); setOperationAction(ISD::SRA, MVT::v1i128, Expand); if (EnableQuadPrecision) { addRegisterClass(MVT::f128, &PPC::VRRCRegClass); setOperationAction(ISD::FADD, MVT::f128, Legal); setOperationAction(ISD::FSUB, MVT::f128, Legal); setOperationAction(ISD::FDIV, MVT::f128, Legal); setOperationAction(ISD::FMUL, MVT::f128, Legal); setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal); // No extending loads to f128 on PPC. for (MVT FPT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand); setOperationAction(ISD::FMA, MVT::f128, Legal); setCondCodeAction(ISD::SETULT, MVT::f128, Expand); setCondCodeAction(ISD::SETUGT, MVT::f128, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand); setCondCodeAction(ISD::SETOGE, MVT::f128, Expand); setCondCodeAction(ISD::SETOLE, MVT::f128, Expand); setCondCodeAction(ISD::SETONE, MVT::f128, Expand); setOperationAction(ISD::FTRUNC, MVT::f128, Legal); setOperationAction(ISD::FRINT, MVT::f128, Legal); setOperationAction(ISD::FFLOOR, MVT::f128, Legal); setOperationAction(ISD::FCEIL, MVT::f128, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal); setOperationAction(ISD::FROUND, MVT::f128, Legal); setOperationAction(ISD::SELECT, MVT::f128, Expand); setOperationAction(ISD::FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::FP_ROUND, MVT::f32, Legal); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i128, Custom); // No implementation for these ops for PowerPC. 
setOperationAction(ISD::FSIN , MVT::f128, Expand); setOperationAction(ISD::FCOS , MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FPOWI, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); } } if (Subtarget.hasP9Altivec()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); } } if (Subtarget.hasQPX()) { setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FREM, MVT::v4f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); setOperationAction(ISD::LOAD , MVT::v4f64, Custom); setOperationAction(ISD::STORE , MVT::v4f64, Custom); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4f64, Expand); setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::FNEG , MVT::v4f64, Legal); setOperationAction(ISD::FABS , MVT::v4f64, Legal); setOperationAction(ISD::FSIN , MVT::v4f64, Expand); setOperationAction(ISD::FCOS , MVT::v4f64, Expand); setOperationAction(ISD::FPOW , MVT::v4f64, Expand); setOperationAction(ISD::FLOG , MVT::v4f64, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); setOperationAction(ISD::FEXP , MVT::v4f64, Expand); setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FREM, MVT::v4f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); setOperationAction(ISD::LOAD , MVT::v4f32, Custom); setOperationAction(ISD::STORE , MVT::v4f32, Custom); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4f32, Expand); setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, 
Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); setOperationAction(ISD::FNEG , MVT::v4f32, Legal); setOperationAction(ISD::FABS , MVT::v4f32, Legal); setOperationAction(ISD::FSIN , MVT::v4f32, Expand); setOperationAction(ISD::FCOS , MVT::v4f32, Expand); setOperationAction(ISD::FPOW , MVT::v4f32, Expand); setOperationAction(ISD::FLOG , MVT::v4f32, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); setOperationAction(ISD::FEXP , MVT::v4f32, Expand); setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); setOperationAction(ISD::AND , MVT::v4i1, Legal); setOperationAction(ISD::OR , MVT::v4i1, Legal); setOperationAction(ISD::XOR , MVT::v4i1, Legal); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4i1, Expand); setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); setOperationAction(ISD::LOAD , MVT::v4i1, Custom); setOperationAction(ISD::STORE , MVT::v4i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FROUND, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); // These need to set FE_INEXACT, and so cannot be vectorized here. setOperationAction(ISD::FRINT, MVT::v4f64, Expand); setOperationAction(ISD::FRINT, MVT::v4f32, Expand); if (TM.Options.UnsafeFPMath) { setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } else { setOperationAction(ISD::FDIV, MVT::v4f64, Expand); setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); setOperationAction(ISD::FDIV, MVT::v4f32, Expand); setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); } } if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); if (!isPPC64) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); } setBooleanContents(ZeroOrOneBooleanContent); if (Subtarget.hasAltivec()) { // Altivec instructions set fields to all zeros or all ones. 
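// For example, vcmpequw writes 0xFFFFFFFF into each word element that
// compares equal and 0x00000000 into the others, so vector booleans are
// all-ones when true.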
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); } if (!isPPC64) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::SELECT_CC); } // Use reciprocal estimates. if (TM.Options.UnsafeFPMath) { setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::FSQRT); } // Darwin long double math library functions have $LDBL128 appended. if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128"); setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128"); setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128"); setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128"); setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128"); setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128"); setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128"); } if (EnableQuadPrecision) { setLibcallName(RTLIB::LOG_F128, "logf128"); setLibcallName(RTLIB::LOG2_F128, "log2f128"); setLibcallName(RTLIB::LOG10_F128, "log10f128"); setLibcallName(RTLIB::EXP_F128, "expf128"); setLibcallName(RTLIB::EXP2_F128, "exp2f128"); setLibcallName(RTLIB::SIN_F128, "sinf128"); setLibcallName(RTLIB::COS_F128, "cosf128"); setLibcallName(RTLIB::POW_F128, "powf128"); setLibcallName(RTLIB::FMIN_F128, "fminf128"); setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); setLibcallName(RTLIB::POWI_F128, "__powikf2"); setLibcallName(RTLIB::REM_F128, "fmodf128"); } // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. if (Subtarget.useCRBits()) { setHasMultipleConditionRegisters(); setJumpIsExpensive(); } setMinFunctionAlignment(2); if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_A2: case PPC::DIR_E500: case PPC::DIR_E500mc: case PPC::DIR_E5500: case PPC::DIR_PWR4: case PPC::DIR_PWR5: case PPC::DIR_PWR5X: case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: setPrefFunctionAlignment(4); setPrefLoopAlignment(4); break; } if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); else setSchedulingPreference(Sched::Hybrid); computeRegisterProperties(STI.getRegisterInfo()); // The Freescale cores do better with aggressive inlining of memcpy and // friends. GCC uses same threshold of 128 bytes (= 32 word stores). 
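// In other words, with 4-byte word stores, MaxStoresPerMemcpy = 32 allows
// inline expansion of copies up to 32 * 4 = 128 bytes.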
  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
    MaxStoresPerMemset = 32;
    MaxStoresPerMemsetOptSize = 16;
    MaxStoresPerMemcpy = 32;
    MaxStoresPerMemcpyOptSize = 8;
    MaxStoresPerMemmove = 32;
    MaxStoresPerMemmoveOptSize = 8;
  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
    // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
    // over one hundred cycles.
    MaxStoresPerMemset = 128;
    MaxStoresPerMemcpy = 128;
    MaxStoresPerMemmove = 128;
    MaxLoadsPerMemcmp = 128;
  } else {
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
                             unsigned MaxMaxAlign) {
  if (MaxAlign == MaxMaxAlign)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
      MaxAlign = 32;
    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == MaxMaxAlign)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  // Darwin passes everything on 4 byte boundary.
  if (Subtarget.isDarwin())
    return 4;

  // 16-byte and wider vectors are passed on a 16-byte boundary.
  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ?
32 : 16); return Align; } unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv:: ID CC, EVT VT) const { if (Subtarget.hasSPE() && VT == MVT::f64) return 2; return PPCTargetLowering::getNumRegisters(Context, VT); } MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv:: ID CC, EVT VT) const { if (Subtarget.hasSPE() && VT == MVT::f64) return MVT::i32; return PPCTargetLowering::getRegisterType(Context, VT); } bool PPCTargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } bool PPCTargetLowering::hasSPE() const { return Subtarget.hasSPE(); } const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; case PPCISD::FSEL: return "PPCISD::FSEL"; case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; case PPCISD::FP_TO_UINT_IN_VSR: return "PPCISD::FP_TO_UINT_IN_VSR,"; case PPCISD::FP_TO_SINT_IN_VSR: return "PPCISD::FP_TO_SINT_IN_VSR"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; case PPCISD::SHL: return "PPCISD::SHL"; case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; case PPCISD::CALL: return "PPCISD::CALL"; case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; case PPCISD::BCTRL: return "PPCISD::BCTRL"; case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; case PPCISD::MFVSR: return "PPCISD::MFVSR"; case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; case PPCISD::VCMP: return "PPCISD::VCMP"; case PPCISD::VCMPo: return "PPCISD::VCMPo"; case PPCISD::LBRX: return "PPCISD::LBRX"; case 
PPCISD::STBRX: return "PPCISD::STBRX"; case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; case PPCISD::STXSIX: return "PPCISD::STXSIX"; case PPCISD::VEXTS: return "PPCISD::VEXTS"; case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::ST_VSR_SCAL_INT: return "PPCISD::ST_VSR_SCAL_INT"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; case PPCISD::BDNZ: return "PPCISD::BDNZ"; case PPCISD::BDZ: return "PPCISD::BDZ"; case PPCISD::MFFS: return "PPCISD::MFFS"; case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; case PPCISD::CR6SET: return "PPCISD::CR6SET"; case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; case PPCISD::SC: return "PPCISD::SC"; case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; } return nullptr; } EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; if (Subtarget.hasQPX()) return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); return VT.changeVectorElementTypeToInteger(); } bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const { assert(VT.isFloatingPoint() && "Non-floating-point FMA?"); return true; } //===----------------------------------------------------------------------===// // Node matching predicates, for use by the tblgen matching code. //===----------------------------------------------------------------------===// /// isFloatingPointZero - Return true if this is 0.0 or -0.0. static bool isFloatingPointZero(SDValue Op) { if (ConstantFPSDNode *CFP = dyn_cast(Op)) return CFP->getValueAPF().isZero(); else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { // Maybe this has already been legalized into the constant pool? 
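// If so, the load's base pointer (operand 1) is a ConstantPoolSDNode whose
// constant is the original ConstantFP, which we can test for zero directly.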
if (ConstantPoolSDNode *CP = dyn_cast(Op.getOperand(1))) if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isZero(); } return false; } /// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return /// true if Op is undef or if it matches the specified value. static bool isConstantOrUndef(int Op, int Val) { return Op < 0 || Op == Val; } /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 1; for (unsigned i = 0; i != 8; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) return false; } return true; } /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 2; for (unsigned i = 0; i != 8; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) return false; } return true; } /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the /// current subtarget. /// /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 
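/// For example, with ShuffleKind 0 (big-endian, two inputs) the non-undef
/// mask elements must be <4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31>,
/// i.e. the low-order word of each doubleword of the two inputs.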
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { const PPCSubtarget& Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasP8Vector()) return false; bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 4; for (unsigned i = 0; i != 8; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) return false; } return true; } /// isVMerge - Common function, used to match vmrg* shuffles. /// static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned LHSStart, unsigned RHSStart) { if (N->getValueType(0) != MVT::v16i8) return false; assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && "Unsupported merge size!"); for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j), LHSStart+j+i*UnitSize) || !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j), RHSStart+j+i*UnitSize)) return false; } return true; } /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). /// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 0, 16); else return false; } else { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 8, 24); else return false; } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). /// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). 
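/// For example, a big-endian vmrghw-style merge of two different inputs
/// (ShuffleKind 0, UnitSize 4) expects the mask
/// <0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23>.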
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 8, 24); else return false; } else { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 0, 16); else return false; } } /** * Common function used to match vmrgew and vmrgow shuffles * * The indexOffset determines whether to look for even or odd words in * the shuffle mask. This is based on the of the endianness of the target * machine. * - Little Endian: * - Use offset of 0 to check for odd elements * - Use offset of 4 to check for even elements * - Big Endian: * - Use offset of 0 to check for even elements * - Use offset of 4 to check for odd elements * A detailed description of the vector element ordering for little endian and * big endian can be found at * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html * Targeting your applications - what little endian and big endian IBM XL C/C++ * compiler differences mean to you * * The mask to the shuffle vector instruction specifies the indices of the * elements from the two input vectors to place in the result. The elements are * numbered in array-access order, starting with the first vector. These vectors * are always of type v16i8, thus each vector will contain 16 elements of size * 8. More info on the shuffle vector can be found in the * http://llvm.org/docs/LangRef.html#shufflevector-instruction * Language Reference. * * The RHSStartValue indicates whether the same input vectors are used (unary) * or two different input vectors are used, based on the following: * - If the instruction uses the same vector for both inputs, the range of the * indices will be 0 to 15. In this case, the RHSStart value passed should * be 0. * - If the instruction has two different vectors then the range of the * indices will be 0 to 31. In this case, the RHSStart value passed should * be 16 (indices 0-15 specify elements in the first vector while indices 16 * to 31 specify elements in the second vector). * * \param[in] N The shuffle vector SD Node to analyze * \param[in] IndexOffset Specifies whether to look for even or odd elements * \param[in] RHSStartValue Specifies the starting index for the righthand input * vector to the shuffle_vector instruction * \return true iff this shuffle vector represents an even or odd word merge */ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, unsigned RHSStartValue) { if (N->getValueType(0) != MVT::v16i8) return false; for (unsigned i = 0; i < 2; ++i) for (unsigned j = 0; j < 4; ++j) if (!isConstantOrUndef(N->getMaskElt(i*4+j), i*RHSStartValue+j+IndexOffset) || !isConstantOrUndef(N->getMaskElt(i*4+j+8), i*RHSStartValue+j+IndexOffset+8)) return false; return true; } /** * Determine if the specified shuffle mask is suitable for the vmrgew or * vmrgow instructions. * * \param[in] N The shuffle vector SD Node to analyze * \param[in] CheckEven Check for an even merge (true) or an odd merge (false) * \param[in] ShuffleKind Identify the type of merge: * - 0 = big-endian merge with two different inputs; * - 1 = either-endian merge with two identical inputs; * - 2 = little-endian merge with two different inputs (inputs are swapped for * little-endian merges). 
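 * For example, on a big-endian target an even-word merge of two different
 * inputs (CheckEven, ShuffleKind 0) expects the mask
 * <0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27>.
 *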
* \param[in] DAG The current SelectionDAG * \return true iff this shuffle mask */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { unsigned indexOffset = CheckEven ? 4 : 0; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); else if (ShuffleKind == 2) // swapped return isVMerge(N, indexOffset, 16); else return false; } else { unsigned indexOffset = CheckEven ? 0 : 4; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); else if (ShuffleKind == 0) // Normal return isVMerge(N, indexOffset, 16); else return false; } return false; } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. /// The ShuffleKind distinguishes between big-endian operations with two /// different inputs (0), either-endian operations with two identical inputs /// (1), and little-endian operations with two different inputs (2). For the /// latter, the input operands are swapped (see PPCInstrAltivec.td). int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; ShuffleVectorSDNode *SVOp = cast(N); // Find the first non-undef value in the shuffle mask. unsigned i; for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) /*search*/; if (i == 16) return -1; // all undef. // Otherwise, check to see if the rest of the elements are consecutively // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; ShiftAmt -= i; bool isLE = DAG.getDataLayout().isLittleEndian(); if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; } else if (ShuffleKind == 1) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) return -1; } else return -1; if (isLE) ShiftAmt = 16 - ShiftAmt; return ShiftAmt; } /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to /// VSPLTB/VSPLTH/VSPLTW. bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { assert(N->getValueType(0) == MVT::v16i8 && (EltSize == 1 || EltSize == 2 || EltSize == 4)); // The consecutive indices need to specify an element, not part of two // different elements. So abandon ship early if this isn't the case. if (N->getMaskElt(0) % EltSize != 0) return false; // This is a splat operation if each element of the permute is the same, and // if the value doesn't reference the second vector. unsigned ElementBase = N->getMaskElt(0); // FIXME: Handle UNDEF elements too! if (ElementBase >= 16) return false; // Check that the indices are consecutive, in the case of a multi-byte element // splatted with a v16i8 mask. for (unsigned i = 1; i != EltSize; ++i) if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) return false; for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { if (N->getMaskElt(i) < 0) continue; for (unsigned j = 0; j != EltSize; ++j) if (N->getMaskElt(i+j) != N->getMaskElt(j)) return false; } return true; } /// Check that the mask is shuffling N byte elements. 
Within each N byte /// element of the mask, the indices could be either in increasing or /// decreasing order as long as they are consecutive. /// \param[in] N the shuffle vector SD Node to analyze /// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ /// Word/DoubleWord/QuadWord). /// \param[in] StepLen the delta indices number among the N byte element, if /// the mask is in increasing/decreasing order then it is 1/-1. /// \return true iff the mask is shuffling N byte elements. static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, int StepLen) { assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && "Unexpected element width."); assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); unsigned NumOfElem = 16 / Width; unsigned MaskVal[16]; // Width is never greater than 16 for (unsigned i = 0; i < NumOfElem; ++i) { MaskVal[0] = N->getMaskElt(i * Width); if ((StepLen == 1) && (MaskVal[0] % Width)) { return false; } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { return false; } for (unsigned int j = 1; j < Width; ++j) { MaskVal[j] = N->getMaskElt(i * Width + j); if (MaskVal[j] != MaskVal[j-1] + StepLen) { return false; } } } return true; } bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE) { if (!isNByteElemShuffleMask(N, 4, 1)) return false; // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; unsigned M2 = N->getMaskElt(8) / 4; unsigned M3 = N->getMaskElt(12) / 4; unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; // Below, let H and L be arbitrary elements of the shuffle mask // where H is in the range [4,7] and L is in the range [0,3]. // H, 1, 2, 3 or L, 5, 6, 7 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; InsertAtByte = IsLE ? 12 : 0; Swap = M0 < 4; return true; } // 0, H, 2, 3 or 4, L, 6, 7 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; InsertAtByte = IsLE ? 8 : 4; Swap = M1 < 4; return true; } // 0, 1, H, 3 or 4, 5, L, 7 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; InsertAtByte = IsLE ? 4 : 8; Swap = M2 < 4; return true; } // 0, 1, 2, H or 4, 5, 6, L if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; InsertAtByte = IsLE ? 0 : 12; Swap = M3 < 4; return true; } // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { ShiftElts = 0; Swap = true; unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { InsertAtByte = IsLE ? 12 : 0; return true; } if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { InsertAtByte = IsLE ? 8 : 4; return true; } if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { InsertAtByte = IsLE ? 4 : 8; return true; } if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { InsertAtByte = IsLE ? 
0 : 12; return true; } } return false; } bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the word is consecutive. if (!isNByteElemShuffleMask(N, 4, 1)) return false; // Now we look at mask elements 0,4,8,12, which are the beginning of words. unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; unsigned M2 = N->getMaskElt(8) / 4; unsigned M3 = N->getMaskElt(12) / 4; // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { assert(M0 < 4 && "Indexing into an undef vector?"); if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) return false; ShiftElts = IsLE ? (4 - M0) % 4 : M0; Swap = false; return true; } // Ensure each word index of the ShuffleVector Mask is consecutive. if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) return false; if (IsLE) { if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { // Input vectors don't need to be swapped if the leading element // of the result is one of the 3 left elements of the second vector // (or if there is no shift to be done at all). Swap = false; ShiftElts = (8 - M0) % 8; } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { // Input vectors need to be swapped if the leading element // of the result is one of the 3 left elements of the first vector // (or if we're shifting by 4 - thereby simply swapping the vectors). Swap = true; ShiftElts = (4 - M0) % 4; } return true; } else { // BE if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { // Input vectors don't need to be swapped if the leading element // of the result is one of the 4 elements of the first vector. Swap = false; ShiftElts = M0; } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { // Input vectors need to be swapped if the leading element // of the result is one of the 4 elements of the right vector. Swap = true; ShiftElts = M0 - 4; } return true; } } bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); if (!isNByteElemShuffleMask(N, Width, -1)) return false; for (int i = 0; i < 16; i += Width) if (N->getMaskElt(i) != i + Width - 1) return false; return true; } bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 2); } bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 4); } bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 8); } bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 16); } /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap /// if the inputs to the instruction should be swapped and set \p DM to the /// value for the immediate. /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI /// AND element 0 of the result comes from the first input (LE) or second input /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. /// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle /// mask. bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the double word is consecutive. 
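// For example, the big-endian two-input mask
// <0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31> (M0 = 0, M1 = 3) selects
// doubleword 0 of the first input and doubleword 1 of the second, so it
// produces DM = 1 with Swap = false.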
if (!isNByteElemShuffleMask(N, 8, 1)) return false; unsigned M0 = N->getMaskElt(0) / 8; unsigned M1 = N->getMaskElt(8) / 8; assert(((M0 | M1) < 4) && "A mask element out of bounds?"); // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { if ((M0 | M1) < 2) { DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); Swap = false; return true; } else return false; } if (IsLE) { if (M0 > 1 && M1 < 2) { Swap = false; } else if (M0 < 2 && M1 > 1) { M0 = (M0 + 2) % 4; M1 = (M1 + 2) % 4; Swap = true; } else return false; // Note: if control flow comes here that means Swap is already set above DM = (((~M1) & 1) << 1) + ((~M0) & 1); return true; } else { // BE if (M0 < 2 && M1 > 1) { Swap = false; } else if (M0 > 1 && M1 < 2) { M0 = (M0 + 2) % 4; M1 = (M1 + 2) % 4; Swap = true; } else return false; // Note: if control flow comes here that means Swap is already set above DM = (M0 << 1) + (M1 & 1); return true; } } /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; } /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed /// by using a vspltis[bhw] instruction of the specified element size, return /// the constant being splatted. The ByteSize field indicates the number of /// bytes of each element [124] -> [bhw]. SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { SDValue OpVal(nullptr, 0); // If ByteSize of the splat is bigger than the element size of the // build_vector, then we have a case where we are checking for a splat where // multiple elements of the buildvector are folded together into a single // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). unsigned EltSize = 16/N->getNumOperands(); if (EltSize < ByteSize) { unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. SDValue UniquedVals[4]; assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); // See if all of the elements in the buildvector agree across. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).isUndef()) continue; // If the element isn't a constant, bail fully out. if (!isa(N->getOperand(i))) return SDValue(); if (!UniquedVals[i&(Multiple-1)].getNode()) UniquedVals[i&(Multiple-1)] = N->getOperand(i); else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) return SDValue(); // no match. } // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains // either constant or undef values that are identical for each chunk. See // if these chunks can form into a larger vspltis*. // Check to see if all of the leading entries are either 0 or -1. If // neither, then this won't fit into the immediate field. bool LeadingZero = true; bool LeadingOnes = true; for (unsigned i = 0; i != Multiple-1; ++i) { if (!UniquedVals[i].getNode()) continue; // Must have been undefs. LeadingZero &= isNullConstant(UniquedVals[i]); LeadingOnes &= isAllOnesConstant(UniquedVals[i]); } // Finally, check the least significant entry. 
if (LeadingZero) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef int Val = cast(UniquedVals[Multiple-1])->getZExtValue(); if (Val < 16) // 0,0,0,4 -> vspltisw(4) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } if (LeadingOnes) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef int Val =cast(UniquedVals[Multiple-1])->getSExtValue(); if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } return SDValue(); } // Check to see if this buildvec has a single non-undef value in its elements. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).isUndef()) continue; if (!OpVal.getNode()) OpVal = N->getOperand(i); else if (OpVal != N->getOperand(i)) return SDValue(); } if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. unsigned ValSizeInBytes = EltSize; uint64_t Value = 0; if (ConstantSDNode *CN = dyn_cast(OpVal)) { Value = CN->getZExtValue(); } else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) { assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); Value = FloatToBits(CN->getValueAPF().convertToFloat()); } // If the splat value is larger than the element value, then we can never do // this splat. The only case that we could fit the replicated bits into our // immediate field for would be zero, and we prefer to use vxor for it. if (ValSizeInBytes < ByteSize) return SDValue(); // If the element value is larger than the splat value, check if it consists // of a repeated bit pattern of size ByteSize. if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) return SDValue(); // Properly sign extend the value. int MaskVal = SignExtend32(Value, ByteSize * 8); // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. if (MaskVal == 0) return SDValue(); // Finally, if this value fits in a 5 bit sext field, return it if (SignExtend32<5>(MaskVal) == MaskVal) return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); return SDValue(); } /// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift /// amount, otherwise return -1. int PPC::isQVALIGNIShuffleMask(SDNode *N) { EVT VT = N->getValueType(0); if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) return -1; ShuffleVectorSDNode *SVOp = cast(N); // Find the first non-undef value in the shuffle mask. unsigned i; for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) /*search*/; if (i == 4) return -1; // all undef. // Otherwise, check to see if the rest of the elements are consecutively // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; ShiftAmt -= i; // Check the rest of the elements to see if they are consecutive. for (++i; i != 4; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; return ShiftAmt; } //===----------------------------------------------------------------------===// // Addressing Mode Selection //===----------------------------------------------------------------------===// /// isIntS16Immediate - This method tests to see if the node is either a 32-bit /// or 64-bit immediate, and if the value can be accurately represented as a /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. 
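/// For example, 32767 and -32768 are accepted, while 32768 is not (its low
/// 16 bits reinterpret as -32768, which does not equal the original value).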
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { if (!isa(N)) return false; Imm = (int16_t)cast(N)->getZExtValue(); if (N->getValueType(0) == MVT::i32) return Imm == (int32_t)cast(N)->getZExtValue(); else return Imm == (int64_t)cast(N)->getZExtValue(); } bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented with [r+imm]. bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i if (N.getOperand(1).getOpcode() == PPCISD::Lo) return false; // r+i Base = N.getOperand(0); Index = N.getOperand(1); return true; } else if (N.getOpcode() == ISD::OR) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i can fold it if we can. // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably // disjoint. KnownBits LHSKnown, RHSKnown; DAG.computeKnownBits(N.getOperand(0), LHSKnown); if (LHSKnown.Zero.getBoolValue()) { DAG.computeKnownBits(N.getOperand(1), RHSKnown); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { Base = N.getOperand(0); Index = N.getOperand(1); return true; } } } return false; } // If we happen to be doing an i64 load or store into a stack slot that has // less than a 4-byte alignment, then the frame-index elimination may need to // use an indexed load or store instruction (because the offset may not be a // multiple of 4). The extra register needed to hold the offset comes from the // register scavenger, and it is possible that the scavenger will need to use // an emergency spill slot. As a result, we need to make sure that a spill slot // is allocated when doing an i64 load/store into a less-than-4-byte-aligned // stack slot. static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { // FIXME: This does not handle the LWA case. if (VT != MVT::i64) return; // NOTE: We'll exclude negative FIs here, which come from argument // lowering, because there are no known test cases triggering this problem // using packed structures (or similar). We can remove this exclusion if // we find such a test case. The reason why this is so test-case driven is // because this entire 'fixup' is only to prevent crashes (from the // register scavenger) on not-really-valid inputs. For example, if we have: // %a = alloca i1 // %b = bitcast i1* %a to i64* // store i64* a, i64 b // then the store should really be marked as 'align 1', but is not. If it // were marked as 'align 1' then the indexed form would have been // instruction-selected initially, and the problem this 'fixup' is preventing // won't happen regardless. if (FrameIdx < 0) return; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FrameIdx); if (Align >= 4) return; PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasNonRISpills(); } /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better /// represented as reg+reg. 
If \p Alignment is non-zero, only accept /// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. if (SelectAddressRegReg(N, Disp, Base, DAG)) return false; if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). assert(!cast(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. assert(Disp.getOpcode() == ISD::TargetGlobalAddress || Disp.getOpcode() == ISD::TargetGlobalTLSAddress || Disp.getOpcode() == ISD::TargetConstantPool || Disp.getOpcode() == ISD::TargetJumpTable); Base = N.getOperand(0); return true; // [&g+r] } } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. KnownBits LHSKnown; DAG.computeKnownBits(N.getOperand(0), LHSKnown); if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); return true; } } } else if (ConstantSDNode *CN = dyn_cast(N)) { // Loading from a constant address. // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; } // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, MVT::i32); unsigned Opc = CN->getValueType(0) == MVT::i32 ? 
PPC::LIS : PPC::LIS8; Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); return true; } } Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); if (FrameIndexSDNode *FI = dyn_cast(N)) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else Base = N; return true; // [r+0] } /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { // Check to see if we can easily represent this as an [r+r] address. This // will fail if it thinks that the address is more profitably represented as // reg+imm, e.g. where imm = 0. if (SelectAddressRegReg(N, Base, Index, DAG)) return true; // If the address is the result of an add, we will utilize the fact that the // address calculation includes an implicit add. However, we can reduce // register pressure if we do not materialize a constant just for use as the // index register. We only get rid of the add if it is not an add of a // value and a 16-bit signed constant and both have a single use. int16_t imm = 0; if (N.getOpcode() == ISD::ADD && (!isIntS16Immediate(N.getOperand(1), imm) || !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { Base = N.getOperand(0); Index = N.getOperand(1); return true; } // Otherwise, do it the hard way, using R0 as the base register. Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true; } /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { if (DisablePPCPreinc) return false; bool isLoad = true; SDValue Ptr; EVT VT; unsigned Alignment; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); Alignment = LD->getAlignment(); } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); Alignment = ST->getAlignment(); isLoad = false; } else return false; // PowerPC doesn't have preinc load/store instructions for vectors (except // for QPX, which does have preinc r+r forms). if (VT.isVector()) { if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { return false; } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { AM = ISD::PRE_INC; return true; } } if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer // is a frame index, or if N is a store and the base pointer is either // the same as or a predecessor of the value being stored. Check for // those situations here, and try with swapped Base/Offset instead. bool Swap = false; if (isa(Base) || isa(Base)) Swap = true; else if (!isLoad) { SDValue Val = cast(N)->getValue(); if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) Swap = true; } if (Swap) std::swap(Base, Offset); AM = ISD::PRE_INC; return true; } // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. 
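// (ld/std and their update forms use the DS instruction format, which encodes
// a 16-bit displacement whose low two bits must be zero, so only offsets that
// are a multiple of 4 are representable.)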
    if (Alignment < 4)
      return false;

    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
      return false;
  }

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
    // sext i32 to i64 when addr mode is r+i.
    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
        LD->getExtensionType() == ISD::SEXTLOAD &&
        isa<ConstantSDNode>(Offset))
      return false;
  }

  AM = ISD::PRE_INC;
  return true;
}

//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Return true if we should reference labels using a PICBase, set the HiOpFlags
/// and LoOpFlags to the target MO flags.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
                               unsigned &HiOpFlags, unsigned &LoOpFlags,
                               const GlobalValue *GV = nullptr) {
  HiOpFlags = PPCII::MO_HA;
  LoOpFlags = PPCII::MO_LO;

  // Don't use the pic base if not in PIC relocation model.
  if (IsPIC) {
    HiOpFlags |= PPCII::MO_PIC_FLAG;
    LoOpFlags |= PPCII::MO_PIC_FLAG;
  }

  // If this is a reference to a global value that requires a non-lazy-ptr, make
  // sure that instruction lowering adds it.
  if (GV && Subtarget.hasLazyResolverStub(GV)) {
    HiOpFlags |= PPCII::MO_NLP_FLAG;
    LoOpFlags |= PPCII::MO_NLP_FLAG;

    if (GV->hasHiddenVisibility()) {
      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
    }
  }
}

static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                             SelectionDAG &DAG) {
  SDLoc DL(HiPart);
  EVT PtrVT = HiPart.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, PtrVT);

  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);

  // With PIC, the first instruction is actually "GR+hi(&G)".
  if (isPIC)
    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);

  // Generate non-pic code that has direct accesses to the constant pool.
  // The address of the global is just (hi(&g)+lo(&g)).
  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}

static void setUsesTOCBasePtr(MachineFunction &MF) {
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setUsesTOCBasePtr();
}

static void setUsesTOCBasePtr(SelectionDAG &DAG) {
  setUsesTOCBasePtr(DAG.getMachineFunction());
}

static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
                           SDValue GA) {
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
                DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);

  SDValue Ops[] = { GA, Reg };
  return DAG.getMemIntrinsicNode(
      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
      MachineMemOperand::MOLoad);
}

SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  const Constant *C = CP->getConstVal();

  // 64-bit SVR4 ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return getTOCEntry(DAG, SDLoc(CP), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, SDLoc(CP), false, GA); } SDValue CPIHi = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); SDValue CPILo = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG); } // For 64-bit PowerPC, prefer the more compact relative encodings. // This trades 32 bits per jump table entry for one or two instructions // on the jump site. unsigned PPCTargetLowering::getJumpTableEncoding() const { if (isJumpTableRelative()) return MachineJumpTableInfo::EK_LabelDifference32; return TargetLowering::getJumpTableEncoding(); } bool PPCTargetLowering::isJumpTableRelative() const { if (Subtarget.isPPC64()) return true; return TargetLowering::isJumpTableRelative(); } SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget.isPPC64()) return TargetLowering::getPICJumpTableRelocBase(Table, DAG); switch (getTargetMachine().getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBase(Table, DAG); default: return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); } } const MCExpr * PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { if (!Subtarget.isPPC64()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); switch (getTargetMachine().getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); default: return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } } SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast(Op); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return getTOCEntry(DAG, SDLoc(JT), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, SDLoc(GA), false, GA); } SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); BlockAddressSDNode *BASDN = cast(Op); const BlockAddress *BA = BASDN->getBlockAddress(); // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. 
if (Subtarget.isSVR4ABI() && isPositionIndependent()) { if (Subtarget.isPPC64()) setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // FIXME: TLS addresses currently use medium model code sequences, // which is the most useful form. Eventually support for small and // large models could be added if users need it, at the cost of // additional complexity. GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); if (Model == TLSModel::LocalExec) { SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) : DAG.getRegister(PPC::R2, MVT::i32); SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } if (Model == TLSModel::InitialExec) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLS); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); } else GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr); return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); } if (Model == TLSModel::GeneralDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, GOTReg, TGA); } else { if (picLevel == PICLevel::SmallPIC) GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, GOTPtr, TGA, TGA); } if (Model == TLSModel::LocalDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, GOTReg, TGA); } else { if (picLevel == PICLevel::SmallPIC) GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, PtrVT, GOTPtr, TGA, TGA); SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, TLSAddr, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); } llvm_unreachable("Unknown TLS model!"); } SDValue 
PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast(Op); SDLoc DL(GSDN); const GlobalValue *GV = GSDN->getGlobal(); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return getTOCEntry(DAG, DL, true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, DL, false, GA); } SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG); // If the global reference is actually to a non-lazy-pointer, we have to do an // extra load to get the address of the global. if (MOHiFlag & PPCII::MO_NLP_FLAG) Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); return Ptr; } SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); if (Op.getValueType() == MVT::v2i64) { // When the operands themselves are v2i64 values, we need to do something // special because VSX has no underlying comparison operations for these. if (Op.getOperand(0).getValueType() == MVT::v2i64) { // Equality can be handled by casting to the legal type for Altivec // comparisons, everything else needs to be expanded. if (CC == ISD::SETEQ || CC == ISD::SETNE) { return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getSetCC(dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), CC)); } return SDValue(); } // We handle most of these in the usual way. return Op; } // If we're comparing for equality to zero, expose the fact that this is // implemented as a ctlz/srl pair on ppc, so that the dag combiner can // fold the new nodes. if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG)) return V; if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { // Leave comparisons against 0 and -1 alone for now, since they're usually // optimized. FIXME: revisit this when we can custom lower all setcc // optimizations. if (C->isAllOnesValue() || C->isNullValue()) return SDValue(); } // If we have an integer seteq/setne, turn it into a compare against zero // by xor'ing the rhs with the lhs, which is faster than setting a // condition register, reading it back out, and masking the correct bit. The // normal approach here uses sub to do this instead of xor. Using xor exposes // the result to other bit-twiddling opportunities. 
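  // For example, "seteq %a, %b" becomes "seteq (xor %a, %b), 0", which avoids
  // a condition-register round trip and leaves the xor visible to later
  // combines with neighbouring bit operations.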
EVT LHSVT = Op.getOperand(0).getValueType(); if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { EVT VT = Op.getValueType(); SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), Op.getOperand(1)); return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); } return SDValue(); } SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); EVT VT = Node->getValueType(0); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); const Value *SV = cast(Node->getOperand(2))->getValue(); SDLoc dl(Node); assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); // gpr_index SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, VAListPtr, MachinePointerInfo(SV), MVT::i8); InChain = GprIndex.getValue(1); if (VT == MVT::i64) { // Check if GprIndex is even SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, DAG.getConstant(1, dl, MVT::i32)); SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, DAG.getConstant(1, dl, MVT::i32)); // Align GprIndex to be even if it isn't GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, GprIndex); } // fpr index is 1 byte after gpr SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(1, dl, MVT::i32)); // fpr SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, FprPtr, MachinePointerInfo(SV), MVT::i8); InChain = FprIndex.getValue(1); SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(8, dl, MVT::i32)); SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(4, dl, MVT::i32)); // areas SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo()); InChain = OverflowArea.getValue(1); SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo()); InChain = RegSaveArea.getValue(1); // select overflow_area if index > 8 SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); // adjustment constant gpr_index * 4/8 SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(VT.isInteger() ? 4 : 8, dl, MVT::i32)); // OurReg = RegSaveArea + RegConstant SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, RegConstant); // Floating types are 32 bytes into RegSaveArea if (VT.isFloatingPoint()) OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, DAG.getConstant(32, dl, MVT::i32)); // increase {f,g}pr_index by 1 (or 2 if VT is i64) SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, MVT::i32)); InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, VT.isInteger() ? VAListPtr : FprPtr, MachinePointerInfo(SV), MVT::i8); // determine if we should load from reg_save_area or overflow_area SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); // increase overflow_area by 4/8 if gpr/fpr > 8 SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, DAG.getConstant(VT.isInteger() ? 
                                              4 : 8, dl, MVT::i32));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
                             OverflowAreaPlusN);

  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
                              MachinePointerInfo(), MVT::i32);

  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
  return DAG.getMemcpy(Op.getOperand(0), Op,
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
                       false, MachinePointerInfo(), MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  return Op.getOperand(0);
}

SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp; Args.push_back(Entry);

  // TrampSize == (isPPC64 ? 48 : 40);
  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
                               isPPC64 ? MVT::i64 : MVT::i32);
  Args.push_back(Entry);

  Entry.Node = FPtr; Args.push_back(Entry);
  Entry.Node = Nest; Args.push_back(Entry);

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.second;
}

SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  SDLoc dl(Op);

  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //                /* where r3:r10 and f1:f8 (if saved)
  //                 * are stored
  //                 */
  // } va_list[1];

  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
                                            PtrVT);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);

  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);

  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore =
      DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
                        MachinePointerInfo(SV), MVT::i8);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
                                ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
      DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
                        MachinePointerInfo(SV, nextOffset), MVT::i8);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
                                    MachinePointerInfo(SV, nextOffset));
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(thirdStore, dl, FR, nextPtr,
                      MachinePointerInfo(SV, nextOffset));
}

#include "PPCGenCallingConv.inc"

// Function whose sole purpose is to kill compiler warnings
// stemming from unused functions included from PPCGenCallingConv.inc.
CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
}

bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                      CCValAssign::LocInfo &LocInfo,
                                      ISD::ArgFlagsTy &ArgFlags,
                                      CCState &State) {
  return true;
}

bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
                                             MVT &LocVT,
                                             CCValAssign::LocInfo &LocInfo,
                                             ISD::ArgFlagsTy &ArgFlags,
                                             CCState &State) {
  static const MCPhysReg ArgRegs[] = {
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  const unsigned NumArgRegs = array_lengthof(ArgRegs);

  unsigned RegNum = State.getFirstUnallocated(ArgRegs);

  // Skip one register if the first unallocated register has an even register
  // number and there are still argument registers available which have not been
  // allocated yet. RegNum is actually an index into ArgRegs, which means we
  // need to skip a register if RegNum is odd.
  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
    State.AllocateReg(ArgRegs[RegNum]);
  }

  // Always return false here, as this function only makes sure that the first
  // unallocated register has an odd register number and does not actually
  // allocate a register for the current argument.
return false; } bool llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); int RegsLeft = NumArgRegs - RegNum; // Skip if there is not enough registers left for long double type (4 gpr regs // in soft float mode) and put long double argument on the stack. if (RegNum != NumArgRegs && RegsLeft < 4) { for (int i = 0; i < RegsLeft; i++) { State.AllocateReg(ArgRegs[RegNum + i]); } } return false; } bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); // If there is only one Floating-point register left we need to put both f64 // values of a split ppc_fp128 value on the stack. if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { State.AllocateReg(ArgRegs[RegNum]); } // Always return false here, as this function only makes sure that the two f64 // values a ppc_fp128 value is split into are both passed in registers or both // passed on the stack and does not actually allocate a register for the // current argument. return false; } /// FPR - The set of FP registers that should be allocated for arguments, /// on Darwin. static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; /// QFPR - The set of QPX registers that should be allocated for arguments. static const MCPhysReg QFPR[] = { PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); // Round up to multiples of the pointer size, except for array members, // which are always packed. if (!Flags.isInConsecutiveRegs()) ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } /// CalculateStackSlotAlignment - Calculates the alignment of this argument /// on the stack. static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned Align = PtrByteSize; // Altivec parameters are padded to a 16 byte boundary. if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Align = 16; // QPX vector types stored in double-precision are padded to a 32 byte // boundary. else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) Align = 32; // ByVal parameters are aligned as requested. 
if (Flags.isByVal()) { unsigned BVAlign = Flags.getByValAlign(); if (BVAlign > PtrByteSize) { if (BVAlign % PtrByteSize != 0) llvm_unreachable( "ByVal alignment is not a multiple of the pointer size"); Align = BVAlign; } } // Array members are always packed to their original alignment. if (Flags.isInConsecutiveRegs()) { // If the array member was split into multiple registers, the first // needs to be aligned to the size of the full type. (Except for // ppcf128, which is only aligned as its f64 components.) if (Flags.isSplit() && OrigVT != MVT::ppcf128) Align = OrigVT.getStoreSize(); else Align = ArgVT.getStoreSize(); } return Align; } /// CalculateStackSlotUsed - Return whether this argument will use its /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize, unsigned LinkageSize, unsigned ParamAreaSize, unsigned &ArgOffset, unsigned &AvailableFPRs, unsigned &AvailableVRs, bool HasQPX) { bool UseMemory = false; // Respect alignment of argument on the stack. unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; // If there's no space left in the argument save area, we must // use memory (this check also catches zero-sized arguments). if (ArgOffset >= LinkageSize + ParamAreaSize) UseMemory = true; // Allocate argument on the stack. ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // If we overran the argument save area, we must use memory // (this check catches arguments passed partially in memory) if (ArgOffset > LinkageSize + ParamAreaSize) UseMemory = true; // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || // QPX registers overlap with the scalar FP registers. (HasQPX && (ArgVT == MVT::v4f32 || ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1))) if (AvailableFPRs > 0) { --AvailableFPRs; return false; } if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) if (AvailableVRs > 0) { --AvailableVRs; return false; } } return UseMemory; } /// EnsureStackAlignment - Round stack frame size up from NumBytes to /// ensure minimum alignment required for target. 
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                     unsigned NumBytes) {
  unsigned TargetAlign = Lowering->getStackAlignment();
  unsigned AlignMask = TargetAlign - 1;
  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
  return NumBytes;
}

SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isSVR4ABI()) {
    if (Subtarget.isPPC64())
      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
    else
      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
  } else {
    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
                                       dl, DAG, InVals);
  }
}

SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |           CR save word            |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                    *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
  if (useSoftFloat() || hasSPE())
    CCInfo.PreAnalyzeFormalArguments(Ins);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
  CCInfo.clearWasPPCF128();

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
if (VA.isRegLoc()) { const TargetRegisterClass *RC; EVT ValVT = VA.getValVT(); switch (ValVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("ValVT not supported by formal arguments Lowering"); case MVT::i1: case MVT::i32: RC = &PPC::GPRCRegClass; break; case MVT::f32: if (Subtarget.hasP8Vector()) RC = &PPC::VSSRCRegClass; else if (Subtarget.hasSPE()) RC = &PPC::SPE4RCRegClass; else RC = &PPC::F4RCRegClass; break; case MVT::f64: if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else if (Subtarget.hasSPE()) RC = &PPC::SPERCRegClass; else RC = &PPC::F8RCRegClass; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: RC = &PPC::VRRCRegClass; break; case MVT::v4f32: RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VRRCRegClass; break; case MVT::v4f64: RC = &PPC::QFRCRegClass; break; case MVT::v4i1: RC = &PPC::QBRCRegClass; break; } // Transform the arguments stored in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT == MVT::i1 ? MVT::i32 : ValVT); if (ValVT == MVT::i1) ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); InVals.push_back(ArgValue); } else { // Argument stored in memory. assert(VA.isMemLoc()); unsigned ArgSize = VA.getLocVT().getStoreSize(); int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(), isImmutable); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back( DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); } } // Assign locations to all of the incoming aggregate by value arguments. // Aggregates passed by value are stored in the local variable space of the // caller's stack frame, right above the parameter list area. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); MinReservedArea = std::max(MinReservedArea, LinkageSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); SmallVector MemOps; // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { static const MCPhysReg GPArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); static const MCPhysReg FPArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; unsigned NumFPArgRegs = array_lengthof(FPArgRegs); if (useSoftFloat() || hasSPE()) NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); // Make room for NumGPArgRegs and NumFPArgRegs. 
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; FuncInfo->setVarArgsStackOffset( MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, CCInfo.getNextStackOffset(), true)); FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // The fixed integer arguments of a variadic function are stored to the // VarArgsFrameIndex on the stack so that they may be loaded by // dereferencing the result of va_next. for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { // Get an existing live-in vreg, or add a new one. unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); if (!VReg) VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 // is set. // The double arguments are stored to the VarArgsFrameIndex // on the stack. for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { // Get an existing live-in vreg, or add a new one. unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); if (!VReg) VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by eight for the next argument to store SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, SDValue ArgVal, const SDLoc &dl) const { if (Flags.isSExt()) ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); else if (Flags.isZExt()) ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); } SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. 
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof(VR); const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area // on its stack frame. In the ELFv1 ABI, this is always the case; // in the ELFv2 ABI, it is true if this is a vararg function or if // any parameter is located in a stack slot. bool HasParameterArea = !isELFv2ABI || isVarArg; unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = Num_FPR_Regs; unsigned AvailableVRs = Num_VR_Regs; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (Ins[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; } // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Ins[ArgNo].isOrigArg()) { std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[ArgNo].getOrigArgIndex(); } // We re-align the argument offset for each argument, except when using the // fast calling convention, when we need to make sure we do that only when // we'll actually use a stack slot. unsigned CurArgOffset, Align; auto ComputeArgOffset = [&]() { /* Respect alignment of argument on the stack. */ Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; CurArgOffset = ArgOffset; }; if (CallConv != CallingConv::Fast) { ComputeArgOffset(); /* Compute GPR index associated with argument offset. */ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx = std::min(GPR_idx, Num_GPR_Regs); } // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); if (CallConv == CallingConv::Fast) ComputeArgOffset(); // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // Empty aggregate parameters do not take up registers. Examples: // struct { } a; // union { } b; // int c[0]; // etc. 
However, we have to provide a place-holder in InVals, so // pretend we have an 8-byte item at the current address for that // purpose. if (!ObjSize) { int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); continue; } // Create a stack object covering all stack doublewords occupied // by the argument. If the argument is (fully or partially) on // the stack, or if the argument is fully in registers but the // caller has allocated the parameter save anyway, we can refer // directly to the caller's stack frame. Otherwise, create a // local copy in our own frame. int FI; if (HasParameterArea || ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true); else FI = MFI.CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); // Handle aggregates smaller than 8 bytes. if (ObjSize < PtrByteSize) { // The value of the object is its address, which differs from the // address of the enclosing doubleword on big-endian systems. SDValue Arg = FIN; if (!isLittleEndian) { SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); } InVals.push_back(Arg); if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(&*FuncArg), ObjType); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg)); } MemOps.push_back(Store); } // Whether we copied from a register or not, advance the offset // into the parameter save area by a full doubleword. ArgOffset += PtrByteSize; continue; } // The value of the object is its address, which is the address of // its first stack doubleword. InVals.push_back(FIN); // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { if (GPR_idx == Num_GPR_Regs) break; unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; if (j) { SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, MachinePointerInfo(&*FuncArg, j)); MemOps.push_back(Store); ++GPR_idx; } ArgOffset += ArgSize; continue; } switch (ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: case MVT::i64: if (Flags.isNest()) { // The 'nest' parameter, if any, is passed in R11. unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); break; } // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. 
if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; ArgSize = PtrByteSize; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += 8; break; case MVT::f32: case MVT::f64: // These can be scalar arguments or elements of a float array type // passed directly. The latter are used to implement ELFv2 homogenous // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasP8Vector() ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass); else VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 // once we support fp <-> gpr moves. // This can only ever happen in the presence of f32 array types, // since otherwise we never run out of FPRs before running out // of GPRs. unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, DAG.getConstant(32, dl, MVT::i32)); ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); } ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. if (CallConv != CallingConv::Fast || needsLoad) { ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; ArgOffset += ArgSize; if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: case MVT::f128: if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += 16; break; } // not QPX assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); /* fall through */ case MVT::v4f64: case MVT::v4i1: // QPX vectors are treated like their scalar floating-point subregisters // (except that they're larger). unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; if (QFPR_idx != Num_QFPR_Regs) { const TargetRegisterClass *RC; switch (ObjectVT.getSimpleVT().SimpleTy) { case MVT::v4f64: RC = &PPC::QFRCRegClass; break; case MVT::v4f32: RC = &PPC::QSRCRegClass; break; default: RC = &PPC::QBRCRegClass; break; } unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++QFPR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += Sz; break; } // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type. if (needsLoad) { if (ObjSize < ArgSize && !isLittleEndian) CurArgOffset += ArgSize - ObjSize; int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); } InVals.push_back(ArgVal); } // Area that is at least reserved in the caller of this function. unsigned MinReservedArea; if (HasParameterArea) MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); else MinReservedArea = LinkageSize; // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( MFI.CreateFixedObject(PtrByteSize, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx < Num_GPR_Regs; ++GPR_idx) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } SDValue PPCTargetLowering::LowerFormalArguments_Darwin( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 
8 : 4; unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned Num_GPR_Regs = array_lengthof(GPR_32); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof( VR); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; // In 32-bit non-varargs functions, the stack space for vectors is after the // stack space for non-vectors. We do not use this space unless we have // too many vectors to fit in registers, something that only occurs in // constructed examples:), but we have to walk the arglist to figure // that out...for the pathological case, compute VecArgOffset as the // start of the vector parameter area. Computing VecArgOffset is the // entire point of the following loop. unsigned VecArgOffset = ArgOffset; if (!isVarArg && !isPPC64) { for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { EVT ObjectVT = Ins[ArgNo].VT; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Flags.isByVal()) { // ObjSize is the true size, ArgSize rounded up to multiple of regs. unsigned ObjSize = Flags.getByValSize(); unsigned ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; VecArgOffset += ArgSize; continue; } switch(ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: case MVT::f32: VecArgOffset += 4; break; case MVT::i64: // PPC64 case MVT::f64: // FIXME: We are guaranteed to be !isPPC64 at this point. // Does MVT::i64 apply? VecArgOffset += 8; break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: // Nothing to do, we're only looking at Nonvector args here. break; } } } // We've found where the vector parameter area in memory is. Skip the // first 12 parameters; these don't use that memory. VecArgOffset = ((VecArgOffset+15)/16)*16; VecArgOffset += 12*16; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. SmallVector MemOps; unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Ins[ArgNo].isOrigArg()) { std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[ArgNo].getOrigArgIndex(); } unsigned CurArgOffset = ArgOffset; // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. 
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { if (isVarArg || isPPC64) { MinReservedArea = ((MinReservedArea+15)/16)*16; MinReservedArea += CalculateStackSlotSize(ObjectVT, Flags, PtrByteSize); } else nAltivecParamsAtEnd++; } else // Calculate min reserved area. MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, Flags, PtrByteSize); // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // Objects of size 1 and 2 are right justified, everything else is // left justified. This means the memory address is adjusted forwards. if (ObjSize==1 || ObjSize==2) { CurArgOffset = CurArgOffset + (4 - ObjSize); } // The value of the object is its address. int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); if (ObjSize==1 || ObjSize==2) { if (GPR_idx != Num_GPR_Regs) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg), ObjType); MemOps.push_back(Store); ++GPR_idx; } ArgOffset += PtrByteSize; continue; } for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { // Store whatever pieces of the object are in registers // to memory. ArgOffset will be the address of the beginning // of the object. if (GPR_idx != Num_GPR_Regs) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg, j)); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; } else { ArgOffset += ArgSize - (ArgOffset-CurArgOffset); break; } } continue; } switch (ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: if (!isPPC64) { if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); if (ObjectVT == MVT::i1) ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; } // All int arguments reserve stack space in the Darwin ABI. ArgOffset += PtrByteSize; break; } LLVM_FALLTHROUGH; case MVT::i64: // PPC64 if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; } // All int arguments reserve stack space in the Darwin ABI. 
ArgOffset += 8; break; case MVT::f32: case MVT::f64: // Every 4 bytes of argument space consumes one of the GPRs available for // argument passing. if (GPR_idx != Num_GPR_Regs) { ++GPR_idx; if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) ++GPR_idx; } if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; } else { needsLoad = true; } // All FP arguments reserve stack space in the Darwin ABI. ArgOffset += isPPC64 ? 8 : ObjSize; break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: // Note that vector arguments in registers don't reserve stack space, // except in varargs functions. if (VR_idx != Num_VR_Regs) { unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); if (isVarArg) { while ((ArgOffset % 16) != 0) { ArgOffset += PtrByteSize; if (GPR_idx != Num_GPR_Regs) GPR_idx++; } ArgOffset += 16; GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? } ++VR_idx; } else { if (!isVarArg && !isPPC64) { // Vectors go after all the nonvectors. CurArgOffset = VecArgOffset; VecArgOffset += 16; } else { // Vectors are aligned. ArgOffset = ((ArgOffset+15)/16)*16; CurArgOffset = ArgOffset; ArgOffset += 16; } needsLoad = true; } break; } // We need to load the argument to a virtual register if we determined above // that we ran out of physical registers of the appropriate type. if (needsLoad) { int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset + (ArgSize - ObjSize), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); } InVals.push_back(ArgVal); } // Allow for Altivec parameters at the end, if needed. if (nAltivecParamsAtEnd) { MinReservedArea = ((MinReservedArea+15)/16)*16; MinReservedArea += 16*nAltivecParamsAtEnd; } // Area that is at least reserved in the caller of this function. MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. 
    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
      unsigned VReg;

      if (isPPC64)
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      else
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tailcall.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
                                   unsigned ParamSize) {

  if (!isTailCall) return 0;

  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
  unsigned CallerMinReservedArea = FI->getMinReservedArea();
  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
  // Remember only if the new adjustment is bigger.
  if (SPDiff < FI->getTailCallSPDelta())
    FI->setTailCallSPDelta(SPDiff);

  return SPDiff;
}

static bool isFunctionGlobalAddress(SDValue Callee);

static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
                              const TargetMachine &TM) {
  // If !G, Callee can be an external symbol.
  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
  if (!G)
    return false;

  // The medium and large code models are expected to provide a sufficiently
  // large TOC to provide all data addressing needs of a module with a
  // single TOC. Since each module will be addressed with a single TOC then we
  // only need to check that caller and callee don't cross dso boundaries.
  if (CodeModel::Medium == TM.getCodeModel() ||
      CodeModel::Large == TM.getCodeModel())
    return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());

  // Otherwise we need to ensure callee and caller are in the same section,
  // since the linker may allocate multiple TOCs, and we don't know which
  // sections will belong to the same TOC base.
  const GlobalValue *GV = G->getGlobal();
  if (!GV->isStrongDefinitionForLinker())
    return false;

  // Any explicitly-specified sections and section prefixes must also match.
  // Also, if we're using -ffunction-sections, then each function is always in
  // a different section (the same is true for COMDAT functions).
  if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
      GV->getSection() != Caller->getSection())
    return false;
  if (const auto *F = dyn_cast<Function>(GV)) {
    if (F->getSectionPrefix() != Caller->getSectionPrefix())
      return false;
  }

  // If the callee might be interposed, then we can't assume the ultimate call
  // target will be in the same section. Even in cases where we can assume that
  // interposition won't happen, in any case where the linker might insert a
  // stub to allow for interposition, we must generate code as though
  // interposition might occur. To understand why this matters, consider a
  // situation where: a -> b -> c where the arrows indicate calls. b and c are
  // in the same section, but a is in a different module (i.e. has a different
  // TOC base pointer). If the linker allows for interposition between b and c,
  // then it will generate a stub for the call edge between b and c which will
  // save the TOC pointer into the designated stack slot allocated by b.
If we // return true here, and therefore allow a tail call between b and c, that // stack slot won't exist and the b -> c stub will end up saving b'c TOC base // pointer into the stack slot allocated by a (where the a -> b stub saved // a's TOC base pointer). If we're not considering a tail call, but rather, // whether a nop is needed after the call instruction in b, because the linker // will insert a stub, it might complain about a missing nop if we omit it // (although many don't complain in this case). if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV)) return false; return true; } static bool needStackSlotPassParameters(const PPCSubtarget &Subtarget, const SmallVectorImpl &Outs) { assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); const unsigned PtrByteSize = 8; const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const unsigned ParamAreaSize = NumGPRs * PtrByteSize; unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = NumFPRs; unsigned AvailableVRs = NumVRs; for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) return true; } return false; } static bool hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { if (CS.arg_size() != CallerFn->arg_size()) return false; ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin(); ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end(); Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { const Value* CalleeArg = *CalleeArgIter; const Value* CallerArg = &(*CallerArgIter); if (CalleeArg == CallerArg) continue; // e.g. @caller([4 x i64] %a, [4 x i64] %b) { // tail call @callee([4 x i64] undef, [4 x i64] %b) // } // 1st argument of callee is undef and has the same type as caller. if (CalleeArg->getType() == CallerArg->getType() && isa(CalleeArg)) continue; return false; } return true; } // Returns true if TCO is possible between the callers and callees // calling conventions. static bool areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC, CallingConv::ID CalleeCC) { // Tail calls are possible with fastcc and ccc. auto isTailCallableCC = [] (CallingConv::ID CC){ return CC == CallingConv::C || CC == CallingConv::Fast; }; if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC)) return false; // We can safely tail call both fastcc and ccc callees from a c calling // convention caller. If the caller is fastcc, we may have less stack space // than a non-fastcc caller with the same signature so disable tail-calls in // that case. 
return CallerCC == CallingConv::C || CallerCC == CalleeCC; } bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; if (DisableSCO && !TailCallOpt) return false; // Variadic argument functions are not supported. if (isVarArg) return false; auto &Caller = DAG.getMachineFunction().getFunction(); // Check that the calling conventions are compatible for tco. if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC)) return false; // Caller contains any byval parameter is not supported. if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); })) return false; // Callee contains any byval parameter is not supported, too. // Note: This is a quick work around, because in some cases, e.g. // caller's stack size > callee's stack size, we are still able to apply // sibling call optimization. For example, gcc is able to do SCO for caller1 // in the following example, but not for caller2. // struct test { // long int a; // char ary[56]; // } gTest; // __attribute__((noinline)) int callee(struct test v, struct test *b) { // b->a = v.a; // return 0; // } // void caller1(struct test a, struct test c, struct test *b) { // callee(gTest, b); } // void caller2(struct test *b) { callee(gTest, b); } if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); })) return false; // If callee and caller use different calling conventions, we cannot pass // parameters on stack since offsets for the parameter area may be different. if (Caller.getCallingConv() != CalleeCC && needStackSlotPassParameters(Subtarget, Outs)) return false; // No TCO/SCO on indirect call because Caller have to restore its TOC if (!isFunctionGlobalAddress(Callee) && !isa(Callee)) return false; // If the caller and callee potentially have different TOC bases then we // cannot tail call since we need to restore the TOC pointer after the call. // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 if (!callsShareTOCBase(&Caller, Callee, getTargetMachine())) return false; // TCO allows altering callee ABI, so we don't have to check further. if (CalleeCC == CallingConv::Fast && TailCallOpt) return true; if (DisableSCO) return false; // If callee use the same argument list that caller is using, then we can // apply SCO on this case. If it is not, then we need to check if callee needs // stack for passing arguments. if (!hasSameArgumentList(&Caller, CS) && needStackSlotPassParameters(Subtarget, Outs)) { return false; } return true; } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { if (!getTargetMachine().Options.GuaranteedTailCallOpt) return false; // Variable argument functions are not supported. if (isVarArg) return false; MachineFunction &MF = DAG.getMachineFunction(); CallingConv::ID CallerCC = MF.getFunction().getCallingConv(); if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { // Functions containing by val parameters are not supported. 
for (unsigned i = 0; i != Ins.size(); i++) { ISD::ArgFlagsTy Flags = Ins[i].Flags; if (Flags.isByVal()) return false; } // Non-PIC/GOT tail calls are supported. if (getTargetMachine().getRelocationModel() != Reloc::PIC_) return true; // At the moment we can only do local tail calls (in same module, hidden // or protected) if we are generating PIC. if (GlobalAddressSDNode *G = dyn_cast(Callee)) return G->getGlobal()->hasHiddenVisibility() || G->getGlobal()->hasProtectedVisibility(); } return false; } /// isCallCompatibleAddress - Return the immediate to use if the specified /// 32-bit value is representable in the immediate field of a BxA instruction. static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { ConstantSDNode *C = dyn_cast(Op); if (!C) return nullptr; int Addr = C->getZExtValue(); if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. SignExtend32<26>(Addr) != Addr) return nullptr; // Top 6 bits have to be sext of immediate. return DAG .getConstant( (int)C->getZExtValue() >> 2, SDLoc(Op), DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())) .getNode(); } namespace { struct TailCallArgumentInfo { SDValue Arg; SDValue FrameIdxOp; int FrameIdx = 0; TailCallArgumentInfo() = default; }; } // end anonymous namespace /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. static void StoreTailCallArgumentsToStackSlot( SelectionDAG &DAG, SDValue Chain, const SmallVectorImpl &TailCallArgs, SmallVectorImpl &MemOpChains, const SDLoc &dl) { for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { SDValue Arg = TailCallArgs[i].Arg; SDValue FIN = TailCallArgs[i].FrameIdxOp; int FI = TailCallArgs[i].FrameIdx; // Store relative to framepointer. MemOpChains.push_back(DAG.getStore( Chain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } /// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to /// the appropriate stack slot for the tail call optimized function call. static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, SDValue OldRetAddr, SDValue OldFP, int SPDiff, const SDLoc &dl) { if (SPDiff) { // Calculate the new stack slot for the return address. MachineFunction &MF = DAG.getMachineFunction(); const PPCSubtarget &Subtarget = MF.getSubtarget(); const PPCFrameLowering *FL = Subtarget.getFrameLowering(); bool isPPC64 = Subtarget.isPPC64(); int SlotSize = isPPC64 ? 8 : 4; int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(MF, NewRetAddr)); // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. if (Subtarget.isDarwinABI()) { int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewFPIdx)); } } return Chain; } /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate /// the position of the argument. 
static void CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, SDValue Arg, int SPDiff, unsigned ArgOffset, SmallVectorImpl& TailCallArguments) { int Offset = ArgOffset + SPDiff; uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8; int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue FIN = DAG.getFrameIndex(FI, VT); TailCallArgumentInfo Info; Info.Arg = Arg; Info.FrameIdxOp = FIN; Info.FrameIdx = FI; TailCallArguments.push_back(Info); } /// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address /// stack slot. Returns the chain as result and the loaded frame pointers in /// LROpOut/FPOpout. Used when tail calling. SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr( SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut, SDValue &FPOpOut, const SDLoc &dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; LROpOut = getReturnAddrFrameIndex(DAG); LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo()); Chain = SDValue(LROpOut.getNode(), 1); // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack // slot as the FP is never overwritten. if (Subtarget.isDarwinABI()) { FPOpOut = getFramePointerFrameIndex(DAG); FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo()); Chain = SDValue(FPOpOut.getNode(), 1); } } return Chain; } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified /// by "Src" to address "Dst" of size "Size". Alignment information is /// specified by the specific parameter attribute. The copy will be passed as /// a byval function parameter. /// Sometimes what we are copying is the end of a larger object, the part that /// does not fit in registers. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), false, false, false, MachinePointerInfo(), MachinePointerInfo()); } /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of /// tail calls. static void LowerMemOpCallTo( SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg, SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64, bool isTailCall, bool isVector, SmallVectorImpl &MemOpChains, SmallVectorImpl &TailCallArguments, const SDLoc &dl) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); if (!isTailCall) { if (isVector) { SDValue StackPtr; if (isPPC64) StackPtr = DAG.getRegister(PPC::X1, MVT::i64); else StackPtr = DAG.getRegister(PPC::R1, MVT::i32); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, DAG.getConstant(ArgOffset, dl, PtrVT)); } MemOpChains.push_back( DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); // Calculate and remember argument location. } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, TailCallArguments); } static void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp, SDValue FPOp, SmallVectorImpl &TailCallArguments) { // Emit a sequence of copyto/copyfrom virtual registers for arguments that // might overwrite each other in case of tail call optimization. 
  SmallVector<SDValue, 8> MemOpChains2;
  // Do not flag preceding copytoreg stuff together with the following stuff.
  InFlag = SDValue();
  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                    MemOpChains2, dl);
  if (!MemOpChains2.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

  // Store the return address to the appropriate stack slot.
  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);

  // Emit callseq_end just before tailcall node.
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  InFlag = Chain.getValue(1);
}

// Is this global address that of a function that can be called by name? (as
// opposed to something that must hold a descriptor for an indirect call).
static bool isFunctionGlobalAddress(SDValue Callee) {
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
      return false;

    return G->getGlobal()->getValueType()->isFunctionTy();
  }

  return false;
}

static unsigned
PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
            SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall,
            bool isPatchPoint, bool hasNest,
            SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
            SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
            ImmutableCallSite CS, const PPCSubtarget &Subtarget) {
  bool isPPC64 = Subtarget.isPPC64();
  bool isSVR4ABI = Subtarget.isSVR4ABI();
  bool isELFv2ABI = Subtarget.isELFv2ABI();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  NodeTys.push_back(MVT::Other);   // Returns a chain
  NodeTys.push_back(MVT::Glue);    // Returns a flag for retval copy to use.

  unsigned CallOpc = PPCISD::CALL;

  bool needIndirectCall = true;
  if (!isSVR4ABI || !isPPC64)
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
      // If this is an absolute destination address, use the munged value.
      Callee = SDValue(Dest, 0);
      needIndirectCall = false;
    }

  // PC-relative references to external symbols should go through $stub, unless
  // we're building with the leopard linker or later, which automatically
  // synthesizes these stubs.
  const TargetMachine &TM = DAG.getTarget();
  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
  const GlobalValue *GV = nullptr;
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
    GV = G->getGlobal();
  bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
  bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;

  if (isFunctionGlobalAddress(Callee)) {
    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);

    // A call to a TLS address is actually an indirect call to a
    // thread-specific pointer.
    unsigned OpFlags = 0;
    if (UsePlt)
      OpFlags = PPCII::MO_PLT;

    // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
    // every direct call is) turn it into a TargetGlobalAddress /
    // TargetExternalSymbol node so that legalize doesn't hack it.
    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
                                        Callee.getValueType(), 0, OpFlags);
    needIndirectCall = false;
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    if (UsePlt)
      OpFlags = PPCII::MO_PLT;

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
                                         OpFlags);
    needIndirectCall = false;
  }

  if (isPatchPoint) {
    // We'll form an invalid direct call when lowering a patchpoint; the full
    // sequence for an indirect call is complicated, and many of the
    // instructions introduced might have side effects (and, thus, can't be
    // removed later). The call itself will be removed as soon as the
    // argument/return lowering is complete, so the fact that it has the wrong
    // kind of operands should not really matter.
    needIndirectCall = false;
  }

  if (needIndirectCall) {
    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
    // to do the call, we can't use PPCISD::CALL.
    SDValue MTCTROps[] = {Chain, Callee, InFlag};

    if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
      // Function pointers in the 64-bit SVR4 ABI do not point to the function
      // entry point, but to the function descriptor (the function entry point
      // address is part of the function descriptor though).
      // The function descriptor is a three doubleword structure with the
      // following fields: function entry point, TOC base address and
      // environment pointer.
      // Thus for a call through a function pointer, the following actions need
      // to be performed:
      // 1. Save the TOC of the caller in the TOC save area of its stack
      //    frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
      // 2. Load the address of the function entry point from the function
      //    descriptor.
      // 3. Load the TOC of the callee from the function descriptor into r2.
      // 4. Load the environment pointer from the function descriptor into
      //    r11.
      // 5. Branch to the function entry point address.
      // 6. On return of the callee, the TOC of the caller needs to be
      //    restored (this is done in FinishCall()).
      //
      // The loads are scheduled at the beginning of the call sequence, and the
      // register copies are flagged together to ensure that no other
      // operations can be scheduled in between. E.g. without flagging the
      // copies together, a TOC access in the caller could be scheduled between
      // the assignment of the callee TOC and the branch to the callee, which
      // results in the TOC access going through the TOC of the callee instead
      // of going through the TOC of the caller, which leads to incorrect code.

      // Load the address of the function entry point from the function
      // descriptor.
      SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1);
      if (LDChain.getValueType() == MVT::Glue)
        LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2);

      auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
                          ? (MachineMemOperand::MODereferenceable |
                             MachineMemOperand::MOInvariant)
                          : MachineMemOperand::MONone;

      MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
      SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
                                        /* Alignment = */ 8, MMOFlags);

      // Load environment pointer into r11.
      SDValue PtrOff = DAG.getIntPtrConstant(16, dl);
      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
      SDValue LoadEnvPtr =
          DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16),
                      /* Alignment = */ 8, MMOFlags);

      SDValue TOCOff = DAG.getIntPtrConstant(8, dl);
      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
      SDValue TOCPtr =
          DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8),
                      /* Alignment = */ 8, MMOFlags);

      setUsesTOCBasePtr(DAG);
      SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, InFlag);
      Chain = TOCVal.getValue(0);
      InFlag = TOCVal.getValue(1);

      // If the function call has an explicit 'nest' parameter, it takes the
      // place of the environment pointer.
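      // For orientation, the sequence built from steps 1-6 above usually ends
      // up as machine code along these lines under ELFv1, assuming the
      // descriptor address sits in some GPR rN and the standard ELFv1 TOC save
      // slot at 40(r1); exact registers and scheduling may differ:
      //
      //   std 2, 40(1)    ; caller saves its TOC (step 1)
      //   ld  0, 0(rN)    ; entry point from the descriptor (step 2)
      //   ld  2, 8(rN)    ; callee TOC into r2 (step 3)
      //   ld 11, 16(rN)   ; environment pointer into r11 (step 4)
      //   mtctr 0
      //   bctrl           ; branch to the entry point (step 5)
      //   ld  2, 40(1)    ; caller TOC restored after return (step 6)
      //
      // When a 'nest' parameter is present it travels in r11 instead, so the
      // environment-pointer copy below is skipped.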
if (!hasNest) { SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, InFlag); Chain = EnvVal.getValue(0); InFlag = EnvVal.getValue(1); } MTCTROps[0] = Chain; MTCTROps[1] = LoadFuncPtr; MTCTROps[2] = InFlag; } Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); InFlag = Chain.getValue(1); NodeTys.clear(); NodeTys.push_back(MVT::Other); NodeTys.push_back(MVT::Glue); Ops.push_back(Chain); CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); } // If this is a direct call, pass the chain and the callee. if (Callee.getNode()) { Ops.push_back(Chain); Ops.push_back(Callee); } // If this is a tail call add stack pointer delta. if (isTailCall) Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live // into the call. if (isSVR4ABI && isPPC64 && !isPatchPoint) { setUsesTOCBasePtr(DAG); Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); } return CallOpc; } SDValue PPCTargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { SmallVector RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult( Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) ? RetCC_PPC_Cold : RetCC_PPC); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::AExt: Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); break; case CCValAssign::ZExt: Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, DAG.getValueType(VA.getValVT())); Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); break; case CCValAssign::SExt: Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, DAG.getValueType(VA.getValVT())); Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } SDValue PPCTargetLowering::FinishCall( CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg, bool isPatchPoint, bool hasNest, SelectionDAG &DAG, SmallVector, 8> &RegsToPass, SDValue InFlag, SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals, ImmutableCallSite CS) const { std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, SPDiff, isTailCall, isPatchPoint, hasNest, RegsToPass, Ops, NodeTys, CS, Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in // PPCFrameLowering::eliminateCallFramePseudoInstr. int BytesCalleePops = (CallConv == CallingConv::Fast && getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); // Emit tail call. if (isTailCall) { assert(((Callee.getOpcode() == ISD::Register && cast(Callee)->getReg() == PPC::CTR) || Callee.getOpcode() == ISD::TargetExternalSymbol || Callee.getOpcode() == ISD::TargetGlobalAddress || isa(Callee)) && "Expecting an global address, external symbol, absolute value or register"); DAG.getMachineFunction().getFrameInfo().setHasTailCall(); return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops); } // Add a NOP immediately after the branch instruction when using the 64-bit // SVR4 ABI. At link time, if caller and callee are in a different module and // thus have a different TOC, the call will be replaced with a call to a stub // function which saves the current TOC, loads the TOC of the callee and // branches to the callee. The NOP will be replaced with a load instruction // which restores the TOC of the caller from the TOC save slot of the current // stack frame. If caller and callee belong to the same module (and have the // same TOC), the NOP will remain unchanged. 
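  // As a sketch of that link-time rewrite (offsets assume the standard TOC
  // save slot: 40(r1) for ELFv1, 24(r1) for ELFv2), the compiler emits
  //
  //   bl callee
  //   nop
  //
  // and for a cross-TOC call the linker turns it into roughly
  //
  //   bl callee_stub      ; stub saves r2 and loads the callee's TOC
  //   ld 2, 40(1)         ; restore caller TOC (24(1) under ELFv2)
  //
  // while a same-TOC call leaves the nop untouched.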
MachineFunction &MF = DAG.getMachineFunction(); if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && !isPatchPoint) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. // We are using a target-specific load with r2 hard coded, because the // result of a target-independent load would never go directly into r2, // since r2 is a reserved register (which prevents the register allocator // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); // The address needs to go after the chain input but before the flag (or // any other variadic arguments). Ops.insert(std::next(Ops.begin()), AddTOC); } else if (CallOpc == PPCISD::CALL && !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { // Otherwise insert NOP for non-local calls. CallOpc = PPCISD::CALL_NOP; } } Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(BytesCalleePops, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals); } SDValue PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; ImmutableCallSite CS = CLI.CS; if (isTailCall) { if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) isTailCall = false; else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) isTailCall = IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, isVarArg, Outs, Ins, DAG); else isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); if (isTailCall) { ++NumTailCalls; if (!getTargetMachine().Options.GuaranteedTailCallOpt) ++NumSiblingCalls; assert(isa(Callee) && "Callee should be an llvm::Function object."); LLVM_DEBUG( const GlobalValue *GV = cast(Callee)->getGlobal(); const unsigned Width = 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0"); dbgs() << "TCO caller: " << left_justify(DAG.getMachineFunction().getName(), Width) << ", callee linkage: " << GV->getVisibility() << ", " << GV->getLinkage() << "\n"); } } if (!isTailCall && CS && CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // When long calls (i.e. indirect calls) are always used, calls are always // made via function pointer. If we have a function name, first translate it // into a pointer. 
if (Subtarget.useLongCalls() && isa(Callee) && !isTailCall) Callee = LowerGlobalAddress(Callee, DAG); if (Subtarget.isSVR4ABI()) { if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); else return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); } return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); } SDValue PPCTargetLowering::LowerCall_32SVR4( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold || CallConv == CallingConv::Fast) && "Unknown calling convention!"); unsigned PtrByteSize = 4; MachineFunction &MF = DAG.getMachineFunction(); // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage // area, parameter list area and the part of the local variable space which // contains copies of aggregates which are passed by value. // Assign locations to all of the outgoing arguments. SmallVector ArgLocs; PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), PtrByteSize); if (useSoftFloat()) CCInfo.PreAnalyzeCallOperands(Outs); if (isVarArg) { // Handle fixed and variable vector arguments differently. // Fixed vector arguments go into registers as long as registers are // available. Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; if (Outs[i].IsFixed) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } else { Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } if (Result) { #ifndef NDEBUG errs() << "Call operand #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"; #endif llvm_unreachable(nullptr); } } } else { // All arguments are treated the same. CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); } CCInfo.clearWasPPCF128(); // Assign locations to all of the outgoing aggregate by value arguments. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. 
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); // Size of the linkage area, parameter list area and the part of the local // space variable where copies of aggregates which are passed by value are // stored. unsigned NumBytes = CCByValInfo.getNextStackOffset(); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; bool seenFloatArg = false; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isByVal()) { // Argument is an aggregate which is passed by value, thus we need to // create a copy of it in the local variable space of the current stack // frame (which is the stack frame of the caller) and pass the address of // this copy to the callee. assert((j < ByValArgLocs.size()) && "Index out of bounds!"); CCValAssign &ByValVA = ByValArgLocs[j++]; assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); // Memory reserved in the local variable space of the callers stack frame. unsigned LocMemOffset = ByValVA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); // Create a copy of the argument in the local area of the current // stack frame. SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; // Pass the address of the aggregate copy on the stack either in a // physical register or in the parameter list area of the current stack // frame to the callee. Arg = PtrOff; } if (VA.isRegLoc()) { if (Arg.getValueType() == MVT::i1) Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { // Put argument in the parameter list area of the current stack frame. assert(VA.isMemLoc()); unsigned LocMemOffset = VA.getLocMemOffset(); if (!isTailCall) { SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); MemOpChains.push_back( DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); } else { // Calculate and remember argument location. 
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, TailCallArguments); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // Set CR bit 6 to true if this is a vararg call with floating args passed in // registers. if (isVarArg) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, InFlag }; Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); InFlag = Chain.getValue(1); } if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } // Copy an argument into memory, being careful to do this outside the // call sequence for the call to which the argument belongs. SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) const { SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; } SDValue PPCTargetLowering::LowerCall_64SVR4( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); bool hasNest = false; bool IsSibCall = false; EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) IsSibCall = true; // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage // area is 32 bytes reserved space for [SP][CR][LR][TOC]. 
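  // As a rough byte-offset sketch of those two layouts (standard ELF ABI
  // numbers, not computed here):
  //
  //   ELFv1: 0 back chain (SP), 8 CR save, 16 LR save, 24 and 32 reserved,
  //          40 TOC save                                      -> 48 bytes
  //   ELFv2: 0 back chain (SP), 8 CR save, 16 LR save, 24 TOC save
  //                                                           -> 32 bytes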
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. // For the fast calling convention, there is another check below. // Note: We should keep consistent with LowerFormalArguments_64SVR4() bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; if (!HasParameterArea) { unsigned ParamAreaSize = NumGPRs * PtrByteSize; unsigned AvailableFPRs = NumFPRs; unsigned AvailableVRs = NumVRs; unsigned NumBytesTmp = NumBytes; for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytesTmp, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; } } // When using the fast calling convention, we don't provide backing for // arguments that will be in registers. unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; // Avoid allocating parameter area for fastcc functions if all the arguments // can be passed in the registers. if (CallConv == CallingConv::Fast) HasParameterArea = false; // Add up all the space actually used. for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; if (Flags.isNest()) continue; if (CallConv == CallingConv::Fast) { if (Flags.isByVal()) { NumGPRsUsed += (Flags.getByValSize()+7)/8; if (NumGPRsUsed > NumGPRs) HasParameterArea = true; } else { switch (ArgVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected ValueType for argument!"); case MVT::i1: case MVT::i32: case MVT::i64: if (++NumGPRsUsed <= NumGPRs) continue; break; case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: case MVT::f128: if (++NumVRsUsed <= NumVRs) continue; break; case MVT::v4f32: // When using QPX, this is handled like a FP register, otherwise, it // is an Altivec register. if (Subtarget.hasQPX()) { if (++NumFPRsUsed <= NumFPRs) continue; } else { if (++NumVRsUsed <= NumVRs) continue; } break; case MVT::f32: case MVT::f64: case MVT::v4f64: // QPX case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; } HasParameterArea = true; } } /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); NumBytes = ((NumBytes + Align - 1) / Align) * Align; NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); if (Flags.isInConsecutiveRegsLast()) NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } unsigned NumBytesActuallyUsed = NumBytes; // In the old ELFv1 ABI, // the prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. 
// Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. // In the ELFv2 ABI, we allocate the parameter area iff a callee // really requires memory operands, e.g. a vararg function. if (HasParameterArea) NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); else NumBytes = LinkageSize; // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); int SPDiff = 0; // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. if (!IsSibCall) SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. if (isTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; // We re-align the argument offset for each argument, except when using the // fast calling convention, when we need to make sure we do that only when // we'll actually use a stack slot. auto ComputePtrOff = [&]() { /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); }; if (CallConv != CallingConv::Fast) { ComputePtrOff(); /* Compute GPR index associated with argument offset. */ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx = std::min(GPR_idx, NumGPRs); } // Promote integers to 64-bit values. if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); } // FIXME memcpy is used way more than necessary. Correctness first. // Note: "by value" is code for passing a structure by value, not // basic types. 
if (Flags.isByVal()) { // Note: Size includes alignment padding, so // struct x { short a; char b; } // will have Size = 4. With #pragma pack(1), it will have Size = 3. // These are the proper values we need for right-justifying the // aggregate in a parameter register. unsigned Size = Flags.getByValSize(); // An empty aggregate parameter takes up no storage and no // registers. if (Size == 0) continue; if (CallConv == CallingConv::Fast) ComputePtrOff(); // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; continue; } } if (GPR_idx == NumGPRs && Size < 8) { SDValue AddPtr = PtrOff; if (!isLittleEndian) { SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, PtrOff.getValueType()); AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); ArgOffset += PtrByteSize; continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) // FIXME: The above statement is likely due to a misunderstanding of the // documents. All arguments must be copied into the parameter area BY // THE CALLEE in the event that the callee takes the address of any // formal argument. That has not yet been implemented. However, it is // reasonable to use the stack area as a staging area for the register // load. // Skip this for small aggregates, as we will use the same slot for a // right-justified copy, below. if (Size >= 8) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); // When a register is available, pass a small aggregate right-justified. if (Size < 8 && GPR_idx != NumGPRs) { // The easiest way to get this right-justified in a register // is to copy the structure into the rightmost portion of a // local variable slot, then load the whole slot into the // register. // FIXME: The memcpy seems to produce pretty awful code for // small aggregates, particularly for packed ones. // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. SDValue AddPtr = PtrOff; if (!isLittleEndian) { SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); // Load the slot into the register. SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); // Done with this argument. ArgOffset += PtrByteSize; continue; } // For aggregates larger than PtrByteSize, copy the pieces of the // object that fit into registers from the parameter save area. for (unsigned j=0; j gpr moves. // In the non-vararg case, this can only ever happen in the // presence of f32 array types, since otherwise we never run // out of FPRs before running out of GPRs. SDValue ArgVal; // Double values are always passed in a single GPR. 
if (Arg.getValueType() != MVT::f32) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); // Non-array float values are extended and passed in a GPR. } else if (!Flags.isInConsecutiveRegs()) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); // If we have an array of floats, we collect every odd element // together with its predecessor into one GPR. } else if (ArgOffset % PtrByteSize != 0) { SDValue Lo, Hi; Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); if (!isLittleEndian) std::swap(Lo, Hi); ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); // The final element, if even, goes into the first half of a GPR. } else if (Flags.isInConsecutiveRegsLast()) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); if (!isLittleEndian) ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, DAG.getConstant(32, dl, MVT::i32)); // Non-final even elements are skipped; they will be handled // together the with subsequent argument on the next go-around. } else ArgVal = SDValue(); if (ArgVal.getNode()) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. if (Arg.getValueType() == MVT::f32 && !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); NeededLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. if (CallConv != CallingConv::Fast || NeededLoad) { ArgOffset += (Arg.getValueType() == MVT::f32 && Flags.isInConsecutiveRegs()) ? 4 : 8; if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } break; } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: case MVT::f128: if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. // For a varargs call, named arguments go into VRs or on the stack as // usual; unnamed arguments always go to the stack or the corresponding // GPRs when within range. For now, we always put the value in both // locations (or even all three). if (isVarArg) { assert(HasParameterArea && "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. 
SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); if (VR_idx != NumVRs) { SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); } ArgOffset += 16; for (unsigned i=0; i<16; i+=PtrByteSize) { if (GPR_idx == NumGPRs) break; SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, DAG.getConstant(i, dl, PtrVT)); SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); } break; } // Non-varargs Altivec params go into VRs or on the stack. if (VR_idx != NumVRs) { RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); if (CallConv == CallingConv::Fast) ArgOffset += 16; } if (CallConv != CallingConv::Fast) ArgOffset += 16; break; } // not QPX assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); /* fall through */ case MVT::v4f64: case MVT::v4i1: { bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; if (isVarArg) { assert(HasParameterArea && "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); if (QFPR_idx != NumQFPRs) { SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); } ArgOffset += (IsF32 ? 16 : 32); for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { if (GPR_idx == NumGPRs) break; SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, DAG.getConstant(i, dl, PtrVT)); SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); } break; } // Non-varargs QPX params go into registers or on the stack. if (QFPR_idx != NumQFPRs) { RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); if (CallConv == CallingConv::Fast) ArgOffset += (IsF32 ? 16 : 32); } if (CallConv != CallingConv::Fast) ArgOffset += (IsF32 ? 16 : 32); break; } } } assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && "mismatch in size of parameter area"); (void)NumBytesActuallyUsed; if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Check if this is an indirect call (MTCTR/BCTRL). // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. if (!isTailCall && !isPatchPoint && !isFunctionGlobalAddress(Callee) && !isa(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. 
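// Roughly, the machine-level shape of the indirect-call sequence being set up
// below is (illustrative sketch for ELFv2; the actual slot offset comes from
// getTOCSaveOffset(), and the TOC reload is emitted with the call itself):
//
//   std   r2, 24(r1)    ; save the caller's TOC pointer in the linkage area
//   mtctr r12           ; CTR <- callee entry point (r12 also carries it)
//   bctrl               ; indirect call
//   ld    r2, 24(r1)    ; restore the TOC pointer after the call returns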
setUsesTOCBasePtr(DAG); SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore( Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset)); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. if (isELFv2ABI && !isPatchPoint) RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (isTailCall && !IsSibCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } SDValue PPCTargetLowering::LowerCall_Darwin( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; unsigned PtrByteSize = isPPC64 ? 8 : 4; MachineFunction &MF = DAG.getMachineFunction(); // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; // Add up all the space actually used. // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually // they all go in registers, but we must reserve stack space for them for // possible use by the caller. In varargs or 64-bit calls, parameters are // assigned stack space in order, with padding so Altivec parameters are // 16-byte aligned. unsigned nAltivecParamsAtEnd = 0; for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; // Varargs Altivec parameters are padded to a 16 byte boundary. 
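// The expression ((NumBytes + 15) / 16) * 16 used below is the usual
// round-up-to-16 idiom; for a power-of-two alignment it is equivalent to the
// masked form (sketch only):
//
//   unsigned AlignTo16(unsigned N) { return (N + 15) & ~15u; }
//
// For example 52 rounds up to 64, while 48 is already aligned and stays 48.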
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { if (!isVarArg && !isPPC64) { // Non-varargs Altivec parameters go after all the non-Altivec // parameters; handle those later so we know how much padding we need. nAltivecParamsAtEnd++; continue; } // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. NumBytes = ((NumBytes+15)/16)*16; } NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); } // Allow for Altivec parameters at the end, if needed. if (nAltivecParamsAtEnd) { NumBytes = ((NumBytes+15)/16)*16; NumBytes += 16*nAltivecParamsAtEnd; } // The prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. if (isTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr; if (isPPC64) StackPtr = DAG.getRegister(PPC::X1, MVT::i64); else StackPtr = DAG.getRegister(PPC::R1, MVT::i32); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR_32); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const MCPhysReg *GPR = isPPC64 ? 
GPR_64 : GPR_32; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); // On PPC64, promote integers to 64-bit values. if (isPPC64 && Arg.getValueType() == MVT::i32) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); } // FIXME memcpy is used way more than necessary. Correctness first. // Note: "by value" is code for passing a structure by value, not // basic types. if (Flags.isByVal()) { unsigned Size = Flags.getByValSize(); // Very small objects are passed right-justified. Everything else is // passed left-justified. if (Size==1 || Size==2) { EVT VT = (Size==1) ? MVT::i8 : MVT::i16; if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; } else { SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, PtrOff.getValueType()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); ArgOffset += PtrByteSize; } continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); // For small aggregates (Darwin only) and aggregates >= PtrByteSize, // copy the pieces of the object that fit into registers from the // parameter save area. for (unsigned j=0; j NumVRs) { unsigned j = 0; // Offset is aligned; skip 1st 12 params which go in V registers. ArgOffset = ((ArgOffset+15)/16)*16; ArgOffset += 12*16; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; EVT ArgType = Outs[i].VT; if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { if (++j > NumVRs) { SDValue PtrOff; // We are emitting Altivec params in order. LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, isPPC64, isTailCall, true, MemOpChains, TailCallArguments, dl); ArgOffset += 16; } } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // On Darwin, R12 must contain the address of an indirect callee. This does // not mean the MTCTR instruction must use R12; it's easier to model this as // an extra parameter, so do that. if (!isTailCall && !isFunctionGlobalAddress(Callee) && !isa(Callee) && !isBLACompatibleAddress(Callee, DAG)) RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : PPC::R12), Callee)); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
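// The loop below threads both the chain and a glue value through the copies:
//
//   Chain  = CopyToReg(Chain, Reg, Val, InFlag);  // copy argument into Reg
//   InFlag = Chain.getValue(1);                   // glue produced by the copy
//
// The glue ties each copy to the next one and ultimately to the call node, so
// the scheduler cannot hoist unrelated code in between and clobber the
// argument registers before the call is emitted.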
SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } bool PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn( Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) ? RetCC_PPC_Cold : RetCC_PPC); } SDValue PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) ? RetCC_PPC_Cold : RetCC_PPC); SDValue Flag; SmallVector RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[i]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; } Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (PPC::G8RCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else if (PPC::F8RCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else if (PPC::CRRCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i1)); else if (PPC::VRRCRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::Other)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Get the correct type for integers. EVT IntVT = Op.getValueType(); // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue FPSIdx = getFramePointerFrameIndex(DAG); // Build a DYNAREAOFFSET node. SDValue Ops[2] = {Chain, FPSIdx}; SDVTList VTs = DAG.getVTList(IntVT); return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); } SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const { // When we pop the dynamic allocation we need to restore the SP link. SDLoc dl(Op); // Get the correct type for pointers. 
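// In pointer terms the restore below amounts to (sketch only, ignoring the
// SelectionDAG plumbing):
//
//   void *Link = *(void **)CurrentSP;   // load the saved back-chain word
//   CurrentSP  = SaveSP;                // restore the stack pointer
//   *(void **)CurrentSP = Link;         // re-establish the back-chain link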
EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. bool isPPC64 = Subtarget.isPPC64(); unsigned SP = isPPC64 ? PPC::X1 : PPC::R1; SDValue StackPtr = DAG.getRegister(SP, PtrVT); // Get the operands for the STACKRESTORE. SDValue Chain = Op.getOperand(0); SDValue SaveSP = Op.getOperand(1); // Load the old link SP. SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo()); // Restore the stack pointer. Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP); // Store the old link SP. return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo()); } SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. PPCFunctionInfo *FI = MF.getInfo(); int RASI = FI->getReturnAddrSaveIndex(); // If the frame pointer save index hasn't been defined yet. if (!RASI) { // Find out what the fix offset of the frame pointer save area. int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset(); // Allocate the frame index for frame pointer save area. RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false); // Save the result. FI->setReturnAddrSaveIndex(RASI); } return DAG.getFrameIndex(RASI, PtrVT); } SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. PPCFunctionInfo *FI = MF.getInfo(); int FPSI = FI->getFramePointerSaveIndex(); // If the frame pointer save index hasn't been defined yet. if (!FPSI) { // Find out what the fix offset of the frame pointer save area. int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset(); // Allocate the frame index for frame pointer save area. FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); // Save the result. FI->setFramePointerSaveIndex(FPSI); } return DAG.getFrameIndex(FPSI, PtrVT); } SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); SDLoc dl(Op); // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Negate the size. SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, DAG.getConstant(0, dl, PtrVT), Size); // Construct a node for the frame pointer save index. SDValue FPSIdx = getFramePointerFrameIndex(DAG); // Build a DYNALLOC node. SDValue Ops[3] = { Chain, NegSize, FPSIdx }; SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); } SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 
8 : 4, 0, false); return DAG.getFrameIndex(FI, PtrVT); } SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorLoad(Op, DAG); assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 loads"); // First, load 8 bits into 32 bits, then truncate to 1 bit. SDLoc dl(Op); LoadSDNode *LD = cast(Op); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); MachineMemOperand *MMO = LD->getMemOperand(); SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, BasePtr, MVT::i8, MMO); SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; return DAG.getMergeValues(Ops, dl); } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(1).getValueType().isVector()) return LowerVectorStore(Op, DAG); assert(Op.getOperand(1).getValueType() == MVT::i1 && "Custom lowering only for i1 stores"); // First, zero extend to 32 bits, then use a truncating store to 8 bits. SDLoc dl(Op); StoreSDNode *ST = cast(Op); SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); SDValue Value = ST->getValue(); MachineMemOperand *MMO = ST->getMemOperand(); Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), Value); return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); } // FIXME: Remove this once the ANDI glue bug is fixed: SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 results"); SDLoc DL(Op); return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, Op.getOperand(0)); } /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Not FP? Not a fsel. if (!Op.getOperand(0).getValueType().isFloatingPoint() || !Op.getOperand(2).getValueType().isFloatingPoint()) return Op; // We might be able to do better than this under some circumstances, but in // general, fsel-based lowering of select is a finite-math-only optimization. // For more information, see section F.3 of the 2.06 ISA specification. if (!DAG.getTarget().Options.NoInfsFPMath || !DAG.getTarget().Options.NoNaNsFPMath) return Op; // TODO: Propagate flags from the select rather than global settings. SDNodeFlags Flags; Flags.setNoInfs(true); Flags.setNoNaNs(true); ISD::CondCode CC = cast(Op.getOperand(4))->get(); EVT ResVT = Op.getValueType(); EVT CmpVT = Op.getOperand(0).getValueType(); SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); SDLoc dl(Op); // If the RHS of the comparison is a 0.0, we don't need to do the // subtraction at all. SDValue Sel1; if (isFloatingPointZero(RHS)) switch (CC) { default: break; // SETUO etc aren't handled by fsel. 
case ISD::SETNE: std::swap(TV, FV); LLVM_FALLTHROUGH; case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt LLVM_FALLTHROUGH; case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); } SDValue Cmp; switch (CC) { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); LLVM_FALLTHROUGH; case ISD::SETEQ: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); } return Op; } void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode( Op.getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIWZ : (Subtarget.hasFPCVT() ? 
PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, MVT::f64, Src); break; } // Convert the FP value to an int value through memory. bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. SDValue Chain; if (i32Stack) { MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); } else Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); // Result is a load from the stack slot. If loading 4 bytes, make sure to // add in a bias on big endian. if (Op.getValueType() == MVT::i32 && !i32Stack) { FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, DAG.getConstant(4, dl, FIPtr.getValueType())); MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); } RLI.Chain = Chain; RLI.Ptr = FIPtr; RLI.MPI = MPI; } /// Custom lowers floating point to integer conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode( Op.getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIWZ : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, MVT::f64, Src); Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); break; } return Tmp; } SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { // FP to INT conversions are legal for f128. if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128)) return Op; // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). if (Op.getOperand(0).getValueType() == MVT::ppcf128) { if (Op.getValueType() == MVT::i32) { if (Op.getOpcode() == ISD::FP_TO_SINT) { SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(0), DAG.getIntPtrConstant(1, dl)); // Add the two halves of the long double in round-to-zero mode. SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); // Now use a smaller FP_TO_SINT. 
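// The FP_TO_UINT branch a few lines below uses the classic convert-via-signed
// trick; in plain C it is (sketch only, with double standing in for ppcf128
// and a hypothetical function name):
//
//   unsigned ConvertToU32(double X) {
//     const double TwoE31 = 2147483648.0;                  // 2^31
//     if (X >= TwoE31)                                      // too big for i32
//       return (unsigned)(int)(X - TwoE31) + 0x80000000u;   // fold bias back
//     return (unsigned)(int)X;
//   }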
return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res); } if (Op.getOpcode() == ISD::FP_TO_UINT) { const uint64_t TwoE31[] = {0x41e0000000000000LL, 0}; APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31)); SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X // FIXME: generated code sucks. // TODO: Are there fast-math-flags to propagate to this FSUB? SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Op.getOperand(0), Tmp); True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, DAG.getConstant(0x80000000, dl, MVT::i32)); SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op.getOperand(0)); return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False, ISD::SETGE); } } return SDValue(); } if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) return LowerFP_TO_INTDirectMove(Op, DAG, dl); ReuseLoadInfo RLI; LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); } // We're trying to insert a regular store, S, and then a load, L. If the // incoming value, O, is a load, we might just be able to have our load use the // address used by O. However, we don't know if anything else will store to // that address before we can load from it. To prevent this situation, we need // to insert our load, L, into the chain as a peer of O. To do this, we give L // the same chain operand as O, we create a token factor from the chain results // of O and L, and we replace all uses of O's chain result with that token // factor (see spliceIntoChain below for this last part). bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, SelectionDAG &DAG, ISD::LoadExtType ET) const { SDLoc dl(Op); if (ET == ISD::NON_EXTLOAD && (Op.getOpcode() == ISD::FP_TO_UINT || Op.getOpcode() == ISD::FP_TO_SINT) && isOperationLegalOrCustom(Op.getOpcode(), Op.getOperand(0).getValueType())) { LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); return true; } LoadSDNode *LD = dyn_cast(Op); if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || LD->isNonTemporal()) return false; if (LD->getMemoryVT() != MemVT) return false; RLI.Ptr = LD->getBasePtr(); if (LD->isIndexed() && !LD->getOffset().isUndef()) { assert(LD->getAddressingMode() == ISD::PRE_INC && "Non-pre-inc AM on PPC?"); RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, LD->getOffset()); } RLI.Chain = LD->getChain(); RLI.MPI = LD->getPointerInfo(); RLI.IsDereferenceable = LD->isDereferenceable(); RLI.IsInvariant = LD->isInvariant(); RLI.Alignment = LD->getAlignment(); RLI.AAInfo = LD->getAAInfo(); RLI.Ranges = LD->getRanges(); RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); return true; } // Given the head of the old chain, ResChain, insert a token factor containing // it and NewResChain, and make users of ResChain now be users of that token // factor. // TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead. 
void PPCTargetLowering::spliceIntoChain(SDValue ResChain, SDValue NewResChain, SelectionDAG &DAG) const { if (!ResChain) return; SDLoc dl(NewResChain); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, NewResChain, DAG.getUNDEF(MVT::Other)); assert(TF.getNode() != NewResChain.getNode() && "A new TF really is required here"); DAG.ReplaceAllUsesOfValueWith(ResChain, TF); DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); } /// Analyze profitability of direct move /// prefer float load to int load plus direct move /// when there is no integer use of int load bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { SDNode *Origin = Op.getOperand(0).getNode(); if (Origin->getOpcode() != ISD::LOAD) return true; // If there is no LXSIBZX/LXSIHZX, like Power8, // prefer direct move if the memory size is 1 or 2 bytes. MachineMemOperand *MMO = cast(Origin)->getMemOperand(); if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2) return true; for (SDNode::use_iterator UI = Origin->use_begin(), UE = Origin->use_end(); UI != UE; ++UI) { // Only look at the users of the loaded value. if (UI.getUse().get().getResNo() != 0) continue; if (UI->getOpcode() != ISD::SINT_TO_FP && UI->getOpcode() != ISD::UINT_TO_FP) return true; } return false; } /// Custom lowers integer to floating point conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Invalid floating point type as target of conversion"); assert(Subtarget.hasFPCVT() && "Int to FP conversions with direct moves require FPCVT"); SDValue FP; SDValue Src = Op.getOperand(0); bool SinglePrec = Op.getValueType() == MVT::f32; bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); if (WordInt) { FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, dl, MVT::f64, Src); FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); } else { FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); } return FP; } SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Conversions to f128 are legal. if (EnableQuadPrecision && (Op.getValueType() == MVT::f128)) return Op; if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) return SDValue(); SDValue Value = Op.getOperand(0); // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); if (Op.getValueType() != MVT::v4f64) Value = DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(), Value, DAG.getIntPtrConstant(1, dl)); return Value; } // Don't handle ppc_fp128 here; let it be lowered to a libcall. 
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); if (Op.getOperand(0).getValueType() == MVT::i1) return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), DAG.getConstantFP(1.0, dl, Op.getValueType()), DAG.getConstantFP(0.0, dl, Op.getValueType())); // If we have direct moves, we can do all the conversion, skip the store/load // however, without FPCVT we can't do most conversions. if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && Subtarget.isPPC64() && Subtarget.hasFPCVT()) return LowerINT_TO_FPDirectMove(Op, DAG, dl); assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { SDValue SINT = Op.getOperand(0); // When converting to single-precision, we actually need to convert // to double-precision first and then round to single-precision. // To avoid double-rounding effects during that operation, we have // to prepare the input operand. Bits that might be truncated when // converting to double-precision are replaced by a bit that won't // be lost at this stage, but is below the single-precision rounding // position. // // However, if -enable-unsafe-fp-math is in effect, accept double // rounding to avoid the extra overhead. if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && !DAG.getTarget().Options.UnsafeFPMath) { // Twiddle input to make sure the low 11 bits are zero. (If this // is the case, we are guaranteed the value will fit into the 53 bit // mantissa of an IEEE double-precision value without rounding.) // If any of those low 11 bits were not zero originally, make sure // bit 12 (value 2048) is set instead, so that the final rounding // to single-precision gets the correct result. SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, SINT, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::ADD, dl, MVT::i64, Round, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); Round = DAG.getNode(ISD::AND, dl, MVT::i64, Round, DAG.getConstant(-2048, dl, MVT::i64)); // However, we cannot use that value unconditionally: if the magnitude // of the input value is small, the bit-twiddling we did above might // end up visibly changing the output. Fortunately, in that case, we // don't need to twiddle bits since the original input will convert // exactly to double-precision floating-point already. Therefore, // construct a conditional to use the original value if the top 11 // bits are all sign-bit copies, and use the rounded value computed // above otherwise. 
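// In plain integer arithmetic, the preparation above plus the conditional
// below amount to (sketch only; assumes an arithmetic right shift):
//
//   int64_t Round = (SInt & 2047) + 2047;   // sets bit 11 iff any low bit set
//   Round = (Round | SInt) & ~2047LL;       // merge into SInt, clear low bits
//   bool TopIsSignCopies = (uint64_t)((SInt >> 53) + 1) <= 1;
//   int64_t Prepared = TopIsSignCopies ? SInt : Round;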
SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, SINT, DAG.getConstant(53, dl, MVT::i32)); Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, Cond, DAG.getConstant(1, dl, MVT::i64)); Cond = DAG.getSetCC(dl, MVT::i32, Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); } ReuseLoadInfo RLI; SDValue Bits; MachineFunction &MF = DAG.getMachineFunction(); if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (Subtarget.hasLFIWAX() && canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (Subtarget.hasFPCVT() && canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (((Subtarget.hasLFIWAX() && SINT.getOpcode() == ISD::SIGN_EXTEND) || (Subtarget.hasFPCVT() && SINT.getOpcode() == ISD::ZERO_EXTEND)) && SINT.getOperand(0).getValueType() == MVT::i32) { MachineFrameInfo &MFI = MF.getFrameInfo(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIdx = MFI.CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); } else Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); return FP; } assert(Op.getOperand(0).getValueType() == MVT::i32 && "Unhandled INT_TO_FP type in custom expander!"); // Since we only generate this in 64-bit mode, we can take advantage of // 64-bit registers. In particular, sign extend the input value into the // 64-bit register with extsw, store the WHOLE 64-bit value into the stack // then lfd it and fcfid it. 
MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); SDValue Ld; if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { ReuseLoadInfo RLI; bool ReusingLoad; if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, DAG))) { int FrameIdx = MFI.CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; } MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); if (ReusingLoad) spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); } else { assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); int FrameIdx = MFI.CreateStackObject(8, 8, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Op.getOperand(0)); // STD the extended value into the stack slot. SDValue Store = DAG.getStore( DAG.getEntryNode(), dl, Ext64, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); // Load the value as a double. Ld = DAG.getLoad( MVT::f64, dl, Store, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); } // FCFID it and return it. SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); return FP; } SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); /* The rounding mode is in bits 30:31 of FPSR, and has the following settings: 00 Round to nearest 01 Round to 0 10 Round to +inf 11 Round to -inf FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) */ MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Save FP Control Word to register EVT NodeTys[] = { MVT::f64, // return register MVT::Glue // unused in this context }; SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); // Save FP register to stack slot int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, MachinePointerInfo()); // Load FP Control Word from low 32 bits of stack slot. 
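// The bit manipulation below implements the table in the comment above; as a
// standalone function it is (sketch only; RN holds the two FPSCR RN bits):
//
//   int FltRounds(unsigned RN) {
//     unsigned CWD1 = RN & 3;
//     unsigned CWD2 = ((RN ^ 3) & 3) >> 1;   // same as (~RN & 3) >> 1
//     return (int)(CWD1 ^ CWD2);             // 0->1, 1->0, 2->2, 3->3
//   }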
SDValue Four = DAG.getConstant(4, dl, PtrVT); SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::AND, dl, MVT::i32, CWD, DAG.getConstant(3, dl, MVT::i32)); SDValue CWD2 = DAG.getNode(ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::AND, dl, MVT::i32, DAG.getNode(ISD::XOR, dl, MVT::i32, CWD, DAG.getConstant(3, dl, MVT::i32)), DAG.getConstant(3, dl, MVT::i32)), DAG.getConstant(1, dl, MVT::i32)); SDValue RetVal = DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); } SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); SDLoc dl(Op); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SHL!"); // Expand into a bunch of logical ops. Note that these ops // depend on the PPC behavior for oversized shift amounts. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRL!"); // Expand into a bunch of logical ops. Note that these ops // depend on the PPC behavior for oversized shift amounts. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRA!"); // Expand into a bunch of logical ops, followed by a select_cc. 
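// The *_PARTS expansions here rely on the PPC shift semantics for oversized
// amounts: a 64-bit sld/srd inspects seven bits of the amount and yields 0
// for amounts 64..127 rather than masking them to amt & 63. With that
// behaviour modelled explicitly, LowerSHL_PARTS above is equivalent to
// (sketch only, for BitWidth == 64; helper names are hypothetical):
//
//   uint64_t PPCShl(uint64_t X, unsigned Amt) {
//     return (Amt & 127) >= 64 ? 0 : X << (Amt & 63);
//   }
//   uint64_t PPCSrl(uint64_t X, unsigned Amt) {
//     return (Amt & 127) >= 64 ? 0 : X >> (Amt & 63);
//   }
//   void Shl128(uint64_t Lo, uint64_t Hi, unsigned Amt,
//               uint64_t &OutLo, uint64_t &OutHi) {
//     OutHi = PPCShl(Hi, Amt) | PPCSrl(Lo, 64 - Amt) | PPCShl(Lo, Amt - 64);
//     OutLo = PPCShl(Lo, Amt);
//   }
//
// The unsigned wrap-around in "64 - Amt" and "Amt - 64" is harmless because
// only the low seven bits of the amount are ever inspected.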
SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), Tmp4, Tmp6, ISD::SETLE); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } //===----------------------------------------------------------------------===// // Vector related lowering. // /// BuildSplatI - Build a canonical splati of Val with an element size of /// SplatSize. Cast the result to VT. static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); static const MVT VTys[] = { // canonical VT to use for each size. MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 }; EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. if (Val == -1) SplatSize = 1; EVT CanonicalVT = VTys[SplatSize-1]; // Build a canonical splat for this value. return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT)); } /// BuildIntrinsicOp - Return a unary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), Op); } /// BuildIntrinsicOp - Return a binary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = LHS.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); } /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, SDValue Op2, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op0.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); } /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { // Force LHS/RHS to be the right type. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); int Ops[16]; for (unsigned i = 0; i != 16; ++i) Ops[i] = i + Amt; SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, T); } /// Do we have an efficient pattern in a .td file for this node? /// /// \param V - pointer to the BuildVectorSDNode being matched /// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves? 
/// /// There are some patterns where it is beneficial to keep a BUILD_VECTOR /// node as a BUILD_VECTOR node rather than expanding it. The patterns where /// the opposite is true (expansion is beneficial) are: /// - The node builds a vector out of integers that are not 32 or 64-bits /// - The node builds a vector out of constants /// - The node is a "load-and-splat" /// In all other cases, we will choose to keep the BUILD_VECTOR. static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, bool HasDirectMove, bool HasP8Vector) { EVT VecVT = V->getValueType(0); bool RightType = VecVT == MVT::v2f64 || (HasP8Vector && VecVT == MVT::v4f32) || (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); if (!RightType) return false; bool IsSplat = true; bool IsLoad = false; SDValue Op0 = V->getOperand(0); // This function is called in a block that confirms the node is not a constant // splat. So a constant BUILD_VECTOR here means the vector is built out of // different constants. if (V->isConstant()) return false; for (int i = 0, e = V->getNumOperands(); i < e; ++i) { if (V->getOperand(i).isUndef()) return false; // We want to expand nodes that represent load-and-splat even if the // loaded value is a floating point truncation or conversion to int. if (V->getOperand(i).getOpcode() == ISD::LOAD || (V->getOperand(i).getOpcode() == ISD::FP_ROUND && V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT && V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT && V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD)) IsLoad = true; // If the operands are different or the input is not a load and has more // uses than just this BV node, then it isn't a splat. if (V->getOperand(i) != Op0 || (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode()))) IsSplat = false; } return !(IsSplat && IsLoad); } // Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128. SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Op0 = Op->getOperand(0); if (!EnableQuadPrecision || (Op.getValueType() != MVT::f128 ) || (Op0.getOpcode() != ISD::BUILD_PAIR) || (Op0.getOperand(0).getValueType() != MVT::i64) || (Op0.getOperand(1).getValueType() != MVT::i64)) return SDValue(); return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0), Op0.getOperand(1)); } // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen // this case more efficiently than a constant pool load, lower it to the // sequence of ops that should be used. SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { // We first build an i32 vector, load it into a QPX register, // then convert it to a floating-point vector and compare it // to a zero vector to get the boolean result. 
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); assert(BVN->getNumOperands() == 4 && "BUILD_VECTOR for v4i1 does not have 4 operands"); bool IsConst = true; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) continue; if (!isa(BVN->getOperand(i))) { IsConst = false; break; } } if (IsConst) { Constant *One = ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); Constant *NegOne = ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); Constant *CV[4]; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); else if (isNullConstant(BVN->getOperand(i))) CV[i] = NegOne; else CV[i] = One; } Constant *CP = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), 16 /* alignment */); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); return DAG.getMemIntrinsicNode( PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SmallVector Stores; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) continue; unsigned Offset = 4*i; SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); if (StoreSize > 4) { Stores.push_back( DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, PtrInfo.getWithOffset(Offset), MVT::i32)); } else { SDValue StoreValue = BVN->getOperand(i); if (StoreSize < 4) StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, PtrInfo.getWithOffset(Offset))); } } SDValue StoreChain; if (!Stores.empty()) StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); else StoreChain = DAG.getEntryNode(); // Now load from v4i32 into the QPX register; this will extend it to // v4i64 but not yet convert it to a floating point. Nevertheless, this // is typed as v4f64 because the QPX register integer states are not // explicitly represented. SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), FIdx}; SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, VTs, Ops, MVT::v4i32, PtrInfo); LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), LoadedVect); SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); } // All other QPX vectors are handled by generic code. if (Subtarget.hasQPX()) return SDValue(); // Check if this is a splat of a constant value. APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) { // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. 
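// Further below, the splat value is sign-extended from its natural width with
// the shift-up/shift-down idiom; as a standalone function (sketch only):
//
//   int32_t SignExtendSplat(uint32_t Bits, unsigned SplatBitSize) {
//     return int32_t(Bits << (32 - SplatBitSize)) >> (32 - SplatBitSize);
//   }
//
// e.g. SignExtendSplat(0xFFF0, 16) == -16, so a v8i16 splat of 0xFFF0 can be
// materialized with a single vspltish -16.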
if (Subtarget.hasVSX() && haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), Subtarget.hasP8Vector())) return Op; return SDValue(); } unsigned SplatBits = APSplatBits.getZExtValue(); unsigned SplatUndef = APSplatUndef.getZExtValue(); unsigned SplatSize = SplatBitSize / 8; // First, handle single instruction cases. // All zeros? if (SplatBits == 0) { // Canonicalize all zero vectors to be v4i32. if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { SDValue Z = DAG.getConstant(0, dl, MVT::v4i32); Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); } return Op; } // We have XXSPLTIB for constant splats one byte wide if (Subtarget.hasP9Vector() && SplatSize == 1) { // This is a splat of 1-byte elements with some elements potentially undef. // Rather than trying to match undef in the SDAG patterns, ensure that all // elements are the same constant. if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) { SmallVector Ops(16, DAG.getConstant(SplatBits, dl, MVT::i32)); SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops); if (Op.getValueType() != MVT::v16i8) return DAG.getBitcast(Op.getValueType(), NewBV); return NewBV; } // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll // detect that constant splats like v8i16: 0xABAB are really just splats // of a 1-byte constant. In this case, we need to convert the node to a // splat of v16i8 and a bitcast. if (Op.getValueType() != MVT::v16i8) return DAG.getBitcast(Op.getValueType(), DAG.getConstant(SplatBits, dl, MVT::v16i8)); return Op; } // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> (32-SplatBitSize)); if (SextVal >= -16 && SextVal <= 15) return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) // If this value is in the range [17,31] and is odd, use: // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) // If this value is in the range [-31,-17] and is odd, use: // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) // Note the last two are three-instruction sequences. if (SextVal >= -32 && SextVal <= 31) { // To avoid having these optimizations undone by constant folding, // we convert to a pseudo that will be expanded later into one of // the above forms. SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32); EVT VT = (SplatSize == 1 ? MVT::v16i8 : (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32); SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); if (VT == Op.getValueType()) return RetVal; else return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important // for fneg/fabs. if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { // Make -1 and vspltisw -1: SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); // Make the VSLW intrinsic, computing 0x8000_0000. SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, OnesV, DAG, dl); // xor by OnesV to invert it. Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // Check to see if this is a wide variety of vsplti*, binop self cases. 
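  // For example, a v4i32 splat of 64 cannot use vspltisw directly (the
  // immediate range is [-16,15]) or the VADD_SPLAT pseudo above, but it does
  // match the "vsplti + shl self" case below with i == 4: vspltisw 4
  // followed by vslw of that result with itself shifts each word left by 4,
  // producing 64 without a constant pool load.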
static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 }; for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { // Indirect through the SplatCsts array so that we favor 'vsplti -1' for // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' int i = SplatCsts[idx]; // Figure out what shift amount will be used by altivec if shifted by i in // this splat size. unsigned TypeShiftAmt = i & (SplatBitSize-1); // vsplti + shl self. if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, Intrinsic::ppc_altivec_vslw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + srl self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, Intrinsic::ppc_altivec_vsrw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + sra self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, Intrinsic::ppc_altivec_vsraw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + rol self. if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, Intrinsic::ppc_altivec_vrlw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // t = vsplti c, result = vsldoi t, t, 1 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } } return SDValue(); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. 
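/// The decoding below implies the entry layout: bits 31-30 hold the cost
/// (consulted in LowerVECTOR_SHUFFLE), bits 29-26 the operation, and the two
/// 13-bit fields the LHS and RHS sub-entries. For OP_COPY the 13-bit ID packs
/// the four selected word elements in base 9, so <0,1,2,3> encodes as
/// ((0*9+1)*9+2)*9+3 == 102 (return LHS) and <4,5,6,7> as
/// ((4*9+5)*9+6)*9+7 == 3382 (return RHS).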
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VMRGHW, OP_VMRGLW, OP_VSPLTISW0, OP_VSPLTISW1, OP_VSPLTISW2, OP_VSPLTISW3, OP_VSLDOI4, OP_VSLDOI8, OP_VSLDOI12 }; if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); int ShufIdxs[16]; switch (OpNum) { default: llvm_unreachable("Unknown i32 permute!"); case OP_VMRGHW: ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; break; case OP_VMRGLW: ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; break; case OP_VSPLTISW0: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+0; break; case OP_VSPLTISW1: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+4; break; case OP_VSPLTISW2: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+8; break; case OP_VSPLTISW3: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+12; break; case OP_VSLDOI4: return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); case OP_VSLDOI8: return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); case OP_VSLDOI12: return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); } EVT VT = OpLHS.getValueType(); OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs); return DAG.getNode(ISD::BITCAST, dl, VT, T); } /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled /// by the VINSERTB instruction introduced in ISA 3.0, else just return default /// SDValue. SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const { const unsigned BytesInVector = 16; bool IsLE = Subtarget.isLittleEndian(); SDLoc dl(N); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned ShiftElts = 0, InsertAtByte = 0; bool Swap = false; // Shifts required to get the byte we want at element 7. unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9}; unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8}; ArrayRef Mask = N->getMask(); int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; // For each mask element, find out if we're just inserting something // from V2 into V1 or vice versa. // Possible permutations inserting an element from V2 into V1: // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // ... 
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X // Inserting from V1 into V2 will be similar, except mask range will be // [16,31]. bool FoundCandidate = false; // If both vector operands for the shuffle are the same vector, the mask // will contain only elements from the first one and the second one will be // undef. unsigned VINSERTBSrcElem = IsLE ? 8 : 7; // Go through the mask of half-words to find an element that's being moved // from one vector to the other. for (unsigned i = 0; i < BytesInVector; ++i) { unsigned CurrentElement = Mask[i]; // If 2nd operand is undefined, we should only look for element 7 in the // Mask. if (V2.isUndef() && CurrentElement != VINSERTBSrcElem) continue; bool OtherElementsInOrder = true; // Examine the other elements in the Mask to see if they're in original // order. for (unsigned j = 0; j < BytesInVector; ++j) { if (j == i) continue; // If CurrentElement is from V1 [0,15], then we the rest of the Mask to be // from V2 [16,31] and vice versa. Unless the 2nd operand is undefined, // in which we always assume we're always picking from the 1st operand. int MaskOffset = (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0; if (Mask[j] != OriginalOrder[j] + MaskOffset) { OtherElementsInOrder = false; break; } } // If other elements are in original order, we record the number of shifts // we need to get the element we want into element 7. Also record which byte // in the vector we should insert into. if (OtherElementsInOrder) { // If 2nd operand is undefined, we assume no shifts and no swapping. if (V2.isUndef()) { ShiftElts = 0; Swap = false; } else { // Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4. ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF] : BigEndianShifts[CurrentElement & 0xF]; Swap = CurrentElement < BytesInVector; } InsertAtByte = IsLE ? BytesInVector - (i + 1) : i; FoundCandidate = true; break; } } if (!FoundCandidate) return SDValue(); // Candidate found, construct the proper SDAG sequence with VINSERTB, // optionally with VECSHL if shift is required. if (Swap) std::swap(V1, V2); if (V2.isUndef()) V2 = V1; if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, DAG.getConstant(ShiftElts, dl, MVT::i32)); return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); } return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); } /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled /// by the VINSERTH instruction introduced in ISA 3.0, else just return default /// SDValue. SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const { const unsigned NumHalfWords = 8; const unsigned BytesInVector = NumHalfWords * 2; // Check that the shuffle is on half-words. if (!isNByteElemShuffleMask(N, 2, 1)) return SDValue(); bool IsLE = Subtarget.isLittleEndian(); SDLoc dl(N); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned ShiftElts = 0, InsertAtByte = 0; bool Swap = false; // Shifts required to get the half-word we want at element 3. unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5}; unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4}; uint32_t Mask = 0; uint32_t OriginalOrderLow = 0x1234567; uint32_t OriginalOrderHigh = 0x89ABCDEF; // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a // 32-bit space, only need 4-bit nibbles per element. 
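  // For example, a shuffle whose byte mask selects half-words 0..7 in order
  // packs to 0x01234567 (OriginalOrderLow), and one selecting half-words
  // 8..15 packs to 0x89ABCDEF (OriginalOrderHigh).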
for (unsigned i = 0; i < NumHalfWords; ++i) { unsigned MaskShift = (NumHalfWords - 1 - i) * 4; Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift); } // For each mask element, find out if we're just inserting something // from V2 into V1 or vice versa. Possible permutations inserting an element // from V2 into V1: // X, 1, 2, 3, 4, 5, 6, 7 // 0, X, 2, 3, 4, 5, 6, 7 // 0, 1, X, 3, 4, 5, 6, 7 // 0, 1, 2, X, 4, 5, 6, 7 // 0, 1, 2, 3, X, 5, 6, 7 // 0, 1, 2, 3, 4, X, 6, 7 // 0, 1, 2, 3, 4, 5, X, 7 // 0, 1, 2, 3, 4, 5, 6, X // Inserting from V1 into V2 will be similar, except mask range will be [8,15]. bool FoundCandidate = false; // Go through the mask of half-words to find an element that's being moved // from one vector to the other. for (unsigned i = 0; i < NumHalfWords; ++i) { unsigned MaskShift = (NumHalfWords - 1 - i) * 4; uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF; uint32_t MaskOtherElts = ~(0xF << MaskShift); uint32_t TargetOrder = 0x0; // If both vector operands for the shuffle are the same vector, the mask // will contain only elements from the first one and the second one will be // undef. if (V2.isUndef()) { ShiftElts = 0; unsigned VINSERTHSrcElem = IsLE ? 4 : 3; TargetOrder = OriginalOrderLow; Swap = false; // Skip if not the correct element or mask of other elements don't equal // to our expected order. if (MaskOneElt == VINSERTHSrcElem && (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; FoundCandidate = true; break; } } else { // If both operands are defined. // Target order is [8,15] if the current mask is between [0,7]. TargetOrder = (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow; // Skip if mask of other elements don't equal our expected order. if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { // We only need the last 3 bits for the number of shifts. ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7] : BigEndianShifts[MaskOneElt & 0x7]; InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; Swap = MaskOneElt < NumHalfWords; FoundCandidate = true; break; } } } if (!FoundCandidate) return SDValue(); // Candidate found, construct the proper SDAG sequence with VINSERTH, // optionally with VECSHL if shift is required. if (Swap) std::swap(V1, V2); if (V2.isUndef()) V2 = V1; SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); if (ShiftElts) { // Double ShiftElts because we're left shifting on v16i8 type. SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, DAG.getConstant(2 * ShiftElts, dl, MVT::i32)); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl); SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this /// is a shuffle we can handle in a single instruction, return it. Otherwise, /// return the code it can be lowered into. Worst case, it can always be /// lowered into a vperm. 
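/// The single-instruction forms tried below include xxinsertw, vinsertb,
/// vinserth, xxsldwi, xxpermdi, the xxbr* byte reversals, and the fixed
/// Altivec patterns (vsplt*, vmrg*, vpku*, vsldoi); only when none of those
/// and the perfect-shuffle table apply do we build a vperm control vector.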
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast(Op); EVT VT = Op.getValueType(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ShiftElts, InsertAtByte; bool Swap = false; if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2); if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } if (Subtarget.hasP9Altivec()) { SDValue NewISDNode; if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) return NewISDNode; if ((NewISDNode = lowerToVINSERTB(SVOp, DAG))) return NewISDNode; } if (Subtarget.hasVSX() && PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); } if (Subtarget.hasVSX() && PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2); SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); } if (Subtarget.hasP9Vector()) { if (PPC::isXXBRHShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); } else if (PPC::isXXBRWShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); } else if (PPC::isXXBRDShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); } else if (PPC::isXXBRQShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); } } if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); // If the source for the shuffle is a scalar_to_vector that came from a // 32-bit load, it will have used LXVWSX so we don't need to splat again. 
if (Subtarget.hasP9Vector() && ((isLittleEndian && SplatIdx == 3) || (!isLittleEndian && SplatIdx == 0))) { SDValue Src = V1.getOperand(0); if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR && Src.getOperand(0).getOpcode() == ISD::LOAD && Src.getOperand(0).hasOneUse()) return V1; } SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, DAG.getConstant(SplatIdx, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat); } // Left shifts of 8 bytes are actually swaps. Convert accordingly. if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); } } if (Subtarget.hasQPX()) { if (VT.getVectorNumElements() != 4) return SDValue(); if (V2.isUndef()) V2 = V1; int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); if (AlignIdx != -1) { return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, DAG.getConstant(AlignIdx, dl, MVT::i32)); } else if (SVOp->isSplat()) { int SplatIdx = SVOp->getSplatIndex(); if (SplatIdx >= 4) { std::swap(V1, V2); SplatIdx -= 4; } return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, DAG.getConstant(SplatIdx, dl, MVT::i32)); } // Lower this into a qvgpci/qvfperm pair. // Compute the qvgpci literal unsigned idx = 0; for (unsigned i = 0; i < 4; ++i) { int m = SVOp->getMaskElt(i); unsigned mm = m >= 0 ? (unsigned) m : i; idx |= mm << (3-i)*3; } SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, DAG.getConstant(idx, dl, MVT::i32)); return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); } // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. if (V2.isUndef()) { if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || (Subtarget.hasP8Altivec() && ( PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { return Op; } } // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. unsigned int ShuffleKind = isLittleEndian ? 
2 : 0; if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || (Subtarget.hasP8Altivec() && ( PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our // perfect shuffle table to emit an optimal matching sequence. ArrayRef PermMask = SVOp->getMask(); unsigned PFIndexes[4]; bool isFourElementShuffle = true; for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number unsigned EltNo = 8; // Start out undef. for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. if (PermMask[i*4+j] < 0) continue; // Undef, ignore it. unsigned ByteSource = PermMask[i*4+j]; if ((ByteSource & 3) != j) { isFourElementShuffle = false; break; } if (EltNo == 8) { EltNo = ByteSource/4; } else if (EltNo != ByteSource/4) { isFourElementShuffle = false; break; } } PFIndexes[i] = EltNo; } // If this shuffle can be expressed as a shuffle of 4-byte elements, use the // perfect shuffle vector to determine if it is cost effective to do this as // discrete instructions, or whether we should use a vperm. // For now, we skip this for little endian until such time as we have a // little-endian perfect shuffle table. if (isFourElementShuffle && !isLittleEndian) { // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); // Determining when to avoid vperm is tricky. Many things affect the cost // of vperm, particularly how many times the perm mask needs to be computed. // For example, if the perm mask can be hoisted out of a loop or is already // used (perhaps because there are multiple permutes with the same shuffle // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of // the loop requires an extra register. // // As a compromise, we only emit discrete instructions if the shuffle can be // generated in 3 or fewer operations. When we have loop information // available, if this block is within a loop, we should avoid using vperm // for 3-operation perms and use a constant pool load instead. if (Cost < 3) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant // vector that will get spilled to the constant pool. if (V2.isUndef()) V2 = V1; // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. // For little endian, the order of the input vectors is reversed, and // the permutation mask is complemented with respect to 31. This is // necessary to produce proper semantics with the big-endian-biased vperm // instruction. 
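  // Concretely, byte j of source element SrcElt becomes control byte
  // SrcElt*BytesPerElement + j on big-endian targets, and
  // 31 - (SrcElt*BytesPerElement + j) on little-endian targets, where the
  // two inputs to the VPERM node are also swapped.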
EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; SmallVector ResultMask; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), dl, MVT::i32)); else ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, MVT::i32)); } SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); if (isLittleEndian) return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V2, V1, VPermMask); else return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); } /// getVectorCompareInfo - Given an intrinsic, return false if it is not a /// vector comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; isDot = false; switch (IntrinsicID) { default: return false; // Comparison predicates. case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = true; break; case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 199; isDot = true; } else return false; break; case Intrinsic::ppc_altivec_vcmpneb_p: case Intrinsic::ppc_altivec_vcmpneh_p: case Intrinsic::ppc_altivec_vcmpnew_p: case Intrinsic::ppc_altivec_vcmpnezb_p: case Intrinsic::ppc_altivec_vcmpnezh_p: case Intrinsic::ppc_altivec_vcmpnezw_p: if (Subtarget.hasP9Altivec()) { switch (IntrinsicID) { default: llvm_unreachable("Unknown comparison intrinsic."); case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break; case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break; case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break; case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break; case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break; case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break; } isDot = true; } else return false; break; case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 967; isDot = true; } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 711; isDot = true; } else return false; break; // VSX predicate comparisons use the same infrastructure case Intrinsic::ppc_vsx_xvcmpeqdp_p: case Intrinsic::ppc_vsx_xvcmpgedp_p: case 
Intrinsic::ppc_vsx_xvcmpgtdp_p: case Intrinsic::ppc_vsx_xvcmpeqsp_p: case Intrinsic::ppc_vsx_xvcmpgesp_p: case Intrinsic::ppc_vsx_xvcmpgtsp_p: if (Subtarget.hasVSX()) { switch (IntrinsicID) { case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; } isDot = true; } else return false; break; // Normal Comparisons. case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; break; case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; break; case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; break; case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; break; case Intrinsic::ppc_altivec_vcmpequd: if (Subtarget.hasP8Altivec()) CompareOpc = 199; else return false; break; case Intrinsic::ppc_altivec_vcmpneb: case Intrinsic::ppc_altivec_vcmpneh: case Intrinsic::ppc_altivec_vcmpnew: case Intrinsic::ppc_altivec_vcmpnezb: case Intrinsic::ppc_altivec_vcmpnezh: case Intrinsic::ppc_altivec_vcmpnezw: if (Subtarget.hasP9Altivec()) switch (IntrinsicID) { default: llvm_unreachable("Unknown comparison intrinsic."); case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break; case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break; case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break; case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break; case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break; case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break; } else return false; break; case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; break; case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; break; case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; break; case Intrinsic::ppc_altivec_vcmpgtsd: if (Subtarget.hasP8Altivec()) CompareOpc = 967; else return false; break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; break; case Intrinsic::ppc_altivec_vcmpgtud: if (Subtarget.hasP8Altivec()) CompareOpc = 711; else return false; break; } return true; } /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom /// lower, do it, otherwise return null. SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. if (Subtarget.isPPC64()) return DAG.getRegister(PPC::X13, MVT::i64); return DAG.getRegister(PPC::R2, MVT::i32); } // We are looking for absolute values here. 
// The idea is to try to fit one of two patterns: // max (a, (0-a)) OR max ((0-a), a) if (Subtarget.hasP9Vector() && (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { SDValue V1 = Op.getOperand(1); SDValue V2 = Op.getOperand(2); if (V1.getSimpleValueType() == V2.getSimpleValueType() && (V1.getSimpleValueType() == MVT::v4i32 || V1.getSimpleValueType() == MVT::v8i16 || V1.getSimpleValueType() == MVT::v16i8)) { if ( V1.getOpcode() == ISD::SUB && ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && V1.getOperand(1) == V2 ) { // Generate the abs instruction with the operands return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); } if ( V2.getOpcode() == ISD::SUB && ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && V2.getOperand(1) == V1 ) { // Generate the abs instruction with the operands return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); } } } // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. int CompareOpc; bool isDot; if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. if (!isDot) { SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), Op.getOperand(1), Op.getOperand(2), DAG.getConstant(CompareOpc, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); } // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { Op.getOperand(2), // LHS Op.getOperand(3), // RHS DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Now that we have the comparison, emit a copy from the CR to a GPR. // This is flagged to the above dot comparison. SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, DAG.getRegister(PPC::CR6, MVT::i32), CompNode.getValue(1)); // Unpack the result based on how the target uses it. unsigned BitNo; // Bit # of CR6. bool InvertBit; // Invert result? switch (cast(Op.getOperand(1))->getZExtValue()) { default: // Can't happen, don't crash on invalid number though. case 0: // Return the value of the EQ bit of CR6. BitNo = 0; InvertBit = false; break; case 1: // Return the inverted value of the EQ bit of CR6. BitNo = 0; InvertBit = true; break; case 2: // Return the value of the LT bit of CR6. BitNo = 2; InvertBit = false; break; case 3: // Return the inverted value of the LT bit of CR6. BitNo = 2; InvertBit = true; break; } // Shift the bit into the low position. Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32)); // Isolate the bit. Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, DAG.getConstant(1, dl, MVT::i32)); // If we are supposed to, toggle the bit. if (InvertBit) Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags, DAG.getConstant(1, dl, MVT::i32)); return Flags; } SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to // the beginning of the argument list. int ArgStart = isa(Op.getOperand(0)) ? 
0 : 1; SDLoc DL(Op); switch (cast(Op.getOperand(ArgStart))->getZExtValue()) { case Intrinsic::ppc_cfence: { assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op.getOperand(ArgStart + 1)), Op.getOperand(0)), 0); } default: break; } return SDValue(); } SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { // Check for a DIV with the same operands as this REM. for (auto UI : Op.getOperand(1)->uses()) { if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) || (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV)) if (UI->getOperand(0) == Op.getOperand(0) && UI->getOperand(1) == Op.getOperand(1)) return SDValue(); } return Op; } // Lower scalar BSWAP64 to xxbrd. SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // MTVSRDD Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), Op.getOperand(0)); // XXBRD Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); // MFVSRD int VectorIndex = 0; if (Subtarget.isLittleEndian()) VectorIndex = 1; Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, DAG.getTargetConstant(VectorIndex, dl, MVT::i32)); return Op; } // ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be // compared to a value that is atomically loaded (atomic loads zero-extend). SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP && "Expecting an atomic compare-and-swap here."); SDLoc dl(Op); auto *AtomicNode = cast(Op.getNode()); EVT MemVT = AtomicNode->getMemoryVT(); if (MemVT.getSizeInBits() >= 32) return Op; SDValue CmpOp = Op.getOperand(2); // If this is already correctly zero-extended, leave it alone. auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits()); if (DAG.MaskedValueIsZero(CmpOp, HighBits)) return Op; // Clear the high bits of the compare operand. unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1; SDValue NewCmpOp = DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp, DAG.getConstant(MaskVal, dl, MVT::i32)); // Replace the existing compare operand with the properly zero-extended one. SmallVector Ops; for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++) Ops.push_back(AtomicNode->getOperand(i)); Ops[2] = NewCmpOp; MachineMemOperand *MMO = AtomicNode->getMemOperand(); SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other); auto NodeTy = (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16; return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO); } SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int // instructions), but for smaller types, we need to first extend up to v2i32 // before doing going farther. 
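  // For example, a v2i64 SIGN_EXTEND_INREG from v2i16 is emitted below as a
  // v4i32 SIGN_EXTEND_INREG from v4i16 on the bitcast value, followed by a
  // v2i64 SIGN_EXTEND_INREG from v2i32.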
if (Op.getValueType() == MVT::v2i64) { EVT ExtVT = cast(Op.getOperand(1))->getVT(); if (ExtVT != MVT::v2i32) { Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)); Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op, DAG.getValueType(EVT::getVectorVT(*DAG.getContext(), ExtVT.getVectorElementType(), 4))); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op); Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op, DAG.getValueType(MVT::v2i32)); } return Op; } return SDValue(); } SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); // Store the input value into Value#0 of the stack slot. SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, MachinePointerInfo()); // Load it out. return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Should only be called for ISD::INSERT_VECTOR_ELT"); ConstantSDNode *C = dyn_cast(Op.getOperand(2)); // We have legal lowering for constant indices but not for variable ones. if (!C) return SDValue(); EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. if (VT == MVT::v8i16 || VT == MVT::v16i8) { SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8; unsigned InsertAtElement = C->getZExtValue(); unsigned InsertAtByte = InsertAtElement * BytesInEachElement; if (Subtarget.isLittleEndian()) { InsertAtByte = (16 - BytesInEachElement) - InsertAtByte; } return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz, DAG.getConstant(InsertAtByte, dl, MVT::i32)); } return Op; } SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDNode *N = Op.getNode(); assert(N->getOperand(0).getValueType() == MVT::v4i1 && "Unknown extract_vector_elt type"); SDValue Value = N->getOperand(0); // The first part of this is like the store lowering except that we don't // need to track the chain. // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to // understand how to form the extending load. SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), Value); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue StoreChain = DAG.getEntryNode(); SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), Value, FIdx}; SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, VTs, Ops, MVT::v4i32, PtrInfo); // Extract the value requested. unsigned Offset = 4*cast(N->getOperand(1))->getZExtValue(); SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); SDValue IntVal = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); if (!Subtarget.useCRBits()) return IntVal; return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); } /// Lowering for QPX v4i1 loads SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); LoadSDNode *LN = cast(Op.getNode()); SDValue LoadChain = LN->getChain(); SDValue BasePtr = LN->getBasePtr(); if (Op.getValueType() == MVT::v4f64 || Op.getValueType() == MVT::v4f32) { EVT MemVT = LN->getMemoryVT(); unsigned Alignment = LN->getAlignment(); // If this load is properly aligned, then it is legal. if (Alignment >= MemVT.getStoreSize()) return Op; EVT ScalarVT = Op.getValueType().getScalarType(), ScalarMemVT = MemVT.getScalarType(); unsigned Stride = ScalarMemVT.getStoreSize(); SDValue Vals[4], LoadChains[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { SDValue Load; if (ScalarVT != ScalarMemVT) Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, BasePtr, LN->getPointerInfo().getWithOffset(Idx * Stride), ScalarMemVT, MinAlign(Alignment, Idx * Stride), LN->getMemOperand()->getFlags(), LN->getAAInfo()); else Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, LN->getPointerInfo().getWithOffset(Idx * Stride), MinAlign(Alignment, Idx * Stride), LN->getMemOperand()->getFlags(), LN->getAAInfo()); if (Idx == 0 && LN->isIndexed()) { assert(LN->getAddressingMode() == ISD::PRE_INC && "Unknown addressing mode on vector load"); Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), LN->getAddressingMode()); } Vals[Idx] = Load; LoadChains[Idx] = Load.getValue(1); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, dl, BasePtr.getValueType())); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); if (LN->isIndexed()) { SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; return DAG.getMergeValues(RetOps, dl); } SDValue RetOps[] = { Value, TF }; return DAG.getMergeValues(RetOps, dl); } assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); // To lower v4i1 from a byte array, we load the byte elements of the // vector and then reuse the BUILD_VECTOR logic. 
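  // That is, the four bytes are loaded with individual i8 extending loads,
  // their chains are merged with a TokenFactor, and the loaded values are
  // passed to getBuildVector(MVT::v4i1, ...) below.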
SDValue VectElmts[4], VectElmtChains[4]; for (unsigned i = 0; i < 4; ++i) { SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); VectElmts[i] = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, LN->getPointerInfo().getWithOffset(i), MVT::i8, /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); VectElmtChains[i] = VectElmts[i].getValue(1); } LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); SDValue RVals[] = { Value, LoadChain }; return DAG.getMergeValues(RVals, dl); } /// Lowering for QPX v4i1 stores SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); StoreSDNode *SN = cast(Op.getNode()); SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); if (Value.getValueType() == MVT::v4f64 || Value.getValueType() == MVT::v4f32) { EVT MemVT = SN->getMemoryVT(); unsigned Alignment = SN->getAlignment(); // If this store is properly aligned, then it is legal. if (Alignment >= MemVT.getStoreSize()) return Op; EVT ScalarVT = Value.getValueType().getScalarType(), ScalarMemVT = MemVT.getScalarType(); unsigned Stride = ScalarMemVT.getStoreSize(); SDValue Stores[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { SDValue Ex = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); SDValue Store; if (ScalarVT != ScalarMemVT) Store = DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, SN->getPointerInfo().getWithOffset(Idx * Stride), ScalarMemVT, MinAlign(Alignment, Idx * Stride), SN->getMemOperand()->getFlags(), SN->getAAInfo()); else Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, SN->getPointerInfo().getWithOffset(Idx * Stride), MinAlign(Alignment, Idx * Stride), SN->getMemOperand()->getFlags(), SN->getAAInfo()); if (Idx == 0 && SN->isIndexed()) { assert(SN->getAddressingMode() == ISD::PRE_INC && "Unknown addressing mode on vector store"); Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), SN->getAddressingMode()); } BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, dl, BasePtr.getValueType())); Stores[Idx] = Store; } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); if (SN->isIndexed()) { SDValue RetOps[] = { TF, Stores[0].getValue(1) }; return DAG.getMergeValues(RetOps, dl); } return TF; } assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to // understand how to form the extending load. SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), Value); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), Value, FIdx}; SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, VTs, Ops, MVT::v4i32, PtrInfo); // Move data into the byte array. SDValue Loads[4], LoadChains[4]; for (unsigned i = 0; i < 4; ++i) { unsigned Offset = 4*i; SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); LoadChains[i] = Loads[i].getValue(1); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Stores[4]; for (unsigned i = 0; i < 4; ++i) { SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); Stores[i] = DAG.getTruncStore( StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), SN->getAAInfo()); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); return StoreChain; } SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. SDValue RHSSwap = // = vrlw RHS, 16 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); // Shrinkify inputs to v8i16. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS); RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap); // Low parts multiplied together, generating 32-bit results (we ignore the // top parts). SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, LHS, RHS, DAG, dl, MVT::v4i32); SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); // Shift the high parts up 16 bits. HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG, dl); return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); } else if (Op.getValueType() == MVT::v8i16) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); bool isLittleEndian = Subtarget.isLittleEndian(); // Multiply the even 8-bit parts, producing 16-bit sums. SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, LHS, RHS, DAG, dl, MVT::v8i16); EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts); // Multiply the odd 8-bit parts, producing 16-bit sums. 
SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, LHS, RHS, DAG, dl, MVT::v8i16); OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); // Merge the results together. Because vmuleub and vmuloub are // instructions with a big-endian bias, we must reverse the // element numbering and reverse the meaning of "odd" and "even" // when generating little endian code. int Ops[16]; for (unsigned i = 0; i != 8; ++i) { if (isLittleEndian) { Ops[i*2 ] = 2*i; Ops[i*2+1] = 2*i+16; } else { Ops[i*2 ] = 2*i+1; Ops[i*2+1] = 2*i+1+16; } } if (isLittleEndian) return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); else return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); } else { llvm_unreachable("Unknown mul to lower!"); } } /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Wasn't expecting to be able to lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); // Variable argument lowering. case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); // Exception handling lowering. case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op)); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); // Lower 64-bit shifts. case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); // Vector-related lowering. case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); // Frame & Return address. 
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::SREM: case ISD::UREM: return LowerREM(Op, DAG); case ISD::BSWAP: return LowerBSWAP(Op, DAG); case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); } } void PPCTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::READCYCLECOUNTER: { SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); Results.push_back(RTB); Results.push_back(RTB.getValue(1)); Results.push_back(RTB.getValue(2)); break; } case ISD::INTRINSIC_W_CHAIN: { if (cast(N->getOperand(1))->getZExtValue() != Intrinsic::ppc_is_decremented_ctr_nonzero) break; assert(N->getValueType(0) == MVT::i1 && "Unexpected result type for CTR decrement intrinsic"); EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), N->getOperand(1)); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt)); Results.push_back(NewInt.getValue(1)); break; } case ISD::VAARG: { if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) return; EVT VT = N->getValueType(0); if (VT == MVT::i64) { SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); Results.push_back(NewNode); Results.push_back(NewNode.getValue(1)); } return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: // LowerFP_TO_INT() can only handle f32 and f64. if (N->getOperand(0).getValueType() == MVT::ppcf128) return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; } } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *Func = Intrinsic::getDeclaration(M, Id); return Builder.CreateCall(Func, {}); } // The mappings for emitLeading/TrailingFence is taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); return nullptr; } Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa(Inst) && Subtarget.isPPC64()) return Builder.CreateCall( Intrinsic::getDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), Intrinsic::ppc_cfence, {Inst->getType()}), {Inst}); // FIXME: Can use isync for rmw operation. 
return callIntrinsic(Builder, Intrinsic::ppc_lwsync); } return nullptr; } MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, unsigned AtomicSize, unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); auto LoadMnemonic = PPC::LDARX; auto StoreMnemonic = PPC::STDCX; switch (AtomicSize) { default: llvm_unreachable("Unexpected size of atomic entity"); case 1: LoadMnemonic = PPC::LBARX; StoreMnemonic = PPC::STBCX; assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); break; case 2: LoadMnemonic = PPC::LHARX; StoreMnemonic = PPC::STHCX; assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); break; case 4: LoadMnemonic = PPC::LWARX; StoreMnemonic = PPC::STWCX; break; case 8: LoadMnemonic = PPC::LDARX; StoreMnemonic = PPC::STDCX; break; } const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); if (CmpOpcode) F->insert(It, loop2MBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned TmpReg = (!BinOpcode) ? incr : RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loopMBB); // loopMBB: // l[wd]arx dest, ptr // add r0, dest, incr // st[wd]cx. r0, ptr // bne- loopMBB // fallthrough --> exitMBB // For max/min... // loopMBB: // l[wd]arx dest, ptr // cmpl?[wd] incr, dest // bgt exitMBB // loop2MBB: // st[wd]cx. dest, ptr // bne- loopMBB // fallthrough --> exitMBB BB = loopMBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest) .addReg(ptrA).addReg(ptrB); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); if (CmpOpcode) { // Signed comparisons of byte or halfword values must be sign-extended. if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), ExtReg).addReg(dest); BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) .addReg(incr).addReg(ExtReg); } else BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) .addReg(incr).addReg(dest); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); BB = loop2MBB; } BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... 
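// In C-like terms the loop just built behaves like the sketch below
// (illustrative only; the hypothetical helper uses GCC/Clang __atomic
// builtins to model the larx/stcx. retry, it is not what this function
// emits):
//
//   int32_t fetch_op_32(volatile int32_t *p, int32_t incr) {
//     int32_t old = __atomic_load_n(p, __ATOMIC_RELAXED);       // l[bhwd]arx
//     while (!__atomic_compare_exchange_n(p, &old, old + incr,  // st[bhwd]cx. + bne-
//                                         /*weak=*/true,
//                                         __ATOMIC_RELAXED, __ATOMIC_RELAXED))
//       ;                                                // retry on reservation loss
//     return old;                                        // dest
//   }
//
// with "old + incr" standing in for whatever BinOpcode computes; for the
// min/max forms the stored value is simply 'incr', and the store is skipped
// when the comparison already favours the value in memory.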
BB = exitMBB; return BB; } MachineBasicBlock * PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, bool is8bit, // operation unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { // If we support part-word atomic mnemonics, just use them if (Subtarget.hasPartwordAtomics()) return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode, CmpPred); // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. bool is64bit = Subtarget.isPPC64(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); if (CmpOpcode) F->insert(It, loop2MBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned PtrReg = RegInfo.createVirtualRegister(RC); unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); unsigned ShiftReg = isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); unsigned MaskReg = RegInfo.createVirtualRegister(RC); unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loopMBB); // The 4-byte load must be aligned, while a char or short may be // anywhere in the word. Hence all this nasty bookkeeping code. // add ptr1, ptrA, ptrB [copy if ptrA==0] // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] // xori shift, shift1, 24 [16] // rlwinm ptr, ptr1, 0, 0, 29 // slw incr2, incr, shift // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] // slw mask, mask2, shift // loopMBB: // lwarx tmpDest, ptr // add tmp, tmpDest, incr2 // andc tmp2, tmpDest, mask // and tmp3, tmp, mask // or tmp4, tmp3, tmp2 // stwcx. tmp4, ptr // bne- loopMBB // fallthrough --> exitMBB // srw dest, tmpDest, shift if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); } else { Ptr1Reg = ptrB; } BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) .addImm(3).addImm(27).addImm(is8bit ? 
28 : 27); if (!isLittleEndian) BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) .addReg(incr).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) .addReg(Mask2Reg).addReg(ShiftReg); BB = loopMBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) .addReg(Incr2Reg).addReg(TmpDestReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) .addReg(TmpReg).addReg(MaskReg); if (CmpOpcode) { // For unsigned comparisons, we can directly compare the shifted values. // For signed comparisons we shift and sign extend. unsigned SReg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg) .addReg(TmpDestReg).addReg(MaskReg); unsigned ValueReg = SReg; unsigned CmpReg = Incr2Reg; if (CmpOpcode == PPC::CMPW) { ValueReg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) .addReg(SReg).addReg(ShiftReg); unsigned ValueSReg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg) .addReg(ValueReg); ValueReg = ValueSReg; CmpReg = incr; } BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) .addReg(CmpReg).addReg(ValueReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); BB = loop2MBB; } BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) .addReg(Tmp3Reg).addReg(Tmp2Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)) .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... 
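// To make the bookkeeping above concrete (worked example, little-endian,
// 8-bit operation on the byte at address A = 0x1003):
//   shift1 = (A & 3) * 8  = 24      // rlwinm shift1, ptr1, 3, 27, 28
//   shift  = shift1       = 24      // no xori on little endian
//   ptr    = A & ~3       = 0x1000  // rlwinm/rldicr
//   incr2  = incr << 24
//   mask   = 0xff << 24
// so the loop rewrites only bits 31..24 of the aligned word, and the final
// srw below moves the original byte back down into the low bits of 'dest'.
// On big endian the xori flips the lane (here shift = 24 ^ 24 = 0), because
// the byte at offset 3 sits at the other end of the word.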
BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) .addReg(ShiftReg); return BB; } llvm::MachineBasicBlock * PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // SjLjSetup mainMBB // bl mainMBB // v_restore = 1 // b sinkMBB // // mainMBB: // buf[LabelOffset] = LR // v_main = 0 // // sinkMBB: // v = phi(main, restore) // MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // Note that the structure of the jmp_buf used here is not compatible // with that used by libc, and is not designed to be. Specifically, it // stores only those 'reserved' registers that LLVM does not otherwise // understand how to spill. Also, by convention, by the time this // intrinsic is called, Clang has already stored the frame address in the // first slot of the buffer and stack address in the third. Following the // X86 target code, we'll store the jump address in the second slot. We also // need to save the TOC pointer (R2) to handle jumps between shared // libraries, and that will be stored in the fourth slot. The thread // identifier (R13) is not affected. // thisMBB: const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); // Prepare IP either in reg. const TargetRegisterClass *PtrRC = getRegClassFor(PVT); unsigned LabelReg = MRI.createVirtualRegister(PtrRC); unsigned BufReg = MI.getOperand(1).getReg(); if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); } // Naked functions never have a base pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned BaseReg; if (MF->getFunction().hasFnAttribute(Attribute::Naked)) BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; MIB = BuildMI(*thisMBB, MI, DL, TII->get(Subtarget.isPPC64() ? 
PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) .addMBB(mainMBB); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); // mainMBB: // mainDstReg = 0 MIB = BuildMI(mainMBB, DL, TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP if (Subtarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) .addReg(LabelReg) .addImm(LabelOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(PPC::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(thisMBB); MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; unsigned BP = (PVT == MVT::i64) ? PPC::X30 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29 : PPC::R30); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); unsigned BufReg = MI.getOperand(0).getReg(); // Reload FP (the jumped-to function may not have had a // frame pointer, and if so, then its r31 will be restored // as necessary). 
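// The slot layout assumed here matches what emitEHSjLjSetJmp stored above,
// with each slot PVT.getStoreSize() bytes wide:
//   buf[0] = frame address   (stored by the front end)
//   buf[1] = resume IP       (LabelOffset)
//   buf[2] = stack pointer   (SPOffset, stored by the front end)
//   buf[3] = TOC / r2        (TOCOffset)
//   buf[4] = base pointer    (BPOffset)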
if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) .addImm(0) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) .addImm(0) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) .addImm(LabelOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) .addImm(SPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) .addImm(SPOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload BP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) .addImm(BPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) .addImm(BPOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); } // Jump BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); MI.eraseFromParent(); return MBB; } MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { if (MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && MI.getOpcode() == TargetOpcode::PATCHPOINT) { // Call lowering should have added an r2 operand to indicate a dependence // on the TOC base pointer value. It can't however, because there is no // way to mark the dependence as implicit there, and so the stackmap code // will confuse it with a regular operand. Instead, add the dependence // here. setUsesTOCBasePtr(*BB->getParent()); MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); } return emitPatchPoint(MI, BB); } if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { return emitEHSjLjSetJmp(MI, BB); } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { return emitEHSjLjLongJmp(MI, BB); } const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. 
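// For the SELECT/SELECT_CC pseudos handled below, that pattern is the usual
// conditional-move triangle:
//
//   thisMBB:   bc <cond>, sinkMBB         ; true value already available
//   copy0MBB:  (false value, falls through)
//   sinkMBB:   result = phi [false, copy0MBB], [true, thisMBB]
//
// while the atomic pseudos further down expand into small larx/stcx. retry
// loops built by the EmitAtomicBinary helpers above.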
const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { SmallVector Cond; if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8) Cond.push_back(MI.getOperand(4)); else Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); Cond.push_back(MI.getOperand(1)); DebugLoc dl = MI.getDebugLoc(); TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || MI.getOpcode() == PPC::SELECT_CC_QFRC || MI.getOpcode() == PPC::SELECT_CC_QSRC || MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || MI.getOpcode() == PPC::SELECT_CC_VSRC || MI.getOpcode() == PPC::SELECT_CC_SPE4 || MI.getOpcode() == PPC::SELECT_CC_SPE || MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_QFRC || MI.getOpcode() == PPC::SELECT_QSRC || MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || MI.getOpcode() == PPC::SELECT_VSRC) { // The incoming instruction knows the destination vreg to set, the // condition code register to branch on, the true/false values to // select between, and a branch opcode to use. // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); DebugLoc dl = MI.getDebugLoc(); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_QFRC || MI.getOpcode() == PPC::SELECT_QSRC || MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || MI.getOpcode() == PPC::SELECT_VSRC) { BuildMI(BB, dl, TII->get(PPC::BC)) .addReg(MI.getOperand(1).getReg()) .addMBB(sinkMBB); } else { unsigned SelectPred = MI.getOperand(4).getImm(); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(SelectPred) .addReg(MI.getOperand(1).getReg()) .addMBB(sinkMBB); } // copy0MBB: // %FalseValue = ... 
// # fallthrough to sinkMBB BB = copy0MBB; // Update machine-CFG edges BB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) .addReg(MI.getOperand(3).getReg()) .addMBB(copy0MBB) .addReg(MI.getOperand(2).getReg()) .addMBB(thisMBB); } else if (MI.getOpcode() == PPC::ReadTB) { // To read the 64-bit time-base register on a 32-bit target, we read the // two halves. Should the counter have wrapped while it was being read, we // need to try again. // ... // readLoop: // mfspr Rx,TBU # load from TBU // mfspr Ry,TB # load from TB // mfspr Rz,TBU # load from TBU // cmpw crX,Rx,Rz # check if 'old'='new' // bne readLoop # branch if they're not equal // ... MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); DebugLoc dl = MI.getDebugLoc(); F->insert(It, readMBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(readMBB); BB = readMBB; MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); unsigned LoReg = MI.getOperand(0).getReg(); unsigned HiReg = MI.getOperand(1).getReg(); BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) .addReg(HiReg).addReg(ReadAgainReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); BB->addSuccessor(readMBB); BB->addSuccessor(sinkMBB); } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) BB = 
EmitAtomicBinary(MI, BB, 8, PPC::XOR8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) BB = EmitAtomicBinary(MI, BB, 4, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) BB = EmitAtomicBinary(MI, BB, 8, 0); else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || (Subtarget.hasPartwordAtomics() && MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || (Subtarget.hasPartwordAtomics() && MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; auto LoadMnemonic = PPC::LDARX; auto 
StoreMnemonic = PPC::STDCX; switch (MI.getOpcode()) { default: llvm_unreachable("Compare and swap of unknown size"); case PPC::ATOMIC_CMP_SWAP_I8: LoadMnemonic = PPC::LBARX; StoreMnemonic = PPC::STBCX; assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); break; case PPC::ATOMIC_CMP_SWAP_I16: LoadMnemonic = PPC::LHARX; StoreMnemonic = PPC::STHCX; assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); break; case PPC::ATOMIC_CMP_SWAP_I32: LoadMnemonic = PPC::LWARX; StoreMnemonic = PPC::STWCX; break; case PPC::ATOMIC_CMP_SWAP_I64: LoadMnemonic = PPC::LDARX; StoreMnemonic = PPC::STDCX; break; } unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned oldval = MI.getOperand(3).getReg(); unsigned newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loop1MBB); // loop1MBB: // l[bhwd]arx dest, ptr // cmp[wd] dest, oldval // bne- midMBB // loop2MBB: // st[bhwd]cx. newval, ptr // bne- loopMBB // b exitBB // midMBB: // st[bhwd]cx. dest, ptr // exitBB: BB = loop1MBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest) .addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) .addReg(oldval).addReg(dest); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(newval).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(dest).addReg(ptrA).addReg(ptrB); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { // We must use 64-bit registers for addresses when targeting 64-bit, // since we're actually doing arithmetic on them. Other registers // can be 32-bit. 
bool is64bit = Subtarget.isPPC64(); bool isLittleEndian = Subtarget.isLittleEndian(); bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned oldval = MI.getOperand(3).getReg(); unsigned newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned PtrReg = RegInfo.createVirtualRegister(RC); unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); unsigned ShiftReg = isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); unsigned MaskReg = RegInfo.createVirtualRegister(RC); unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = RegInfo.createVirtualRegister(RC); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loop1MBB); // The 4-byte load must be aligned, while a char or short may be // anywhere in the word. Hence all this nasty bookkeeping code. // add ptr1, ptrA, ptrB [copy if ptrA==0] // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] // xori shift, shift1, 24 [16] // rlwinm ptr, ptr1, 0, 0, 29 // slw newval2, newval, shift // slw oldval2, oldval,shift // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] // slw mask, mask2, shift // and newval3, newval2, mask // and oldval3, oldval2, mask // loop1MBB: // lwarx tmpDest, ptr // and tmp, tmpDest, mask // cmpw tmp, oldval3 // bne- midMBB // loop2MBB: // andc tmp2, tmpDest, mask // or tmp4, tmp2, newval3 // stwcx. tmp4, ptr // bne- loop1MBB // b exitBB // midMBB: // stwcx. tmpDest, ptr // exitBB: // srw dest, tmpDest, shift if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); } else { Ptr1Reg = ptrB; } BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); if (!isLittleEndian) BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) .addReg(Shift1Reg).addImm(is8bit ? 
24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) .addReg(newval).addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) .addReg(oldval).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) .addReg(Mask3Reg).addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) .addReg(Mask2Reg).addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) .addReg(NewVal2Reg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) .addReg(OldVal2Reg).addReg(MaskReg); BB = loop1MBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) .addReg(TmpReg).addReg(OldVal3Reg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) .addReg(Tmp2Reg).addReg(NewVal3Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) .addReg(ShiftReg); } else if (MI.getOpcode() == PPC::FADDrtz) { // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR // is not modeled at the SelectionDAG level. unsigned Dest = MI.getOperand(0).getReg(); unsigned Src1 = MI.getOperand(1).getReg(); unsigned Src2 = MI.getOperand(2).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); // Set rounding mode to round-to-zero. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); // Perform addition. BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); // Restore FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || MI.getOpcode() == PPC::ANDIo_1_GT_BIT || MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) ? PPC::ANDIo8 : PPC::ANDIo; bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? 
&PPC::GPRCRegClass : &PPC::G8RCRegClass); DebugLoc dl = MI.getDebugLoc(); BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) .addReg(MI.getOperand(1).getReg()) .addImm(1); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); } else if (MI.getOpcode() == PPC::TCHECK_RET) { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); return BB; } else { llvm_unreachable("Unexpected instr type to insert"); } MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } //===----------------------------------------------------------------------===// // Target Optimization Hooks //===----------------------------------------------------------------------===// static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { // For the estimates, convergence is quadratic, so we essentially double the // number of digits correct after every iteration. For both FRE and FRSQRTE, // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), // this is 2^-14. IEEE float has 23 digits and double has 52 digits. int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) RefinementSteps++; return RefinementSteps; } SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); UseOneConstNR = true; return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); } SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); } return SDValue(); } unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Note: This functionality is used only when unsafe-fp-math is enabled, and // on cores with reciprocal estimates (which are used when unsafe-fp-math is // enabled for division), this functionality is redundant with the default // combiner logic (once the division -> reciprocal/multiply transformation // has taken place). As a result, this matters more for older cores than for // newer ones. // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are two or more FDIVs (for embedded cores with only // one FP pipeline) for three or more FDIVs (for generic OOO cores). 
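// In other words, with a threshold of N the DAG combiner rewrites
//   a/d, b/d, c/d, ...      (N or more divisions by the same divisor d)
// into
//   r = 1.0/d;  a*r, b*r, c*r, ...
// paying for one real division (or a reciprocal estimate plus Newton-Raphson
// refinement, when estimates are enabled) instead of N.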
switch (Subtarget.getDarwinDirective()) { default: return 3; case PPC::DIR_440: case PPC::DIR_A2: case PPC::DIR_E500: case PPC::DIR_E500mc: case PPC::DIR_E5500: return 2; } } // isConsecutiveLSLoc needs to work even if all adds have not yet been // collapsed, and so we need to look through chains of them. static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, int64_t& Offset, SelectionDAG &DAG) { if (DAG.isBaseWithConstantOffset(Loc)) { Base = Loc.getOperand(0); Offset += cast(Loc.getOperand(1))->getSExtValue(); // The base might itself be a base plus an offset, and if so, accumulate // that as well. getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); } } static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { if (VT.getSizeInBits() / 8 != Bytes) return false; SDValue BaseLoc = Base->getBasePtr(); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) return false; const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FI = cast(Loc)->getIndex(); int BFI = cast(BaseLoc)->getIndex(); int FS = MFI.getObjectSize(FI); int BFS = MFI.getObjectSize(BFI); if (FS != BFS || FS != (int)Bytes) return false; return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); } SDValue Base1 = Loc, Base2 = BaseLoc; int64_t Offset1 = 0, Offset2 = 0; getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; Offset1 = 0; Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) return Offset1 == (Offset2 + Dist*Bytes); return false; } // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does // not enforce equality of the chain operands. 
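// For example, with Base being a 16-byte load from [r3], Bytes = 16 and
// Dist = 1, a second access at [r3 + 16] counts as consecutive, while
// Dist = -1 would ask about [r3 - 16]. The getBaseWithConstantOffset and
// isGAPlusOffset logic above lets differently shaped address computations
// (frame indices, reg+imm chains, global+offset) be matched as well.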
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { if (LSBaseSDNode *LS = dyn_cast(N)) { EVT VT = LS->getMemoryVT(); SDValue Loc = LS->getBasePtr(); return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); } if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfs: case Intrinsic::ppc_qpx_qvlfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcd: case Intrinsic::ppc_qpx_qvlfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcs: case Intrinsic::ppc_qpx_qvlfcsa: VT = MVT::v2f32; break; case Intrinsic::ppc_qpx_qvlfiwa: case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_vsx_lxvw4x: case Intrinsic::ppc_vsx_lxvw4x_be: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_lxvd2x: case Intrinsic::ppc_vsx_lxvd2x_be: VT = MVT::v2f64; break; case Intrinsic::ppc_altivec_lvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_lvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_lvewx: VT = MVT::i32; break; } return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); } if (N->getOpcode() == ISD::INTRINSIC_VOID) { EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; case Intrinsic::ppc_qpx_qvstfd: case Intrinsic::ppc_qpx_qvstfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfs: case Intrinsic::ppc_qpx_qvstfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcd: case Intrinsic::ppc_qpx_qvstfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcs: case Intrinsic::ppc_qpx_qvstfcsa: VT = MVT::v2f32; break; case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_qpx_qvstfiwa: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_vsx_stxvw4x: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_vsx_stxvw4x_be: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_stxvd2x_be: VT = MVT::v2f64; break; case Intrinsic::ppc_altivec_stvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_stvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_stvewx: VT = MVT::i32; break; } return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); } return false; } // Return true is there is a nearyby consecutive load to the one provided // (regardless of alignment). We search up and down the chain, looking though // token factors and other loads (but nothing else). As a result, a true result // indicates that it is safe to create a new consecutive load adjacent to the // load provided. static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); SmallSet LoadRoots; SmallVector Queue(1, Chain.getNode()); SmallSet Visited; // First, search up the chain, branching to follow all token-factor operands. // If we find a consecutive load, then we're done, otherwise, record all // nodes just above the top-level loads and token factors. 
while (!Queue.empty()) { SDNode *ChainNext = Queue.pop_back_val(); if (!Visited.insert(ChainNext).second) continue; if (MemSDNode *ChainLD = dyn_cast(ChainNext)) { if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; if (!Visited.count(ChainLD->getChain().getNode())) Queue.push_back(ChainLD->getChain().getNode()); } else if (ChainNext->getOpcode() == ISD::TokenFactor) { for (const SDUse &O : ChainNext->ops()) if (!Visited.count(O.getNode())) Queue.push_back(O.getNode()); } else LoadRoots.insert(ChainNext); } // Second, search down the chain, starting from the top-level nodes recorded // in the first phase. These top-level nodes are the nodes just above all // loads and token factors. Starting with their uses, recursively look though // all loads (just the chain uses) and token factors to find a consecutive // load. Visited.clear(); Queue.clear(); for (SmallSet::iterator I = LoadRoots.begin(), IE = LoadRoots.end(); I != IE; ++I) { Queue.push_back(*I); while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); if (!Visited.insert(LoadRoot).second) continue; if (MemSDNode *ChainLD = dyn_cast(LoadRoot)) if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; for (SDNode::use_iterator UI = LoadRoot->use_begin(), UE = LoadRoot->use_end(); UI != UE; ++UI) if (((isa(*UI) && cast(*UI)->getChain().getNode() == LoadRoot) || UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) Queue.push_back(*UI); } } return false; } /// This function is called when we have proved that a SETCC node can be replaced /// by subtraction (and other supporting instructions) so that the result of /// comparison is kept in a GPR instead of CR. This function is purely for /// codegen purposes and has some flags to guide the codegen process. static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, bool Swap, SDLoc &DL, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); // Zero extend the operands to the largest legal integer. Originally, they // must be of a strictly smaller size. auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0), DAG.getConstant(Size, DL, MVT::i32)); auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1), DAG.getConstant(Size, DL, MVT::i32)); // Swap if needed. Depends on the condition code. if (Swap) std::swap(Op0, Op1); // Subtract extended integers. auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1); // Move the sign bit to the least significant position and zero out the rest. // Now the least significant bit carries the result of original comparison. auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode, DAG.getConstant(Size - 1, DL, MVT::i32)); auto Final = Shifted; // Complement the result if needed. Based on the condition code. if (Complement) Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted, DAG.getConstant(1, DL, MVT::i64)); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final); } SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); // Size of integers being compared has a critical role in the following // analysis, so we prefer to do this when all types are legal. 
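// For reference, the rewrites produced below via generateEquivalentSub, for
// i32 operands on a 64-bit target (Size = 64), are:
//   setult a, b  ->  trunc( (zext a - zext b) >> 63 )
//   setugt a, b  ->  trunc( (zext b - zext a) >> 63 )          (Swap)
//   setuge a, b  ->  trunc( ((zext a - zext b) >> 63) ^ 1 )    (Complement)
//   setule a, b  ->  trunc( ((zext b - zext a) >> 63) ^ 1 )    (Swap + Complement)
// Because both operands are zero-extended from a narrower type, the i64
// subtraction never wraps, so its sign bit is set exactly when the first
// operand is the smaller one.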
if (!DCI.isAfterLegalizeDAG()) return SDValue(); // If all users of SETCC extend its value to a legal integer type // then we replace SETCC with a subtraction for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); } ISD::CondCode CC = cast(N->getOperand(2))->get(); auto OpSize = N->getOperand(0).getValueSizeInBits(); unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits(); if (OpSize < Size) { switch (CC) { default: break; case ISD::SETULT: return generateEquivalentSub(N, Size, false, false, DL, DAG); case ISD::SETULE: return generateEquivalentSub(N, Size, true, true, DL, DAG); case ISD::SETUGT: return generateEquivalentSub(N, Size, false, true, DL, DAG); case ISD::SETUGE: return generateEquivalentSub(N, Size, true, false, DL, DAG); } } return SDValue(); } SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) // or // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) // such that we're unnecessarily moving things into GPRs when it would be // better to keep them in CR bits. // Note that trunc here can be an actual i1 trunc, or can be the effective // truncation that comes from a setcc or select_cc. if (N->getOpcode() == ISD::TRUNCATE && N->getValueType(0) != MVT::i1) return SDValue(); if (N->getOperand(0).getValueType() != MVT::i32 && N->getOperand(0).getValueType() != MVT::i64) return SDValue(); if (N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) { // If we're looking at a comparison, then we need to make sure that the // high bits (all except for the first) don't matter the result. ISD::CondCode CC = cast(N->getOperand( N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); unsigned OpBits = N->getOperand(0).getValueSizeInBits(); if (ISD::isSignedIntSetCC(CC)) { if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) return SDValue(); } else if (ISD::isUnsignedIntSetCC(CC)) { if (!DAG.MaskedValueIsZero(N->getOperand(0), APInt::getHighBitsSet(OpBits, OpBits-1)) || !DAG.MaskedValueIsZero(N->getOperand(1), APInt::getHighBitsSet(OpBits, OpBits-1))) return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI) : SDValue()); } else { // This is neither a signed nor an unsigned comparison, just make sure // that the high bits are equal. KnownBits Op1Known, Op2Known; DAG.computeKnownBits(N->getOperand(0), Op1Known); DAG.computeKnownBits(N->getOperand(1), Op2Known); // We don't really care about what is known about the first bit (if // anything), so clear it in all masks prior to comparing them. Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) return SDValue(); } } // We now know that the higher-order bits are irrelevant, we just need to // make sure that all of the intermediate operations are bit operations, and // all inputs are extensions. 
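// The canonical shape being targeted is therefore something like
//   trunc (and (zext i1 %a to i32), (zext i1 %b to i32)) to i1
// which, once every leaf is known to be an extended i1 or a constant, can be
// rebuilt as an i1 AND carried in CR bits, with the extensions and the final
// truncation dropped.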
if (N->getOperand(0).getOpcode() != ISD::AND && N->getOperand(0).getOpcode() != ISD::OR && N->getOperand(0).getOpcode() != ISD::XOR && N->getOperand(0).getOpcode() != ISD::SELECT && N->getOperand(0).getOpcode() != ISD::SELECT_CC && N->getOperand(0).getOpcode() != ISD::TRUNCATE && N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) return SDValue(); if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && N->getOperand(1).getOpcode() != ISD::AND && N->getOperand(1).getOpcode() != ISD::OR && N->getOperand(1).getOpcode() != ISD::XOR && N->getOperand(1).getOpcode() != ISD::SELECT && N->getOperand(1).getOpcode() != ISD::SELECT_CC && N->getOperand(1).getOpcode() != ISD::TRUNCATE && N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) return SDValue(); SmallVector Inputs; SmallVector BinOps, PromOps; SmallPtrSet Visited; for (unsigned i = 0; i < 2; ++i) { if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa(N->getOperand(i))) Inputs.push_back(N->getOperand(i)); else BinOps.push_back(N->getOperand(i)); if (N->getOpcode() == ISD::TRUNCATE) break; } // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by extensions. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); if (!Visited.insert(BinOp.getNode()).second) continue; PromOps.push_back(BinOp); for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { // The condition of the select is not promoted. if (BinOp.getOpcode() == ISD::SELECT && i == 0) continue; if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) continue; if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa(BinOp.getOperand(i))) { Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || BinOp.getOperand(i).getOpcode() == ISD::SELECT || BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { BinOps.push_back(BinOp.getOperand(i)); } else { // We have an input that is not an extension or another binary // operation; we'll abort this transformation. return SDValue(); } } } // Make sure that this is a self-contained cluster of operations (which // is not quite the same thing as saying that everything has only one // use). for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), UE = Inputs[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // Make sure that we're not going to promote the non-output-value // operand(s) or SELECT or SELECT_CC. 
// FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == Inputs[i]) return SDValue(); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == Inputs[i] || User->getOperand(1) == Inputs[i]) return SDValue(); } } } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), UE = PromOps[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // Make sure that we're not going to promote the non-output-value // operand(s) or SELECT or SELECT_CC. // FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == PromOps[i]) return SDValue(); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == PromOps[i] || User->getOperand(1) == PromOps[i]) return SDValue(); } } } // Replace all inputs with the extension operand. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { // Constants may have users outside the cluster of to-be-promoted nodes, // and so we need to replace those as we do the promotions. if (isa(Inputs[i])) continue; else DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); } std::list PromOpHandles; for (auto &PromOp : PromOps) PromOpHandles.emplace_back(PromOp); // Replace all operations (these are all the same, but have a different // (i1) return type). DAG.getNode will validate that the types of // a binary operator match, so go through the list in reverse so that // we've likely promoted both operands first. Any intermediate truncations or // extensions disappear. while (!PromOpHandles.empty()) { SDValue PromOp = PromOpHandles.back().getValue(); PromOpHandles.pop_back(); if (PromOp.getOpcode() == ISD::TRUNCATE || PromOp.getOpcode() == ISD::SIGN_EXTEND || PromOp.getOpcode() == ISD::ZERO_EXTEND || PromOp.getOpcode() == ISD::ANY_EXTEND) { if (!isa(PromOp.getOperand(0)) && PromOp.getOperand(0).getValueType() != MVT::i1) { // The operand is not yet ready (see comment below). PromOpHandles.emplace_front(PromOp); continue; } SDValue RepValue = PromOp.getOperand(0); if (isa(RepValue)) RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); continue; } unsigned C; switch (PromOp.getOpcode()) { default: C = 0; break; case ISD::SELECT: C = 1; break; case ISD::SELECT_CC: C = 2; break; } if ((!isa(PromOp.getOperand(C)) && PromOp.getOperand(C).getValueType() != MVT::i1) || (!isa(PromOp.getOperand(C+1)) && PromOp.getOperand(C+1).getValueType() != MVT::i1)) { // The to-be-promoted operands of this node have not yet been // promoted (this should be rare because we're going through the // list backward, but if one of the operands has several users in // this cluster of to-be-promoted nodes, it is possible). PromOpHandles.emplace_front(PromOp); continue; } SmallVector Ops(PromOp.getNode()->op_begin(), PromOp.getNode()->op_end()); // If there are any constant inputs, make sure they're replaced now. 
for (unsigned i = 0; i < 2; ++i) if (isa(Ops[C+i])) Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); DAG.ReplaceAllUsesOfValueWith(PromOp, DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); } // Now we're left with the initial truncation itself. if (N->getOpcode() == ISD::TRUNCATE) return N->getOperand(0); // Otherwise, this is a comparison. The operands to be compared have just // changed type (to i1), but everything else is the same. return SDValue(N, 0); } SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); // If we're tracking CR bits, we need to be careful that we don't have: // zext(binary-ops(trunc(x), trunc(y))) // or // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) // such that we're unnecessarily moving things into CR bits that can more // efficiently stay in GPRs. Note that if we're not certain that the high // bits are set as required by the final extension, we still may need to do // some masking to get the proper behavior. // This same functionality is important on PPC64 when dealing with // 32-to-64-bit extensions; these occur often when 32-bit values are used as // the return values of functions. Because it is so similar, it is handled // here as well. if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) return SDValue(); if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && N->getOperand(0).getOpcode() != ISD::OR && N->getOperand(0).getOpcode() != ISD::XOR && N->getOperand(0).getOpcode() != ISD::SELECT && N->getOperand(0).getOpcode() != ISD::SELECT_CC) return SDValue(); SmallVector Inputs; SmallVector BinOps(1, N->getOperand(0)), PromOps; SmallPtrSet Visited; // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by truncations. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); if (!Visited.insert(BinOp.getNode()).second) continue; PromOps.push_back(BinOp); for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { // The condition of the select is not promoted. if (BinOp.getOpcode() == ISD::SELECT && i == 0) continue; if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) continue; if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || isa(BinOp.getOperand(i))) { Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || BinOp.getOperand(i).getOpcode() == ISD::SELECT || BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { BinOps.push_back(BinOp.getOperand(i)); } else { // We have an input that is not a truncation or another binary // operation; we'll abort this transformation. return SDValue(); } } } // The operands of a select that must be truncated when the select is // promoted because the operand is actually part of the to-be-promoted set. DenseMap SelectTruncOp[2]; // Make sure that this is a self-contained cluster of operations (which // is not quite the same thing as saying that everything has only one // use). 
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), UE = Inputs[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // If we're going to promote the non-output-value operand(s) or SELECT or // SELECT_CC, record them for truncation. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == Inputs[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == Inputs[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); if (User->getOperand(1) == Inputs[i]) SelectTruncOp[1].insert(std::make_pair(User, User->getOperand(1).getValueType())); } } } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), UE = PromOps[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // If we're going to promote the non-output-value operand(s) or SELECT or // SELECT_CC, record them for truncation. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == PromOps[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == PromOps[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); if (User->getOperand(1) == PromOps[i]) SelectTruncOp[1].insert(std::make_pair(User, User->getOperand(1).getValueType())); } } } unsigned PromBits = N->getOperand(0).getValueSizeInBits(); bool ReallyNeedsExt = false; if (N->getOpcode() != ISD::ANY_EXTEND) { // If all of the inputs are not already sign/zero extended, then // we'll still need to do that at the end. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; unsigned OpBits = Inputs[i].getOperand(0).getValueSizeInBits(); assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); if ((N->getOpcode() == ISD::ZERO_EXTEND && !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), APInt::getHighBitsSet(OpBits, OpBits-PromBits))) || (N->getOpcode() == ISD::SIGN_EXTEND && DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < (OpBits-(PromBits-1)))) { ReallyNeedsExt = true; break; } } } // Replace all inputs, either with the truncation operand, or a // truncation or extension to the final output type. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { // Constant inputs need to be replaced with the to-be-promoted nodes that // use them because they might have users outside of the cluster of // promoted nodes. if (isa(Inputs[i])) continue; SDValue InSrc = Inputs[i].getOperand(0); if (Inputs[i].getValueType() == N->getValueType(0)) DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); else if (N->getOpcode() == ISD::SIGN_EXTEND) DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); else if (N->getOpcode() == ISD::ZERO_EXTEND) DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); else DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); } std::list PromOpHandles; for (auto &PromOp : PromOps) PromOpHandles.emplace_back(PromOp); // Replace all operations (these are all the same, but have a different // (promoted) return type). 
// DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first.
while (!PromOpHandles.empty()) {
  SDValue PromOp = PromOpHandles.back().getValue();
  PromOpHandles.pop_back();

  unsigned C;
  switch (PromOp.getOpcode()) {
  default:             C = 0; break;
  case ISD::SELECT:    C = 1; break;
  case ISD::SELECT_CC: C = 2; break;
  }

  if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
       PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
      (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
       PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
    // The to-be-promoted operands of this node have not yet been
    // promoted (this should be rare because we're going through the
    // list backward, but if one of the operands has several users in
    // this cluster of to-be-promoted nodes, it is possible).
    PromOpHandles.emplace_front(PromOp);
    continue;
  }

  // For SELECT and SELECT_CC nodes, we do a similar check for any
  // to-be-promoted comparison inputs.
  if (PromOp.getOpcode() == ISD::SELECT ||
      PromOp.getOpcode() == ISD::SELECT_CC) {
    if ((SelectTruncOp[0].count(PromOp.getNode()) &&
         PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
        (SelectTruncOp[1].count(PromOp.getNode()) &&
         PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
      PromOpHandles.emplace_front(PromOp);
      continue;
    }
  }

  SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                              PromOp.getNode()->op_end());

  // If this node has constant inputs, then they'll need to be promoted here.
  for (unsigned i = 0; i < 2; ++i) {
    if (!isa<ConstantSDNode>(Ops[C+i]))
      continue;
    if (Ops[C+i].getValueType() == N->getValueType(0))
      continue;

    if (N->getOpcode() == ISD::SIGN_EXTEND)
      Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
    else
      Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
  }

  // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
  // truncate them again to the original value type.
  if (PromOp.getOpcode() == ISD::SELECT ||
      PromOp.getOpcode() == ISD::SELECT_CC) {
    auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
    if (SI0 != SelectTruncOp[0].end())
      Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
    auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
    if (SI1 != SelectTruncOp[1].end())
      Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
  }

  DAG.ReplaceAllUsesOfValueWith(PromOp,
    DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
}

// Now we're left with the initial extension itself.
if (!ReallyNeedsExt)
  return N->getOperand(0);

// To zero extend, just mask off everything except for the first bit (in the
// i1 case).
if (N->getOpcode() == ISD::ZERO_EXTEND)
  return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
                     DAG.getConstant(APInt::getLowBitsSet(
                                       N->getValueSizeInBits(0), PromBits),
                                     dl, N->getValueType(0)));

assert(N->getOpcode() == ISD::SIGN_EXTEND &&
       "Invalid extension type");
EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
SDValue ShiftCst =
    DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
return DAG.getNode(
    ISD::SRA, dl, N->getValueType(0),
    DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
    ShiftCst);
}

+// Is this an extending load from an f32 to an f64?
+static bool isFPExtLoad(SDValue Op) {
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
+    return LD->getExtensionType() == ISD::EXTLOAD &&
+           Op.getValueType() == MVT::f64;
+  return false;
+}
+
/// Reduces the number of fp-to-int conversion when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely  (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  SDValue FirstInput = N->getOperand(0);
  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
         "The input operand must be an fp-to-int conversion.");

  // This combine happens after legalization so the fp_to_[su]i nodes are
  // already converted to PPCSISD nodes.
  unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
  if (FirstConversion == PPCISD::FCTIDZ ||
      FirstConversion == PPCISD::FCTIDUZ ||
      FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ) {
    bool IsSplat = true;
    bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ;
    EVT SrcVT = FirstInput.getOperand(0).getValueType();
    SmallVector<SDValue, 4> Ops;
    EVT TargetVT = N->getValueType(0);
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
-      if (N->getOperand(i).getOpcode() != PPCISD::MFVSR)
+      SDValue NextOp = N->getOperand(i);
+      if (NextOp.getOpcode() != PPCISD::MFVSR)
        return SDValue();
-      unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode();
+      unsigned NextConversion = NextOp.getOperand(0).getOpcode();
      if (NextConversion != FirstConversion)
        return SDValue();
+      // If we are converting to 32-bit integers, we need to add an FP_ROUND.
+      // This is not valid if the input was originally double precision. It is
+      // also not profitable to do unless this is an extending load in which
+      // case doing this combine will allow us to combine consecutive loads.
+      if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
+        return SDValue();
      if (N->getOperand(i) != FirstInput)
        IsSplat = false;
    }

    // If this is a splat, we leave it as-is since there will be only a single
    // fp-to-int conversion followed by a splat of the integer. This is better
    // for 32-bit and smaller ints and neutral for 64-bit ints.
    if (IsSplat)
      return SDValue();

    // Now that we know we have the right type of node, get its operands
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue In = N->getOperand(i).getOperand(0);
-      // For 32-bit values, we need to add an FP_ROUND node.
      if (Is32Bit) {
+        // For 32-bit values, we need to add an FP_ROUND node (if we made it
+        // here, we know that all inputs are extending loads so this is safe).
        if (In.isUndef())
          Ops.push_back(DAG.getUNDEF(SrcVT));
        else {
          SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
                                      MVT::f32, In.getOperand(0),
                                      DAG.getIntPtrConstant(1, dl));
          Ops.push_back(Trunc);
        }
      } else
        Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
    }

    unsigned Opcode;
    if (FirstConversion == PPCISD::FCTIDZ ||
        FirstConversion == PPCISD::FCTIWZ)
      Opcode = ISD::FP_TO_SINT;
    else
      Opcode = ISD::FP_TO_UINT;

    EVT NewVT = TargetVT == MVT::v2i64 ?
MVT::v2f64 : MVT::v4f32; SDValue BV = DAG.getBuildVector(NewVT, dl, Ops); return DAG.getNode(Opcode, dl, TargetVT, BV); } return SDValue(); } /// Reduce the number of loads when building a vector. /// /// Building a vector out of multiple loads can be converted to a load /// of the vector type if the loads are consecutive. If the loads are /// consecutive but in descending order, a shuffle is added at the end /// to reorder the vector. static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && "Should be called with a BUILD_VECTOR node"); SDLoc dl(N); bool InputsAreConsecutiveLoads = true; bool InputsAreReverseConsecutive = true; unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; SDValue FirstInput = N->getOperand(0); bool IsRoundOfExtLoad = false; if (FirstInput.getOpcode() == ISD::FP_ROUND && FirstInput.getOperand(0).getOpcode() == ISD::LOAD) { LoadSDNode *LD = dyn_cast(FirstInput.getOperand(0)); IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; } // Not a build vector of (possibly fp_rounded) loads. if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) return SDValue(); for (int i = 1, e = N->getNumOperands(); i < e; ++i) { // If any inputs are fp_round(extload), they all must be. if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND) return SDValue(); SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) : N->getOperand(i); if (NextInput.getOpcode() != ISD::LOAD) return SDValue(); SDValue PreviousInput = IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1); LoadSDNode *LD1 = dyn_cast(PreviousInput); LoadSDNode *LD2 = dyn_cast(NextInput); // If any inputs are fp_round(extload), they all must be. if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD) return SDValue(); if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG)) InputsAreConsecutiveLoads = false; if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG)) InputsAreReverseConsecutive = false; // Exit early if the loads are neither consecutive nor reverse consecutive. if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive) return SDValue(); } assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) && "The loads cannot be both consecutive and reverse consecutive."); SDValue FirstLoadOp = IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput; SDValue LastLoadOp = IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) : N->getOperand(N->getNumOperands()-1); LoadSDNode *LD1 = dyn_cast(FirstLoadOp); LoadSDNode *LDL = dyn_cast(LastLoadOp); if (InputsAreConsecutiveLoads) { assert(LD1 && "Input needs to be a LoadSDNode."); return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), LD1->getBasePtr(), LD1->getPointerInfo(), LD1->getAlignment()); } if (InputsAreReverseConsecutive) { assert(LDL && "Input needs to be a LoadSDNode."); SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), LDL->getBasePtr(), LDL->getPointerInfo(), LDL->getAlignment()); SmallVector Ops; for (int i = N->getNumOperands() - 1; i >= 0; i--) Ops.push_back(i); return DAG.getVectorShuffle(N->getValueType(0), dl, Load, DAG.getUNDEF(N->getValueType(0)), Ops); } return SDValue(); } // This function adds the required vector_shuffle needed to get // the elements of the vector extract in the correct position // as specified by the CorrectElems encoding. 
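// [Editorial sketch - not part of this patch or of the upstream source.]
// The CorrectElems/Elems values used by the two functions that follow pack
// one byte per BUILD_VECTOR operand: the low nibble of each byte is the
// little-endian extract index and the high nibble is the big-endian one,
// consumed low byte first (both loops shift right by 8 per operand).  A
// minimal stand-alone decoder for one encoding could look like this
// (function name and shape are illustrative only):
//
//   #include <cstdint>
//   #include <vector>
//   std::vector<unsigned> decodeElems(uint64_t Enc, unsigned NumOps,
//                                     bool IsLittleEndian) {
//     std::vector<unsigned> Idx;
//     for (unsigned i = 0; i < NumOps; ++i) {
//       Idx.push_back(IsLittleEndian ? (Enc & 0xF) : ((Enc & 0xF0) >> 4));
//       Enc >>= 8;
//     }
//     return Idx; // e.g. 0x3074B8FC, 4 ops, LE -> {0xC, 0x8, 0x4, 0x0}
//   }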
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, SDValue Input, uint64_t Elems, uint64_t CorrectElems) { SDLoc dl(N); unsigned NumElems = Input.getValueType().getVectorNumElements(); SmallVector ShuffleMask(NumElems, -1); // Knowing the element indices being extracted from the original // vector and the order in which they're being inserted, just put // them at element indices required for the instruction. for (unsigned i = 0; i < N->getNumOperands(); i++) { if (DAG.getDataLayout().isLittleEndian()) ShuffleMask[CorrectElems & 0xF] = Elems & 0xF; else ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4; CorrectElems = CorrectElems >> 8; Elems = Elems >> 8; } SDValue Shuffle = DAG.getVectorShuffle(Input.getValueType(), dl, Input, DAG.getUNDEF(Input.getValueType()), ShuffleMask); EVT Ty = N->getValueType(0); SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); return BV; } // Look for build vector patterns where input operands come from sign // extended vector_extract elements of specific indices. If the correct indices // aren't used, add a vector shuffle to fix up the indices and create a new // PPCISD:SExtVElems node which selects the vector sign extend instructions // during instruction selection. static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { // This array encodes the indices that the vector sign extend instructions // extract from when extending from one type to another for both BE and LE. // The right nibble of each byte corresponds to the LE incides. // and the left nibble of each byte corresponds to the BE incides. // For example: 0x3074B8FC byte->word // For LE: the allowed indices are: 0x0,0x4,0x8,0xC // For BE: the allowed indices are: 0x3,0x7,0xB,0xF // For example: 0x000070F8 byte->double word // For LE: the allowed indices are: 0x0,0x8 // For BE: the allowed indices are: 0x7,0xF uint64_t TargetElems[] = { 0x3074B8FC, // b->w 0x000070F8, // b->d 0x10325476, // h->w 0x00003074, // h->d 0x00001032, // w->d }; uint64_t Elems = 0; int Index; SDValue Input; auto isSExtOfVecExtract = [&](SDValue Op) -> bool { if (!Op) return false; if (Op.getOpcode() != ISD::SIGN_EXTEND) return false; SDValue Extract = Op.getOperand(0); if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return false; ConstantSDNode *ExtOp = dyn_cast(Extract.getOperand(1)); if (!ExtOp) return false; Index = ExtOp->getZExtValue(); if (Input && Input != Extract.getOperand(0)) return false; if (!Input) Input = Extract.getOperand(0); Elems = Elems << 8; Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4; Elems |= Index; return true; }; // If the build vector operands aren't sign extended vector extracts, // of the same input vector, then return. for (unsigned i = 0; i < N->getNumOperands(); i++) { if (!isSExtOfVecExtract(N->getOperand(i))) { return SDValue(); } } // If the vector extract indicies are not correct, add the appropriate // vector_shuffle. int TgtElemArrayIdx; int InputSize = Input.getValueType().getScalarSizeInBits(); int OutputSize = N->getValueType(0).getScalarSizeInBits(); if (InputSize + OutputSize == 40) TgtElemArrayIdx = 0; else if (InputSize + OutputSize == 72) TgtElemArrayIdx = 1; else if (InputSize + OutputSize == 48) TgtElemArrayIdx = 2; else if (InputSize + OutputSize == 80) TgtElemArrayIdx = 3; else if (InputSize + OutputSize == 96) TgtElemArrayIdx = 4; else return SDValue(); uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; CorrectElems = DAG.getDataLayout().isLittleEndian() ? 
CorrectElems & 0x0F0F0F0F0F0F0F0F : CorrectElems & 0xF0F0F0F0F0F0F0F0; if (Elems != CorrectElems) { return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); } // Regular lowering will catch cases where a shuffle is not needed. return SDValue(); } SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && "Should be called with a BUILD_VECTOR node"); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); if (!Subtarget.hasVSX()) return SDValue(); // The target independent DAG combiner will leave a build_vector of // float-to-int conversions intact. We can generate MUCH better code for // a float-to-int conversion of a vector of floats. SDValue FirstInput = N->getOperand(0); if (FirstInput.getOpcode() == PPCISD::MFVSR) { SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI); if (Reduced) return Reduced; } // If we're building a vector out of consecutive loads, just load that // vector type. SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG); if (Reduced) return Reduced; // If we're building a vector out of extended elements from another vector // we have P9 vector integer extend instructions. if (Subtarget.hasP9Altivec()) { Reduced = combineBVOfVecSExt(N, DAG); if (Reduced) return Reduced; } if (N->getValueType(0) != MVT::v2f64) return SDValue(); // Looking for: // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1)) if (FirstInput.getOpcode() != ISD::SINT_TO_FP && FirstInput.getOpcode() != ISD::UINT_TO_FP) return SDValue(); if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP && N->getOperand(1).getOpcode() != ISD::UINT_TO_FP) return SDValue(); if (FirstInput.getOpcode() != N->getOperand(1).getOpcode()) return SDValue(); SDValue Ext1 = FirstInput.getOperand(0); SDValue Ext2 = N->getOperand(1).getOperand(0); if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); ConstantSDNode *Ext1Op = dyn_cast(Ext1.getOperand(1)); ConstantSDNode *Ext2Op = dyn_cast(Ext2.getOperand(1)); if (!Ext1Op || !Ext2Op) return SDValue(); if (Ext1.getValueType() != MVT::i32 || Ext2.getValueType() != MVT::i32) if (Ext1.getOperand(0) != Ext2.getOperand(0)) return SDValue(); int FirstElem = Ext1Op->getZExtValue(); int SecondElem = Ext2Op->getZExtValue(); int SubvecIdx; if (FirstElem == 0 && SecondElem == 1) SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0; else if (FirstElem == 2 && SecondElem == 3) SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1; else return SDValue(); SDValue SrcVec = Ext1.getOperand(0); auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ? PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP; return DAG.getNode(NodeType, dl, MVT::v2f64, SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl)); } SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const { assert((N->getOpcode() == ISD::SINT_TO_FP || N->getOpcode() == ISD::UINT_TO_FP) && "Need an int -> FP conversion node here"); if (useSoftFloat() || !Subtarget.has64BitSupport()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Op(N, 0); // Don't handle ppc_fp128 here or conversions that are out-of-range capable // from the hardware. 
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) || Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64)) return SDValue(); SDValue FirstOperand(Op.getOperand(0)); bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && (FirstOperand.getValueType() == MVT::i8 || FirstOperand.getValueType() == MVT::i16); if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) { bool Signed = N->getOpcode() == ISD::SINT_TO_FP; bool DstDouble = Op.getValueType() == MVT::f64; unsigned ConvOp = Signed ? (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) : (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS); SDValue WidthConst = DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2, dl, false); LoadSDNode *LDN = cast(FirstOperand.getNode()); SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst }; SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i8, LDN->getMemOperand()); // For signed conversion, we need to sign-extend the value in the VSR if (Signed) { SDValue ExtOps[] = { Ld, WidthConst }; SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps); return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext); } else return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); } // For i32 intermediate values, unfortunately, the conversion functions // leave the upper 32 bits of the value are undefined. Within the set of // scalar instructions, we have no method for zero- or sign-extending the // value. Thus, we cannot handle i32 intermediate values here. if (Op.getOperand(0).getValueType() == MVT::i32) return SDValue(); assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; // If we're converting from a float, to an int, and back to a float again, // then we don't need the store/load pair at all. if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && Subtarget.hasFPCVT()) || (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { SDValue Src = Op.getOperand(0).getOperand(0); if (Src.getValueType() == MVT::f32) { Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); DCI.AddToWorklist(Src.getNode()); } else if (Src.getValueType() != MVT::f64) { // Make sure that we don't pick up a ppc_fp128 source value. return SDValue(); } unsigned FCTOp = Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ; SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); DCI.AddToWorklist(FP.getNode()); } return FP; } return SDValue(); } // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for // builtins) into loads with swaps. 
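// [Editorial sketch - not part of this patch.]
// On little-endian subtargets lxvd2x loads the two doublewords of a vector
// in big-endian doubleword order, so the expansion below pairs it with an
// xxswapd (PPCISD::XXSWAPD) that exchanges the 64-bit halves; e.g. a v4i32
// {0,1,2,3} in memory would otherwise appear as {2,3,0,1} in the register.
// A scalar model of the xxswapd step, assuming the vector is viewed as two
// uint64_t halves:
//
//   #include <cstdint>
//   #include <utility>
//   void xxswapdModel(uint64_t &Lo, uint64_t &Hi) { std::swap(Lo, Hi); }
//
// This is also why the routine bails out when a non-permuting load can be
// used instead (ISA 3.0, or a 16-byte-aligned vector with elements of at
// most 4 bytes).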
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; SDValue Base; MachineMemOperand *MMO; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode for little endian VSX load"); case ISD::LOAD: { LoadSDNode *LD = cast(N); Chain = LD->getChain(); Base = LD->getBasePtr(); MMO = LD->getMemOperand(); // If the MMO suggests this isn't a load of a full vector, leave // things alone. For a built-in, we have to make the change for // correctness, so if there is a size problem that will be a bug. if (MMO->getSize() < 16) return SDValue(); break; } case ISD::INTRINSIC_W_CHAIN: { MemIntrinsicSDNode *Intrin = cast(N); Chain = Intrin->getChain(); // Similarly to the store case below, Intrin->getBasePtr() doesn't get // us what we want. Get operand 2 instead. Base = Intrin->getOperand(2); MMO = Intrin->getMemOperand(); break; } } MVT VecTy = N->getValueType(0).getSimpleVT(); // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is // aligned and the type is a vector with elements up to 4 bytes if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) && VecTy.getScalarSizeInBits() <= 32 ) { return SDValue(); } SDValue LoadOps[] = { Chain, Base }; SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, DAG.getVTList(MVT::v2f64, MVT::Other), LoadOps, MVT::v2f64, MMO); DCI.AddToWorklist(Load.getNode()); Chain = Load.getValue(1); SDValue Swap = DAG.getNode( PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); DCI.AddToWorklist(Swap.getNode()); // Add a bitcast if the resulting load type doesn't match v2f64. if (VecTy != MVT::v2f64) { SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); DCI.AddToWorklist(N.getNode()); // Package {bitcast value, swap's chain} to match Load's shape. return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), N, Swap.getValue(1)); } return Swap; } // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for // builtins) into stores with swaps. SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; SDValue Base; unsigned SrcOpnd; MachineMemOperand *MMO; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode for little endian VSX store"); case ISD::STORE: { StoreSDNode *ST = cast(N); Chain = ST->getChain(); Base = ST->getBasePtr(); MMO = ST->getMemOperand(); SrcOpnd = 1; // If the MMO suggests this isn't a store of a full vector, leave // things alone. For a built-in, we have to make the change for // correctness, so if there is a size problem that will be a bug. if (MMO->getSize() < 16) return SDValue(); break; } case ISD::INTRINSIC_VOID: { MemIntrinsicSDNode *Intrin = cast(N); Chain = Intrin->getChain(); // Intrin->getBasePtr() oddly does not get what we want. Base = Intrin->getOperand(3); MMO = Intrin->getMemOperand(); SrcOpnd = 2; break; } } SDValue Src = N->getOperand(SrcOpnd); MVT VecTy = Src.getValueType().getSimpleVT(); // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is // aligned and the type is a vector with elements up to 4 bytes if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) && VecTy.getScalarSizeInBits() <= 32 ) { return SDValue(); } // All stores are done as v2f64 and possible bit cast. 
if (VecTy != MVT::v2f64) { Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); DCI.AddToWorklist(Src.getNode()); } SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src); DCI.AddToWorklist(Swap.getNode()); Chain = Swap.getValue(1); SDValue StoreOps[] = { Chain, Swap, Base }; SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, DAG.getVTList(MVT::Other), StoreOps, VecTy, MMO); DCI.AddToWorklist(Store.getNode()); return Store; } // Handle DAG combine for STORE (FP_TO_INT F). SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); unsigned Opcode = N->getOperand(1).getOpcode(); assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) && "Not a FP_TO_INT Instruction!"); SDValue Val = N->getOperand(1).getOperand(0); EVT Op1VT = N->getOperand(1).getValueType(); EVT ResVT = Val.getValueType(); // Floating point types smaller than 32 bits are not legal on Power. if (ResVT.getScalarSizeInBits() < 32) return SDValue(); // Only perform combine for conversion to i64/i32 or power9 i16/i8. bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32 || Op1VT == MVT::i64 || (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8))); if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() || cast(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt) return SDValue(); // Extend f32 values to f64 if (ResVT.getScalarSizeInBits() == 32) { Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); } // Set signed or unsigned conversion opcode. unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ? PPCISD::FP_TO_SINT_IN_VSR : PPCISD::FP_TO_UINT_IN_VSR; Val = DAG.getNode(ConvOpcode, dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); // Set number of bytes being converted. unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8; SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2), DAG.getIntPtrConstant(ByteSize, dl, false), DAG.getValueType(Op1VT) }; Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl, DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); DCI.AddToWorklist(Val.getNode()); return Val; } SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); switch (N->getOpcode()) { default: break; case ISD::SHL: return combineSHL(N, DCI); case ISD::SRA: return combineSRA(N, DCI); case ISD::SRL: return combineSRL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); break; case PPCISD::SRL: if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. return N->getOperand(0); break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast(N->getOperand(0))) { if (C->isNullValue() || // 0 >>s V -> 0. C->isAllOnesValue()) // -1 >>s V -> -1. return N->getOperand(0); } break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: case ISD::SETCC: case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return combineFPToIntToFP(N, DCI); case ISD::STORE: { EVT Op1VT = N->getOperand(1).getValueType(); unsigned Opcode = N->getOperand(1).getOpcode(); if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) { SDValue Val= combineStoreFPToInt(N, DCI); if (Val) return Val; } // Turn STORE (BSWAP) -> sthbrx/stwbrx. 
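// [Editorial sketch - not part of this patch.]
// The idea of the combine that follows: storing bswap(x) has the same effect
// as storing x with a byte-reversing store, so the explicit BSWAP node can be
// folded into a single sthbrx/stwbrx (or the 64-bit form when LDBRX/STDBRX
// support is available).  A scalar model of the 32-bit case (names are
// illustrative; __builtin_bswap32 is assumed available as in GCC/Clang):
//
//   #include <cstdint>
//   // storeByteReversed32(P, V) == *P = bswap(V), i.e. one stwbrx.
//   void storeByteReversed32(uint32_t *P, uint32_t V) {
//     *P = __builtin_bswap32(V);
//   }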
if (cast(N)->isUnindexed() && Opcode == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() && (Op1VT == MVT::i32 || Op1VT == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) { // STBRX can only handle simple types. EVT mVT = cast(N)->getMemoryVT(); if (mVT.isExtended()) break; SDValue BSwapOp = N->getOperand(1).getOperand(0); // Do an any-extend to 32-bits if this is a half-word input. if (BSwapOp.getValueType() == MVT::i16) BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); // If the type of BSWAP operand is wider than stored memory width // it need to be shifted to the right side before STBRX. if (Op1VT.bitsGT(mVT)) { int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits(); BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp, DAG.getConstant(Shift, dl, MVT::i32)); // Need to truncate if this is a bswap of i64 stored as i32/i16. if (Op1VT == MVT::i64) BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp); } SDValue Ops[] = { N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT) }; return DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); } // STORE Constant:i32<0> -> STORE Constant:i64<0> // So it can increase the chance of CSE constant construction. if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() && isa(N->getOperand(1)) && Op1VT == MVT::i32) { // Need to sign-extended to 64-bits to handle negative values. EVT MemVT = cast(N)->getMemoryVT(); uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1), MemVT.getSizeInBits()); SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64); // DAG.getTruncStore() can't be used here because it doesn't accept // the general (base + offset) addressing mode. // So we use UpdateNodeOperands and setTruncatingStore instead. DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2), N->getOperand(3)); cast(N)->setTruncatingStore(true); return SDValue(N, 0); } // For little endian, VSX stores require generating xxswapd/lxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. if (Op1VT.isSimple()) { MVT StoreVT = Op1VT.getSimpleVT(); if (Subtarget.needsSwapsForVSXMemOps() && (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) return expandVSXStoreForLE(N, DCI); } break; } case ISD::LOAD: { LoadSDNode *LD = cast(N); EVT VT = LD->getValueType(0); // For little endian, VSX loads require generating lxvd2x/xxswapd. // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. if (VT.isSimple()) { MVT LoadVT = VT.getSimpleVT(); if (Subtarget.needsSwapsForVSXMemOps() && (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) return expandVSXLoadForLE(N, DCI); } // We sometimes end up with a 64-bit integer load, from which we extract // two single-precision floating-point numbers. This happens with // std::complex, and other similar structures, because of the way we // canonicalize structure copies. However, if we lack direct moves, // then the final bitcasts from the extracted integer values to the // floating-point numbers turn into store/load pairs. Even with direct moves, // just loading the two floating-point numbers is likely better. 
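// [Editorial sketch - not part of this patch.]
// A source-level example of the situation described above (purely
// illustrative; the function name is invented):
//
//   #include <complex>
//   void copyComplex(std::complex<float> *Dst,
//                    const std::complex<float> *Src) {
//     *Dst = *Src; // may be canonicalized into a single i64 load/store,
//   }              // from which the two f32 halves are later extracted
//
// The lambda that follows pattern-matches that extraction (srl/truncate/
// bitcast) and rewrites it as two ordinary f32 loads at offsets 0 and +4.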
auto ReplaceTwoFloatLoad = [&]() { if (VT != MVT::i64) return false; if (LD->getExtensionType() != ISD::NON_EXTLOAD || LD->isVolatile()) return false; // We're looking for a sequence like this: // t13: i64,ch = load t0, t6, undef:i64 // t16: i64 = srl t13, Constant:i32<32> // t17: i32 = truncate t16 // t18: f32 = bitcast t17 // t19: i32 = truncate t13 // t20: f32 = bitcast t19 if (!LD->hasNUsesOfValue(2, 0)) return false; auto UI = LD->use_begin(); while (UI.getUse().getResNo() != 0) ++UI; SDNode *Trunc = *UI++; while (UI.getUse().getResNo() != 0) ++UI; SDNode *RightShift = *UI; if (Trunc->getOpcode() != ISD::TRUNCATE) std::swap(Trunc, RightShift); if (Trunc->getOpcode() != ISD::TRUNCATE || Trunc->getValueType(0) != MVT::i32 || !Trunc->hasOneUse()) return false; if (RightShift->getOpcode() != ISD::SRL || !isa(RightShift->getOperand(1)) || RightShift->getConstantOperandVal(1) != 32 || !RightShift->hasOneUse()) return false; SDNode *Trunc2 = *RightShift->use_begin(); if (Trunc2->getOpcode() != ISD::TRUNCATE || Trunc2->getValueType(0) != MVT::i32 || !Trunc2->hasOneUse()) return false; SDNode *Bitcast = *Trunc->use_begin(); SDNode *Bitcast2 = *Trunc2->use_begin(); if (Bitcast->getOpcode() != ISD::BITCAST || Bitcast->getValueType(0) != MVT::f32) return false; if (Bitcast2->getOpcode() != ISD::BITCAST || Bitcast2->getValueType(0) != MVT::f32) return false; if (Subtarget.isLittleEndian()) std::swap(Bitcast, Bitcast2); // Bitcast has the second float (in memory-layout order) and Bitcast2 // has the first one. SDValue BasePtr = LD->getBasePtr(); if (LD->isIndexed()) { assert(LD->getAddressingMode() == ISD::PRE_INC && "Non-pre-inc AM on PPC?"); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, LD->getOffset()); } auto MMOFlags = LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->getAlignment(), MMOFlags, LD->getAAInfo()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(4, dl)); SDValue FloatLoad2 = DAG.getLoad( MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, LD->getPointerInfo().getWithOffset(4), MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); if (LD->isIndexed()) { // Note that DAGCombine should re-form any pre-increment load(s) from // what is produced here if that makes sense. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); } DCI.CombineTo(Bitcast2, FloatLoad); DCI.CombineTo(Bitcast, FloatLoad2); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1), SDValue(FloatLoad2.getNode(), 1)); return true; }; if (ReplaceTwoFloatLoad()) return SDValue(N, 0); EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v4f32)) || (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && LD->getAlignment() >= ScalarABIAlignment)) && LD->getAlignment() < ABIAlignment) { // This is a type-legal unaligned Altivec or QPX load. 
SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); bool isLittleEndian = Subtarget.isLittleEndian(); // This implements the loading of unaligned vectors as described in // the venerable Apple Velocity Engine overview. Specifically: // https://developer.apple.com/hardwaredrivers/ve/alignment.html // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html // // The general idea is to expand a sequence of one or more unaligned // loads into an alignment-based permutation-control instruction (lvsl // or lvsr), a series of regular vector loads (which always truncate // their input address to an aligned address), and a series of // permutations. The results of these permutations are the requested // loaded values. The trick is that the last "extra" load is not taken // from the address you might suspect (sizeof(vector) bytes after the // last requested load), but rather sizeof(vector) - 1 bytes after the // last requested vector. The point of this is to avoid a page fault if // the base address happened to be aligned. This works because if the // base address is aligned, then adding less than a full vector length // will cause the last vector in the sequence to be (re)loaded. // Otherwise, the next vector will be fetched as you might suspect was // necessary. // We might be able to reuse the permutation generation from // a different base address offset from this one by an aligned amount. // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this // optimization later. Intrinsic::ID Intr, IntrLD, IntrPerm; MVT PermCntlTy, PermTy, LDTy; if (Subtarget.hasAltivec()) { Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl; IntrLD = Intrinsic::ppc_altivec_lvx; IntrPerm = Intrinsic::ppc_altivec_vperm; PermCntlTy = MVT::v16i8; PermTy = MVT::v4i32; LDTy = MVT::v4i32; } else { Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : Intrinsic::ppc_qpx_qvlpcls; IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : Intrinsic::ppc_qpx_qvlfs; IntrPerm = Intrinsic::ppc_qpx_qvfperm; PermCntlTy = MVT::v4f64; PermTy = MVT::v4f64; LDTy = MemVT.getSimpleVT(); } SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); // Create the new MMO for the new base load. It is like the original MMO, // but represents an area in memory almost twice the vector size centered // on the original address. If the address is unaligned, we might start // reading up to (sizeof(vector)-1) bytes below the address of the // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = MF.getMachineMemOperand(LD->getMemOperand(), -(long)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. SDValue LDXIntID = DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue BaseLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(PermTy, MVT::Other), BaseLoadOps, LDTy, BaseMMO); // Note that the value of IncOffset (which is provided to the next // load's pointer info offset value, and thus used to calculate the // alignment), and the value of IncValue (which is actually used to // increment the pointer value) are different! This is because we // require the next load to appear to be aligned, even though it // is actually offset from the base pointer by a lesser amount. 
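// [Editorial sketch - not part of this patch.]
// For reference, the Altivec case of the expansion described above has
// roughly the following instruction-level shape for one unaligned 16-byte
// load on a big-endian target (register names are illustrative):
//
//   lvsl  vPerm,   0, rAddr   ; permute control from the low address bits
//   lvx   vLo,     0, rAddr   ; first aligned 16-byte chunk
//   addi  rTmp, rAddr, 15     ; sizeof(vector) - 1, not sizeof(vector)
//   lvx   vHi,     0, rTmp    ; second chunk (may reload the first)
//   vperm vResult, vLo, vHi, vPerm
//
// The +15 rather than +16 is what avoids the potential page fault for an
// already-aligned base address; the code below switches back to +16 when
// another load already exists at the truly consecutive offset, and on little
// endian it uses lvsr and swaps the vperm operands, as noted above.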
int IncOffset = VT.getSizeInBits() / 8; int IncValue = IncOffset; // Walk (both up and down) the chain looking for another load at the real // (aligned) offset (the alignment of the other load does not matter in // this case). If found, then do not use the offset reduction trick, as // that will prevent the loads from being later combined (as they would // otherwise be duplicates). if (!findConsecutiveLoad(LD, DAG)) --IncValue; SDValue Increment = DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); MachineMemOperand *ExtraMMO = MF.getMachineMemOperand(LD->getMemOperand(), 1, 2*MemVT.getStoreSize()-1); SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue ExtraLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(PermTy, MVT::Other), ExtraLoadOps, LDTy, ExtraMMO); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, BaseLoad.getValue(1), ExtraLoad.getValue(1)); // Because vperm has a big-endian bias, we must reverse the order // of the input vectors and complement the permute control vector // when generating little endian code. We have already handled the // latter by using lvsr instead of lvsl, so just reverse BaseLoad // and ExtraLoad here. SDValue Perm; if (isLittleEndian) Perm = BuildIntrinsicOp(IntrPerm, ExtraLoad, BaseLoad, PermCntl, DAG, dl); else Perm = BuildIntrinsicOp(IntrPerm, BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != PermTy) Perm = Subtarget.hasAltivec() ? DAG.getNode(ISD::BITCAST, dl, VT, Perm) : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX DAG.getTargetConstant(1, dl, MVT::i64)); // second argument is 1 because this rounding // is always exact. // The output of the permutation is our loaded result, the TokenFactor is // our new chain. DCI.CombineTo(N, Perm, TF); return SDValue(N, 0); } } break; case ISD::INTRINSIC_WO_CHAIN: { bool isLittleEndian = Subtarget.isLittleEndian(); unsigned IID = cast(N->getOperand(0))->getZExtValue(); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); if ((IID == Intr || IID == Intrinsic::ppc_qpx_qvlpcld || IID == Intrinsic::ppc_qpx_qvlpcls) && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; if (DAG.MaskedValueIsZero(Add->getOperand(1), APInt::getAllOnesValue(Bits /* alignment */) .zext(Add.getScalarValueSizeInBits()))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode::use_iterator UI = BasePtr->use_begin(), UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(UI->getOperand(0))->getZExtValue() == IID) { // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. 
return SDValue(*UI, 0); } } } if (isa(Add->getOperand(1))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode::use_iterator UI = BasePtr->use_begin(), UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::ADD && isa(UI->getOperand(1)) && (cast(Add->getOperand(1))->getZExtValue() - cast(UI->getOperand(1))->getZExtValue()) % (1ULL << Bits) == 0) { SDNode *OtherAdd = *UI; for (SDNode::use_iterator VI = OtherAdd->use_begin(), VE = OtherAdd->use_end(); VI != VE; ++VI) { if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(VI->getOperand(0))->getZExtValue() == IID) { return SDValue(*VI, 0); } } } } } } } break; case ISD::INTRINSIC_W_CHAIN: // For little endian, VSX loads require generating lxvd2x/xxswapd. // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. if (Subtarget.needsSwapsForVSXMemOps()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; case Intrinsic::ppc_vsx_lxvw4x: case Intrinsic::ppc_vsx_lxvd2x: return expandVSXLoadForLE(N, DCI); } } break; case ISD::INTRINSIC_VOID: // For little endian, VSX stores require generating xxswapd/stxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. if (Subtarget.needsSwapsForVSXMemOps()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; case Intrinsic::ppc_vsx_stxvw4x: case Intrinsic::ppc_vsx_stxvd2x: return expandVSXStoreForLE(N, DCI); } } break; case ISD::BSWAP: // Turn BSWAP (LOAD) -> lhbrx/lwbrx. if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && N->getOperand(0).hasOneUse() && (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getValueType(0) == MVT::i64))) { SDValue Load = N->getOperand(0); LoadSDNode *LD = cast(Load); // Create the byte-swapping load. SDValue Ops[] = { LD->getChain(), // Chain LD->getBasePtr(), // Ptr DAG.getValueType(N->getValueType(0)) // VT }; SDValue BSLoad = DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, DAG.getVTList(N->getValueType(0) == MVT::i64 ? MVT::i64 : MVT::i32, MVT::Other), Ops, LD->getMemoryVT(), LD->getMemOperand()); // If this is an i16 load, insert the truncate. SDValue ResVal = BSLoad; if (N->getValueType(0) == MVT::i16) ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); // First, combine the bswap away. This makes the value produced by the // load dead. DCI.CombineTo(N, ResVal); // Next, combine the load away, we give it a bogus result value but a real // chain result. The result value is dead because the bswap is dead. DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); // Return N so it doesn't get rechecked! return SDValue(N, 0); } break; case PPCISD::VCMP: // If a VCMPo node already exists with exactly the same operands as this // node, use its result instead of this node (VCMPo computes both a CR6 and // a normal output). // if (!N->getOperand(0).hasOneUse() && !N->getOperand(1).hasOneUse() && !N->getOperand(2).hasOneUse()) { // Scan all of the users of the LHS, looking for VCMPo's that match. SDNode *VCMPoNode = nullptr; SDNode *LHSN = N->getOperand(0).getNode(); for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); UI != E; ++UI) if (UI->getOpcode() == PPCISD::VCMPo && UI->getOperand(1) == N->getOperand(1) && UI->getOperand(2) == N->getOperand(2) && UI->getOperand(0) == N->getOperand(0)) { VCMPoNode = *UI; break; } // If there is no VCMPo node, or if the flag value has a single use, don't // transform this. 
if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) break; // Look at the (necessarily single) use of the flag value. If it has a // chain, this transformation is more complex. Note that multiple things // could use the value result, which we should ignore. SDNode *FlagUser = nullptr; for (SDNode::use_iterator UI = VCMPoNode->use_begin(); FlagUser == nullptr; ++UI) { assert(UI != VCMPoNode->use_end() && "Didn't find user!"); SDNode *User = *UI; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { FlagUser = User; break; } } } // If the user is a MFOCRF instruction, we know this is safe. // Otherwise we give up for right now. if (FlagUser->getOpcode() == PPCISD::MFOCRF) return SDValue(VCMPoNode, 0); } break; case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(Cond.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero) { // We now need to make the intrinsic dead (it cannot be instruction // selected). DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); assert(Cond.getNode()->hasOneUse() && "Counter decrement has more than one use"); return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, N->getOperand(0), Target); } } break; case ISD::BR_CC: { // If this is a branch on an altivec predicate comparison, lower this so // that we don't have to do a MFOCRF: instead, branch directly on CR6. This // lowering is done pre-legalize, because the legalizer lowers the predicate // compare down to code that is difficult to reassemble. ISD::CondCode CC = cast(N->getOperand(1))->get(); SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); // Sometimes the promoted value of the intrinsic is ANDed by some non-zero // value. If so, pass-through the AND to get to the intrinsic. if (LHS.getOpcode() == ISD::AND && LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(0).getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa(LHS.getOperand(1)) && !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa(RHS)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Counter decrement comparison is not EQ or NE"); unsigned Val = cast(RHS)->getZExtValue(); bool isBDNZ = (CC == ISD::SETEQ && Val) || (CC == ISD::SETNE && !Val); // We now need to make the intrinsic dead (it cannot be instruction // selected). DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); assert(LHS.getNode()->hasOneUse() && "Counter decrement has more than one use"); return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } int CompareOpc; bool isDot; if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know // that the condition is never/always true. unsigned Val = cast(RHS)->getZExtValue(); if (Val != 0 && Val != 1) { if (CC == ISD::SETEQ) // Cond never true, remove branch. return N->getOperand(0); // Always !=, turn it into an unconditional branch. 
return DAG.getNode(ISD::BR, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { LHS.getOperand(2), // LHS of compare LHS.getOperand(3), // RHS of compare DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Unpack the result based on how the target uses it. PPC::Predicate CompOpc; switch (cast(LHS.getOperand(1))->getZExtValue()) { default: // Can't happen, don't crash on invalid number though. case 0: // Branch on the value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; break; case 1: // Branch on the inverted value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ; break; case 2: // Branch on the value of the LT bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; break; case 3: // Branch on the inverted value of the LT bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; break; } return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), DAG.getConstant(CompOpc, dl, MVT::i32), DAG.getRegister(PPC::CR6, MVT::i32), N->getOperand(4), CompNode.getValue(1)); } break; } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); } return SDValue(); } SDValue PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if (VT == MVT::i64 && !Subtarget.isPPC64()) return SDValue(); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); bool IsNegPow2 = (-Divisor).isPowerOf2(); unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); Created.push_back(Op.getNode()); if (IsNegPow2) { Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); Created.push_back(Op.getNode()); } return Op; } //===----------------------------------------------------------------------===// // Inline Assembly Support //===----------------------------------------------------------------------===// void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { Known.resetAll(); switch (Op.getOpcode()) { default: break; case PPCISD::LBRX: { // lhbrx is known to have the top bits cleared out. 
if (cast(Op.getOperand(2))->getVT() == MVT::i16) Known.Zero = 0xFFFF0000; break; } case ISD::INTRINSIC_WO_CHAIN: { switch (cast(Op.getOperand(0))->getZExtValue()) { default: break; case Intrinsic::ppc_altivec_vcmpbfp_p: case Intrinsic::ppc_altivec_vcmpeqfp_p: case Intrinsic::ppc_altivec_vcmpequb_p: case Intrinsic::ppc_altivec_vcmpequh_p: case Intrinsic::ppc_altivec_vcmpequw_p: case Intrinsic::ppc_altivec_vcmpequd_p: case Intrinsic::ppc_altivec_vcmpgefp_p: case Intrinsic::ppc_altivec_vcmpgtfp_p: case Intrinsic::ppc_altivec_vcmpgtsb_p: case Intrinsic::ppc_altivec_vcmpgtsh_p: case Intrinsic::ppc_altivec_vcmpgtsw_p: case Intrinsic::ppc_altivec_vcmpgtsd_p: case Intrinsic::ppc_altivec_vcmpgtub_p: case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: case Intrinsic::ppc_altivec_vcmpgtud_p: Known.Zero = ~1U; // All bits but the low one are known to be zero. break; } } } } unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_PWR4: case PPC::DIR_PWR5: case PPC::DIR_PWR5X: case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: { if (!ML) break; const PPCInstrInfo *TII = Subtarget.getInstrInfo(); // For small loops (between 5 and 8 instructions), align to a 32-byte // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { LoopSize += TII->getInstSizeInBytes(*J); if (LoopSize > 32) break; } if (LoopSize > 16 && LoopSize <= 32) return 5; break; } } return TargetLowering::getPrefLoopAlignment(ML); } /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. PPCTargetLowering::ConstraintType PPCTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'b': case 'r': case 'f': case 'd': case 'v': case 'y': return C_RegisterClass; case 'Z': // FIXME: While Z does indicate a memory constraint, it specifically // indicates an r+r address (used in conjunction with the 'y' modifier // in the replacement string). Currently, we're forcing the base // register to be r0 in the asm printer (which is interpreted as zero) // and forming the complete address in the second register. This is // suboptimal. return C_Memory; } } else if (Constraint == "wc") { // individual CR bits. return C_RegisterClass; } else if (Constraint == "wa" || Constraint == "wd" || Constraint == "wf" || Constraint == "ws") { return C_RegisterClass; // VSX registers. } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight PPCTargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) return CW_Register; // an individual CR bit. 
else if ((StringRef(constraint) == "wa" || StringRef(constraint) == "wd" || StringRef(constraint) == "wf") && type->isVectorTy()) return CW_Register; else if (StringRef(constraint) == "ws" && type->isDoubleTy()) return CW_Register; switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'b': if (type->isIntegerTy()) weight = CW_Register; break; case 'f': if (type->isFloatTy()) weight = CW_Register; break; case 'd': if (type->isDoubleTy()) weight = CW_Register; break; case 'v': if (type->isVectorTy()) weight = CW_Register; break; case 'y': weight = CW_Register; break; case 'Z': weight = CW_Memory; break; } return weight; } std::pair PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); // 'd' and 'f' constraints are both defined to be "the floating point // registers", where one is for 32-bit and the other for 64-bit. We don't // really care overly much here so just give them all the same reg classes. case 'd': case 'f': if (Subtarget.hasSPE()) { if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &PPC::SPE4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::SPERCRegClass); } else { if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); if (VT == MVT::v4f64 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': if (VT == MVT::v4f64 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } } else if (Constraint == "wc" && Subtarget.useCRBits()) { // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); } else if ((Constraint == "wa" || Constraint == "wd" || Constraint == "wf") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); } else if (Constraint == "ws" && Subtarget.hasVSX()) { if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else return std::make_pair(0U, &PPC::VSFRCRegClass); } std::pair R = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers // (which we call X[0-9]+). If a 64-bit value has been requested, and a // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent // register. // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use // the AsmName field from *RegisterInfo.td, then this would not be necessary. 
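// For example (illustrative only), with
//   register long x asm("r5");
//   asm("add %0, %0, %0" : "+r"(x));
// on a 64-bit subtarget the generic lookup above resolves "{r5}" to the
// 32-bit R5 in GPRC; the code below then upgrades it through sub_32 to its
// 64-bit super-register X5 in G8RC so the i64 operand gets the right class.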
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32, &PPC::G8RCRegClass), &PPC::G8RCRegClass); // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. if (!R.second && StringRef("{cc}").equals_lower(Constraint)) { R.first = PPC::CR0; R.second = &PPC::CRRCRegClass; } return R; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints. if (Constraint.length() > 1) return; char Letter = Constraint[0]; switch (Letter) { default: break; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': { ConstantSDNode *CST = dyn_cast(Op); if (!CST) return; // Must be an immediate to match. SDLoc dl(Op); int64_t Value = CST->getSExtValue(); EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative // numbers are printed as such. switch (Letter) { default: llvm_unreachable("Unknown constraint letter!"); case 'I': // "I" is a signed 16-bit constant. if (isInt<16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'J': // "J" is a constant with only the high-order 16 bits nonzero. if (isShiftedUInt<16, 16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. if (isShiftedInt<16, 16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'K': // "K" is a constant with only the low-order 16 bits nonzero. if (isUInt<16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'M': // "M" is a constant that is greater than 31. if (Value > 31) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'N': // "N" is a positive constant that is an exact power of two. if (Value > 0 && isPowerOf2_64(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'O': // "O" is the constant zero. if (Value == 0) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'P': // "P" is a constant whose negation is a signed 16-bit constant. if (isInt<16>(-Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; } break; } } if (Result.getNode()) { Ops.push_back(Result); return; } // Handle standard constraint letters. TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // PPC does not allow r+i addressing modes for vectors! if (Ty->isVectorTy() && AM.BaseOffs != 0) return false; // PPC allows a sign-extended 16-bit immediate field. if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) return false; // No global is ever allowed as a base. if (AM.BaseGV) return false; // PPC only support r+r, switch (AM.Scale) { case 0: // "r+i" or just "i", depending on HasBaseReg. break; case 1: if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. return false; // Otherwise we have r+r or r+i. break; case 2: if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. return false; // Allow 2*r as r+r. break; default: // No other scales are supported. 
    return false;
  }

  return true;
}

SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setLRStoreRequired();
  bool isPPC64 = Subtarget.isPPC64();
  auto PtrVT = getPointerTy(MF.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(),
                        dl, isPPC64 ? MVT::i64 : MVT::i32);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address off the stack.
  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned FrameReg;
  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
  else
    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();

  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
      (!isPPC64 && VT != MVT::i32))
    report_fatal_error("Invalid register global variable type");

  bool is64Bit = isPPC64 && VT == MVT::i64;
  unsigned Reg = StringSwitch<unsigned>(RegName)
                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
                   .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
                                  (is64Bit ? PPC::X13 : PPC::R13))
                   .Default(0);

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}

bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
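// Concretely, a node such as (add (globaladdress @g), 8) is therefore kept
// as an explicit add in the DAG instead of being folded into a single
// global-address node carrying a target offset of 8; later lowering is
// still free to absorb the constant into an addi displacement where that
// is profitable. (Informal illustration of what "not aware of offsets"
// means here.)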
return false; } bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfs: case Intrinsic::ppc_qpx_qvlfcd: case Intrinsic::ppc_qpx_qvlfcs: case Intrinsic::ppc_qpx_qvlfiwa: case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: case Intrinsic::ppc_altivec_lvehx: case Intrinsic::ppc_altivec_lvewx: case Intrinsic::ppc_vsx_lxvd2x: case Intrinsic::ppc_vsx_lxvw4x: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_lvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_lvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_lvewx: VT = MVT::i32; break; case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfd: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfs: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcd: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcs: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = VT; Info.ptrVal = I.getArgOperand(0); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvlfda: case Intrinsic::ppc_qpx_qvlfsa: case Intrinsic::ppc_qpx_qvlfcda: case Intrinsic::ppc_qpx_qvlfcsa: case Intrinsic::ppc_qpx_qvlfiwaa: case Intrinsic::ppc_qpx_qvlfiwza: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcsa: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = VT; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvstfd: case Intrinsic::ppc_qpx_qvstfs: case Intrinsic::ppc_qpx_qvstfcd: case Intrinsic::ppc_qpx_qvstfcs: case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: case Intrinsic::ppc_altivec_stvehx: case Intrinsic::ppc_altivec_stvewx: case Intrinsic::ppc_vsx_stxvd2x: case Intrinsic::ppc_vsx_stxvw4x: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_stvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_stvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_stvewx: VT = MVT::i32; break; case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfd: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfs: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcd: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcs: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_VOID; Info.memVT = VT; Info.ptrVal = I.getArgOperand(1); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::ppc_qpx_qvstfda: case Intrinsic::ppc_qpx_qvstfsa: case Intrinsic::ppc_qpx_qvstfcda: case Intrinsic::ppc_qpx_qvstfcsa: case Intrinsic::ppc_qpx_qvstfiwa: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_qpx_qvstfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcda: VT = MVT::v2f64; 
break; case Intrinsic::ppc_qpx_qvstfcsa: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_VOID; Info.memVT = VT; Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; Info.flags = MachineMemOperand::MOStore; return true; } default: break; } return false; } /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { const Function &F = MF.getFunction(); // When expanding a memset, require at least two QPX instructions to cover // the cost of loading the value to be stored from the constant pool. if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && !F.hasFnAttribute(Attribute::NoImplicitFloat)) { return MVT::v4f64; } // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Size >= 16 && (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) return MVT::v4i32; } if (Subtarget.isPPC64()) { return MVT::i64; } return MVT::i32; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); return !(BitSize == 0 || BitSize > 64); } bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 64 && NumBits2 == 32; } bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 64 && NumBits2 == 32; } bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // Generally speaking, zexts are not free, but they are free when they can be // folded with other operations. if (LoadSDNode *LD = dyn_cast(Val)) { EVT MemVT = LD->getMemoryVT(); if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || (Subtarget.isPPC64() && MemVT == MVT::i32)) && (LD->getExtensionType() == ISD::NON_EXTLOAD || LD->getExtensionType() == ISD::ZEXTLOAD)) return true; } // FIXME: Add other cases... // - 32-bit shifts with a zext to i64 // - zext after ctlz, bswap, etc. 
// - zext after and by a constant mask return TargetLowering::isZExtFree(Val, VT2); } bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const { assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && "invalid fpext types"); // Extending to float128 is not free. if (DestVT == MVT::f128) return false; return true; } bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (DisablePPCUnaligned) return false; // PowerPC supports unaligned memory access for simple non-vector types. // Although accessing unaligned addresses is not as efficient as accessing // aligned addresses, it is generally more efficient than manual expansion, // and generally only traps for software emulation when crossing page // boundaries. if (!VT.isSimple()) return false; if (VT.getSimpleVT().isVector()) { if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64 && VT != MVT::v4f32 && VT != MVT::v4i32) return false; } else { return false; } } if (VT == MVT::ppcf128) return false; if (Fast) *Fast = true; return true; } bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; case MVT::f128: return (EnableQuadPrecision && Subtarget.hasP9Vector()); default: break; } return false; } const MCPhysReg * PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. The same reasoning applies // to CTR, which is used by any indirect call. static const MCPhysReg ScratchRegs[] = { PPC::X12, PPC::LR8, PPC::CTR8, 0 }; return ScratchRegs; } unsigned PPCTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; } unsigned PPCTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; } bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const { if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves if (Subtarget.hasVSX() || Subtarget.hasQPX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); } Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { if (DisableILPPref || Subtarget.enableMachineScheduler()) return TargetLowering::getSchedulingPreference(N); return Sched::ILP; } // Create a fast isel object. 
FastISel * PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const { return PPC::createFastISel(FuncInfo, LibInfo); } void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (Subtarget.isDarwinABI()) return; if (!Subtarget.isPPC64()) return; // Update IsSplitCSR in PPCFunctionInfo PPCFunctionInfo *PFI = Entry->getParent()->getInfo(); PFI->setIsSplitCSR(true); } void PPCTargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (PPC::G8RCRegClass.contains(*I)) RC = &PPC::G8RCRegClass; else if (PPC::F8RCRegClass.contains(*I)) RC = &PPC::F8RCRegClass; else if (PPC::CRRCRegClass.contains(*I)) RC = &PPC::CRRCRegClass; else if (PPC::VRRCRegClass.contains(*I)) RC = &PPC::VRRCRegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } // Override to enable LOAD_STACK_GUARD lowering on Linux. bool PPCTargetLowering::useLoadStackGuardNode() const { if (!Subtarget.isTargetLinux()) return TargetLowering::useLoadStackGuardNode(); return true; } // Override to disable global variable loading on Linux. void PPCTargetLowering::insertSSPDeclarations(Module &M) const { if (!Subtarget.isTargetLinux()) return TargetLowering::insertSSPDeclarations(M); } bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (!VT.isSimple() || !Subtarget.hasVSX()) return false; switch(VT.getSimpleVT().SimpleTy) { default: // For FP types that are currently not supported by PPC backend, return // false. Examples: f16, f80. 
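// Of the types handled below, only +0.0 is reported as a legal immediate:
// an all-zero value can be produced register-to-register (for example with
// xxlxor, see XXLXORdpz/XXLXORspz in PPCInstrVSX.td), while any other
// constant, including -0.0 with its sign bit set, would need a
// constant-pool load. (Illustrative note.)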
return false; case MVT::f32: case MVT::f64: case MVT::ppcf128: return Imm.isPosZero(); } } // For vector shift operation op, fold // (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); unsigned Opcode = N->getOpcode(); unsigned TargetOpcode; switch (Opcode) { default: llvm_unreachable("Unexpected shift operation"); case ISD::SHL: TargetOpcode = PPCISD::SHL; break; case ISD::SRL: TargetOpcode = PPCISD::SRL; break; case ISD::SRA: TargetOpcode = PPCISD::SRA; break; } if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && N1->getOpcode() == ISD::AND) if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) if (Mask->getZExtValue() == OpSizeInBits - 1) return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); return SDValue(); } SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; return SDValue(); } SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; return SDValue(); } SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; return SDValue(); } bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // Only duplicate to increase tail-calls for the 64bit SysV ABIs. if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) return false; // If not a tail call then no need to proceed. if (!CI->isTailCall()) return false; // If tail calls are disabled for the caller then we are done. const Function *Caller = CI->getParent()->getParent(); auto Attr = Caller->getFnAttribute("disable-tail-calls"); if (Attr.getValueAsString() == "true") return false; // If sibling calls have been disabled and tail-calls aren't guaranteed // there is no reason to duplicate. auto &TM = getTargetMachine(); if (!TM.Options.GuaranteedTailCallOpt && DisableSCO) return false; // Can't tail call a function called indirectly, or if it has variadic args. const Function *Callee = CI->getCalledFunction(); if (!Callee || Callee->isVarArg()) return false; // Make sure the callee and caller calling conventions are eligible for tco. if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(), CI->getCallingConv())) return false; // If the function is local then we have a good chance at tail-calling it return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); } bool PPCTargetLowering:: isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { const Value *Mask = AndI.getOperand(1); // If the mask is suitable for andi. or andis. we should sink the and. if (const ConstantInt *CI = dyn_cast(Mask)) { // Can't handle constants wider than 64-bits. if (CI->getBitWidth() > 64) return false; int64_t ConstVal = CI->getZExtValue(); return isUInt<16>(ConstVal) || (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF)); } // For non-constant masks, we can always use the record-form and. 
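// For example, with a non-constant mask the sequence
//   %m = and i32 %x, %y
//   %c = icmp eq i32 %m, 0
// can be selected as a single record-form "and. rT, rX, rY" whose CR0
// result feeds the branch directly. Constant masks were screened above:
// 0x7FFF fits andi., 0x12340000 fits andis., but 0x00010001 fits neither,
// so only the first two are reported as beneficial. (Illustrative values.)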
return true; } Index: vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCInstrVSX.td =================================================================== --- vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCInstrVSX.td (revision 337298) +++ vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCInstrVSX.td (revision 337299) @@ -1,3848 +1,3934 @@ //===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file describes the VSX extension to the PowerPC instruction set. // //===----------------------------------------------------------------------===// // *********************************** NOTE *********************************** // ** For POWER8 Little Endian, the VSX swap optimization relies on knowing ** // ** which VMX and VSX instructions are lane-sensitive and which are not. ** // ** A lane-sensitive instruction relies, implicitly or explicitly, on ** // ** whether lanes are numbered from left to right. An instruction like ** // ** VADDFP is not lane-sensitive, because each lane of the result vector ** // ** relies only on the corresponding lane of the source vectors. However, ** // ** an instruction like VMULESB is lane-sensitive, because "even" and ** // ** "odd" lanes are different for big-endian and little-endian numbering. ** // ** ** // ** When adding new VMX and VSX instructions, please consider whether they ** // ** are lane-sensitive. If so, they must be added to a switch statement ** // ** in PPCVSXSwapRemoval::gatherVectorInstructions(). ** // **************************************************************************** def PPCRegVSRCAsmOperand : AsmOperandClass { let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber"; } def vsrc : RegisterOperand { let ParserMatchClass = PPCRegVSRCAsmOperand; } def PPCRegVSFRCAsmOperand : AsmOperandClass { let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber"; } def vsfrc : RegisterOperand { let ParserMatchClass = PPCRegVSFRCAsmOperand; } def PPCRegVSSRCAsmOperand : AsmOperandClass { let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber"; } def vssrc : RegisterOperand { let ParserMatchClass = PPCRegVSSRCAsmOperand; } def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass { let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber"; } def spilltovsrrc : RegisterOperand { let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; } // Little-endian-specific nodes. 
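// Rough background for the definitions that follow: on little-endian
// subtargets lxvd2x/stxvd2x access the two doublewords in big-endian
// element order, so getting the expected lane numbering takes an extra
// xxswapd (xxpermdi XT, XB, XB, 2) after the load or before the store.
// PPClxvd2x, PPCstxvd2x and PPCxxswapd model that pairing so the swap
// optimization mentioned in the NOTE above (PPCVSXSwapRemoval) can later
// remove swaps that turn out to be redundant. (Informal summary.)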
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> ]>; def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> ]>; def SDT_PPCxxswapd : SDTypeProfile<1, 1, [ SDTCisSameAs<0, 1> ]>; def SDTVecConv : SDTypeProfile<1, 2, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2> ]>; def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore]>; def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>; def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, ValueType OutTy, ValueType InTy> { let BaseName = asmbase in { def NAME : XX3Form_Rc; let Defs = [CR6] in def o : XX3Form_Rc, isDOT; } } // Instruction form with a single input register for instructions such as // XXPERMDI. The reason for defining this is that specifying multiple chained // operands (such as loads) to an instruction will perform both chained // operations rather than coalescing them into a single register - even though // the source memory location is the same. This simply forces the instruction // to use the same register for both inputs. // For example, an output DAG such as this: // (XXPERMDI (LXSIBZX xoaddr:$src), (LXSIBZX xoaddr:$src ), 0)) // would result in two load instructions emitted and used as separate inputs // to the XXPERMDI instruction. class XX3Form_2s opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : XX3Form_2 { let XB = XA; } def HasVSX : Predicate<"PPCSubTarget->hasVSX()">; def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">; def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">; def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">; let Predicates = [HasVSX] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. let UseVSXReg = 1 in { let hasSideEffects = 0 in { // VSX instructions don't have side effects. 
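// RM below is the pseudo-register modelling the FPSCR rounding-mode field;
// naming it in Uses makes every instruction in this block an implicit
// reader of the current rounding mode, which prevents reordering across
// instructions that change it (for example mtfsf). (Informal note.)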
let Uses = [RM] in { // Load indexed instructions let mayLoad = 1, mayStore = 0 in { let CodeSize = 3 in def LXSDX : XX1Form_memOp<31, 588, (outs vsfrc:$XT), (ins memrr:$src), "lxsdx $XT, $src", IIC_LdStLFD, []>; // Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later let isPseudo = 1, CodeSize = 3 in def XFLOADf64 : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), "#XFLOADf64", [(set f64:$XT, (load xoaddr:$src))]>; let Predicates = [HasVSX, HasOnlySwappingMemOps] in def LXVD2X : XX1Form_memOp<31, 844, (outs vsrc:$XT), (ins memrr:$src), "lxvd2x $XT, $src", IIC_LdStLFD, [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>; def LXVDSX : XX1Form_memOp<31, 332, (outs vsrc:$XT), (ins memrr:$src), "lxvdsx $XT, $src", IIC_LdStLFD, []>; let Predicates = [HasVSX, HasOnlySwappingMemOps] in def LXVW4X : XX1Form_memOp<31, 780, (outs vsrc:$XT), (ins memrr:$src), "lxvw4x $XT, $src", IIC_LdStLFD, []>; } // mayLoad // Store indexed instructions let mayStore = 1, mayLoad = 0 in { let CodeSize = 3 in def STXSDX : XX1Form_memOp<31, 716, (outs), (ins vsfrc:$XT, memrr:$dst), "stxsdx $XT, $dst", IIC_LdStSTFD, []>; // Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later let isPseudo = 1, CodeSize = 3 in def XFSTOREf64 : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst), "#XFSTOREf64", [(store f64:$XT, xoaddr:$dst)]>; let Predicates = [HasVSX, HasOnlySwappingMemOps] in { // The behaviour of this instruction is endianness-specific so we provide no // pattern to match it without considering endianness. def STXVD2X : XX1Form_memOp<31, 972, (outs), (ins vsrc:$XT, memrr:$dst), "stxvd2x $XT, $dst", IIC_LdStSTFD, []>; def STXVW4X : XX1Form_memOp<31, 908, (outs), (ins vsrc:$XT, memrr:$dst), "stxvw4x $XT, $dst", IIC_LdStSTFD, []>; } } // mayStore // Add/Mul Instructions let isCommutable = 1 in { def XSADDDP : XX3Form<60, 32, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xsadddp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>; def XSMULDP : XX3Form<60, 48, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xsmuldp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>; def XVADDDP : XX3Form<60, 96, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvadddp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>; def XVADDSP : XX3Form<60, 64, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvaddsp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>; def XVMULDP : XX3Form<60, 112, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmuldp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>; def XVMULSP : XX3Form<60, 80, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmulsp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>; } // Subtract Instructions def XSSUBDP : XX3Form<60, 40, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xssubdp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>; def XVSUBDP : XX3Form<60, 104, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvsubdp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>; def XVSUBSP : XX3Form<60, 72, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvsubsp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>; // FMA Instructions let BaseName = "XSMADDADP" in { let isCommutable = 1 in def XSMADDADP : XX3Form<60, 33, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmaddadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>, RegConstraint<"$XTi 
= $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSMADDMDP : XX3Form<60, 41, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XSMSUBADP" in { let isCommutable = 1 in def XSMSUBADP : XX3Form<60, 49, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmsubadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSMSUBMDP : XX3Form<60, 57, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XSNMADDADP" in { let isCommutable = 1 in def XSNMADDADP : XX3Form<60, 161, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmaddadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSNMADDMDP : XX3Form<60, 169, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XSNMSUBADP" in { let isCommutable = 1 in def XSNMSUBADP : XX3Form<60, 177, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmsubadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSNMSUBMDP : XX3Form<60, 185, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVMADDADP" in { let isCommutable = 1 in def XVMADDADP : XX3Form<60, 97, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMADDMDP : XX3Form<60, 105, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVMADDASP" in { let isCommutable = 1 in def XVMADDASP : XX3Form<60, 65, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMADDMSP : XX3Form<60, 73, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVMSUBADP" in { let isCommutable = 1 in def XVMSUBADP : XX3Form<60, 113, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMSUBMDP : XX3Form<60, 121, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVMSUBASP" in { let isCommutable = 1 in def XVMSUBASP : XX3Form<60, 81, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), 
"xvmsubasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMSUBMSP : XX3Form<60, 89, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVNMADDADP" in { let isCommutable = 1 in def XVNMADDADP : XX3Form<60, 225, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMADDMDP : XX3Form<60, 233, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVNMADDASP" in { let isCommutable = 1 in def XVNMADDASP : XX3Form<60, 193, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMADDMSP : XX3Form<60, 201, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVNMSUBADP" in { let isCommutable = 1 in def XVNMSUBADP : XX3Form<60, 241, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMSUBMDP : XX3Form<60, 249, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XVNMSUBASP" in { let isCommutable = 1 in def XVNMSUBASP : XX3Form<60, 209, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMSUBMSP : XX3Form<60, 217, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } // Division Instructions def XSDIVDP : XX3Form<60, 56, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xsdivdp $XT, $XA, $XB", IIC_FPDivD, [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>; def XSSQRTDP : XX2Form<60, 75, (outs vsfrc:$XT), (ins vsfrc:$XB), "xssqrtdp $XT, $XB", IIC_FPSqrtD, [(set f64:$XT, (fsqrt f64:$XB))]>; def XSREDP : XX2Form<60, 90, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsredp $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfre f64:$XB))]>; def XSRSQRTEDP : XX2Form<60, 74, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrsqrtedp $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfrsqrte f64:$XB))]>; def XSTDIVDP : XX3Form_1<60, 61, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; def XSTSQRTDP : XX2Form_1<60, 106, (outs crrc:$crD), (ins vsfrc:$XB), "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; def XVDIVDP : XX3Form<60, 120, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvdivdp $XT, $XA, $XB", IIC_FPDivD, [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>; def XVDIVSP : XX3Form<60, 88, (outs vsrc:$XT), (ins 
vsrc:$XA, vsrc:$XB), "xvdivsp $XT, $XA, $XB", IIC_FPDivS, [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>; def XVSQRTDP : XX2Form<60, 203, (outs vsrc:$XT), (ins vsrc:$XB), "xvsqrtdp $XT, $XB", IIC_FPSqrtD, [(set v2f64:$XT, (fsqrt v2f64:$XB))]>; def XVSQRTSP : XX2Form<60, 139, (outs vsrc:$XT), (ins vsrc:$XB), "xvsqrtsp $XT, $XB", IIC_FPSqrtS, [(set v4f32:$XT, (fsqrt v4f32:$XB))]>; def XVTDIVDP : XX3Form_1<60, 125, (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; def XVTDIVSP : XX3Form_1<60, 93, (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>; def XVTSQRTDP : XX2Form_1<60, 234, (outs crrc:$crD), (ins vsrc:$XB), "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>; def XVTSQRTSP : XX2Form_1<60, 170, (outs crrc:$crD), (ins vsrc:$XB), "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>; def XVREDP : XX2Form<60, 218, (outs vsrc:$XT), (ins vsrc:$XB), "xvredp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (PPCfre v2f64:$XB))]>; def XVRESP : XX2Form<60, 154, (outs vsrc:$XT), (ins vsrc:$XB), "xvresp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (PPCfre v4f32:$XB))]>; def XVRSQRTEDP : XX2Form<60, 202, (outs vsrc:$XT), (ins vsrc:$XB), "xvrsqrtedp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>; def XVRSQRTESP : XX2Form<60, 138, (outs vsrc:$XT), (ins vsrc:$XB), "xvrsqrtesp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>; // Compare Instructions def XSCMPODP : XX3Form_1<60, 43, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>; def XSCMPUDP : XX3Form_1<60, 35, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; defm XVCMPEQDP : XX3Form_Rcr<60, 99, "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; defm XVCMPEQSP : XX3Form_Rcr<60, 67, "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>; defm XVCMPGEDP : XX3Form_Rcr<60, 115, "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpgedp, v2i64, v2f64>; defm XVCMPGESP : XX3Form_Rcr<60, 83, "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpgesp, v4i32, v4f32>; defm XVCMPGTDP : XX3Form_Rcr<60, 107, "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>; defm XVCMPGTSP : XX3Form_Rcr<60, 75, "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>; // Move Instructions def XSABSDP : XX2Form<60, 345, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsabsdp $XT, $XB", IIC_VecFP, [(set f64:$XT, (fabs f64:$XB))]>; def XSNABSDP : XX2Form<60, 361, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsnabsdp $XT, $XB", IIC_VecFP, [(set f64:$XT, (fneg (fabs f64:$XB)))]>; def XSNEGDP : XX2Form<60, 377, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsnegdp $XT, $XB", IIC_VecFP, [(set f64:$XT, (fneg f64:$XB))]>; def XSCPSGNDP : XX3Form<60, 176, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xscpsgndp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>; def XVABSDP : XX2Form<60, 473, (outs vsrc:$XT), (ins vsrc:$XB), "xvabsdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (fabs v2f64:$XB))]>; def XVABSSP : XX2Form<60, 409, (outs vsrc:$XT), (ins vsrc:$XB), "xvabssp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fabs v4f32:$XB))]>; def XVCPSGNDP : XX3Form<60, 240, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcpsgndp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>; def XVCPSGNSP : XX3Form<60, 208, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcpsgnsp $XT, $XA, $XB", 
IIC_VecFP, [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>; def XVNABSDP : XX2Form<60, 489, (outs vsrc:$XT), (ins vsrc:$XB), "xvnabsdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>; def XVNABSSP : XX2Form<60, 425, (outs vsrc:$XT), (ins vsrc:$XB), "xvnabssp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>; def XVNEGDP : XX2Form<60, 505, (outs vsrc:$XT), (ins vsrc:$XB), "xvnegdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (fneg v2f64:$XB))]>; def XVNEGSP : XX2Form<60, 441, (outs vsrc:$XT), (ins vsrc:$XB), "xvnegsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg v4f32:$XB))]>; // Conversion Instructions def XSCVDPSP : XX2Form<60, 265, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsp $XT, $XB", IIC_VecFP, []>; def XSCVDPSXDS : XX2Form<60, 344, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsxds $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctidz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPSXDSs : XX2Form<60, 344, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpsxds $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfctidz f32:$XB))]>; def XSCVDPSXWS : XX2Form<60, 88, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsxws $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctiwz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPSXWSs : XX2Form<60, 88, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpsxws $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfctiwz f32:$XB))]>; def XSCVDPUXDS : XX2Form<60, 328, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpuxds $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctiduz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPUXDSs : XX2Form<60, 328, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpuxds $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfctiduz f32:$XB))]>; def XSCVDPUXWS : XX2Form<60, 72, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpuxws $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctiwuz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPUXWSs : XX2Form<60, 72, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpuxws $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfctiwuz f32:$XB))]>; def XSCVSPDP : XX2Form<60, 329, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvspdp $XT, $XB", IIC_VecFP, []>; def XSCVSXDDP : XX2Form<60, 376, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvsxddp $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfcfid f64:$XB))]>; def XSCVUXDDP : XX2Form<60, 360, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvuxddp $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfcfidu f64:$XB))]>; def XVCVDPSP : XX2Form<60, 393, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (int_ppc_vsx_xvcvdpsp v2f64:$XB))]>; def XVCVDPSXDS : XX2Form<60, 472, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpsxds $XT, $XB", IIC_VecFP, [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>; def XVCVDPSXWS : XX2Form<60, 216, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpsxws $XT, $XB", IIC_VecFP, [(set v4i32:$XT, (int_ppc_vsx_xvcvdpsxws v2f64:$XB))]>; def XVCVDPUXDS : XX2Form<60, 456, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpuxds $XT, $XB", IIC_VecFP, [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>; def XVCVDPUXWS : XX2Form<60, 200, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpuxws $XT, $XB", IIC_VecFP, [(set v4i32:$XT, (int_ppc_vsx_xvcvdpuxws v2f64:$XB))]>; def XVCVSPDP : XX2Form<60, 457, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (int_ppc_vsx_xvcvspdp v4f32:$XB))]>; def XVCVSPSXDS : XX2Form<60, 408, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspsxds $XT, $XB", IIC_VecFP, []>; def XVCVSPSXWS : XX2Form<60, 152, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspsxws $XT, $XB", IIC_VecFP, [(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>; def XVCVSPUXDS : XX2Form<60, 
392, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspuxds $XT, $XB", IIC_VecFP, []>; def XVCVSPUXWS : XX2Form<60, 136, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspuxws $XT, $XB", IIC_VecFP, [(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>; def XVCVSXDDP : XX2Form<60, 504, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxddp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>; def XVCVSXDSP : XX2Form<60, 440, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxdsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (int_ppc_vsx_xvcvsxdsp v2i64:$XB))]>; def XVCVSXWDP : XX2Form<60, 248, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxwdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>; def XVCVSXWSP : XX2Form<60, 184, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxwsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>; def XVCVUXDDP : XX2Form<60, 488, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxddp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>; def XVCVUXDSP : XX2Form<60, 424, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxdsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (int_ppc_vsx_xvcvuxdsp v2i64:$XB))]>; def XVCVUXWDP : XX2Form<60, 232, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxwdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>; def XVCVUXWSP : XX2Form<60, 168, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxwsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>; // Rounding Instructions def XSRDPI : XX2Form<60, 73, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpi $XT, $XB", IIC_VecFP, [(set f64:$XT, (fround f64:$XB))]>; def XSRDPIC : XX2Form<60, 107, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpic $XT, $XB", IIC_VecFP, [(set f64:$XT, (fnearbyint f64:$XB))]>; def XSRDPIM : XX2Form<60, 121, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpim $XT, $XB", IIC_VecFP, [(set f64:$XT, (ffloor f64:$XB))]>; def XSRDPIP : XX2Form<60, 105, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpip $XT, $XB", IIC_VecFP, [(set f64:$XT, (fceil f64:$XB))]>; def XSRDPIZ : XX2Form<60, 89, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpiz $XT, $XB", IIC_VecFP, [(set f64:$XT, (ftrunc f64:$XB))]>; def XVRDPI : XX2Form<60, 201, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpi $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (fround v2f64:$XB))]>; def XVRDPIC : XX2Form<60, 235, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpic $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; def XVRDPIM : XX2Form<60, 249, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpim $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (ffloor v2f64:$XB))]>; def XVRDPIP : XX2Form<60, 233, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpip $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (fceil v2f64:$XB))]>; def XVRDPIZ : XX2Form<60, 217, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpiz $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (ftrunc v2f64:$XB))]>; def XVRSPI : XX2Form<60, 137, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspi $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fround v4f32:$XB))]>; def XVRSPIC : XX2Form<60, 171, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspic $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>; def XVRSPIM : XX2Form<60, 185, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspim $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (ffloor v4f32:$XB))]>; def XVRSPIP : XX2Form<60, 169, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspip $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fceil v4f32:$XB))]>; def XVRSPIZ : XX2Form<60, 153, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspiz $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (ftrunc v4f32:$XB))]>; // Max/Min Instructions let isCommutable = 1 in { def XSMAXDP : XX3Form<60, 160, (outs vsfrc:$XT), (ins 
vsfrc:$XA, vsfrc:$XB), "xsmaxdp $XT, $XA, $XB", IIC_VecFP, [(set vsfrc:$XT, (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>; def XSMINDP : XX3Form<60, 168, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xsmindp $XT, $XA, $XB", IIC_VecFP, [(set vsfrc:$XT, (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>; def XVMAXDP : XX3Form<60, 224, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmaxdp $XT, $XA, $XB", IIC_VecFP, [(set vsrc:$XT, (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>; def XVMINDP : XX3Form<60, 232, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmindp $XT, $XA, $XB", IIC_VecFP, [(set vsrc:$XT, (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>; def XVMAXSP : XX3Form<60, 192, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmaxsp $XT, $XA, $XB", IIC_VecFP, [(set vsrc:$XT, (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>; def XVMINSP : XX3Form<60, 200, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvminsp $XT, $XA, $XB", IIC_VecFP, [(set vsrc:$XT, (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>; } // isCommutable } // Uses = [RM] // Logical Instructions let isCommutable = 1 in def XXLAND : XX3Form<60, 130, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxland $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>; def XXLANDC : XX3Form<60, 138, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlandc $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (and v4i32:$XA, (vnot_ppc v4i32:$XB)))]>; let isCommutable = 1 in { def XXLNOR : XX3Form<60, 162, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlnor $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA, v4i32:$XB)))]>; def XXLOR : XX3Form<60, 146, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlor $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>; let isCodeGenOnly = 1 in def XXLORf: XX3Form<60, 146, (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>; def XXLXOR : XX3Form<60, 154, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlxor $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>; } // isCommutable let isCodeGenOnly = 1 in def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set v4i32:$XT, (v4i32 immAllZerosV))]>; let isCodeGenOnly = 1 in { def XXLXORdpz : XX3Form_SetZero<60, 154, (outs vsfrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set f64:$XT, (fpimm0))]>; def XXLXORspz : XX3Form_SetZero<60, 154, (outs vssrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set f32:$XT, (fpimm0))]>; } // Permutation Instructions def XXMRGHW : XX3Form<60, 18, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>; def XXMRGLW : XX3Form<60, 50, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>; def XXPERMDI : XX3Form_2<60, 10, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB, imm32SExt16:$DM))]>; let isCodeGenOnly = 1 in def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM), "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>; def XXSEL : XX4Form<60, 3, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC), "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>; def XXSLDWI : XX3Form_2<60, 2, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW), "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, [(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB, imm32SExt16:$SHW))]>; def XXSPLTW : XX2Form_2<60, 164, (outs vsrc:$XT), (ins vsrc:$XB, 
u2imm:$UIM), "xxspltw $XT, $XB, $UIM", IIC_VecPerm, [(set v4i32:$XT, (PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>; let isCodeGenOnly = 1 in def XXSPLTWs : XX2Form_2<60, 164, (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM), "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>; } // hasSideEffects } // UseVSXReg = 1 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. let usesCustomInserter = 1, // Expanded after instruction selection. PPC970_Single = 1 in { def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst), (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC), "#SELECT_CC_VSRC", []>; def SELECT_VSRC: Pseudo<(outs vsrc:$dst), (ins crbitrc:$cond, vsrc:$T, vsrc:$F), "#SELECT_VSRC", [(set v2f64:$dst, (select i1:$cond, v2f64:$T, v2f64:$F))]>; def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F, i32imm:$BROPC), "#SELECT_CC_VSFRC", []>; def SELECT_VSFRC: Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond, f8rc:$T, f8rc:$F), "#SELECT_VSFRC", [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>; def SELECT_CC_VSSRC: Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F, i32imm:$BROPC), "#SELECT_CC_VSSRC", []>; def SELECT_VSSRC: Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond, f4rc:$T, f4rc:$F), "#SELECT_VSSRC", [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>; } // usesCustomInserter } // AddedComplexity def : InstAlias<"xvmovdp $XT, $XB", (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; def : InstAlias<"xvmovsp $XT, $XB", (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; def : InstAlias<"xxspltd $XT, $XB, 0", (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>; def : InstAlias<"xxspltd $XT, $XB, 1", (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>; def : InstAlias<"xxmrghd $XT, $XA, $XB", (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>; def : InstAlias<"xxmrgld $XT, $XA, $XB", (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>; def : InstAlias<"xxswapd $XT, $XB", (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>; def : InstAlias<"xxspltd $XT, $XB, 0", (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>; def : InstAlias<"xxspltd $XT, $XB, 1", (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>; def : InstAlias<"xxswapd $XT, $XB", (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>; let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
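// AddedComplexity raises the score TableGen's matcher assigns to every
// pattern in this block, so a DAG node that is matched both here and by an
// equivalent Altivec or scalar-FP pattern elsewhere is selected to its VSX
// form. For instance, the (vnot_ppc v4i32:$A) pattern just below outranks
// the VNOR-based Altivec pattern for the same node. (Informal note.)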
def : Pat<(v4i32 (vnot_ppc v4i32:$A)), (v4i32 (XXLNOR $A, $A))>; let Predicates = [IsBigEndian] in { def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG $S, sub_64))>; def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; } let Predicates = [IsLittleEndian] in { def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64), (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>; def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } // Additional fnmsub patterns: -a*c + b == -(a*c - b) def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), (XSNMSUBADP $B, $C, $A)>; def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), (XSNMSUBADP $B, $C, $A)>; def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B), (XVNMSUBADP $B, $C, $A)>; def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B), (XVNMSUBADP $B, $C, $A)>; def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), (XVNMSUBASP $B, $C, $A)>; def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), (XVNMSUBASP $B, $C, $A)>; def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2f64 (bitconvert v4i32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2f64 (bitconvert v8i16:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2f64 (bitconvert v16i8:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v4f32 (bitconvert v2f64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v4i32 (bitconvert v2f64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v8i16 (bitconvert v2f64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v16i8 (bitconvert v2f64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v2i64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2i64 (bitconvert v4i32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2i64 (bitconvert v8i16:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2i64 (bitconvert v16i8:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v4f32 (bitconvert v2i64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v4i32 (bitconvert v2i64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v8i16 (bitconvert v2i64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v16i8 (bitconvert v2i64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v2f64 (bitconvert v2i64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v2i64 (bitconvert v2f64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v2f64 (bitconvert v1i128:$A)), (COPY_TO_REGCLASS $A, VRRC)>; def : Pat<(v1i128 (bitconvert v2f64:$A)), (COPY_TO_REGCLASS $A, VRRC)>; // sign extension patterns // To extend "in place" from v2i32 to v2i64, we have input data like: // | undef | i32 | undef | i32 | // but xvcvsxwdp expects the input in big-Endian format: // | i32 | undef | i32 | undef | // so we need to shift everything to the left by one i32 (word) before // the conversion. def : Pat<(sext_inreg v2i64:$C, v2i32), (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>; def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))), (XVCVSXWDP (XXSLDWI $C, $C, 1))>; def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)), (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>; def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)), (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>; def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>; def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>; // Loads. 
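// Note: on subtargets that only have the swapping memory ops, lxvd2x/stxvd2x and
// lxvw4x/stxvw4x access the vector in big-endian element order regardless of target
// endianness. That is why the generic load/store patterns below are additionally guarded
// by IsBigEndian, while the element-order-agnostic intrinsics are matched for both
// endiannesses; little-endian targets get the required doubleword swaps inserted (and
// later optimized away) elsewhere in the backend.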
let Predicates = [HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>; // Stores. def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; } let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>; def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>; def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; } // Permutes. def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>; // PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and // XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2), the later one is more profitable. def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>; // Selects. def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)), (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)), (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)), (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)), (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)), (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)), (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)), (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)), (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)), (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)), (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)), (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)), (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)), (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)), (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, 
f64:$tval, f64:$fval, SETEQ)), (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)), (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)), (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)), (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)), (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)), (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>; // Divides. def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B), (XVDIVSP $A, $B)>; def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B), (XVDIVDP $A, $B)>; // Reciprocal estimate def : Pat<(int_ppc_vsx_xvresp v4f32:$A), (XVRESP $A)>; def : Pat<(int_ppc_vsx_xvredp v2f64:$A), (XVREDP $A)>; // Recip. square root estimate def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A), (XVRSQRTESP $A)>; def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A), (XVRSQRTEDP $A)>; let Predicates = [IsLittleEndian] in { def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))), (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))), (f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>; def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))), (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))), (f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>; } // IsLittleEndian let Predicates = [IsBigEndian] in { def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))), (f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>; def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))), (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))), (f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>; def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))), (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; } // IsBigEndian } // AddedComplexity } // HasVSX def ScalarLoads { dag Li8 = (i32 (extloadi8 xoaddr:$src)); dag ZELi8 = (i32 (zextloadi8 xoaddr:$src)); dag ZELi8i64 = (i64 (zextloadi8 xoaddr:$src)); dag SELi8 = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8)); dag SELi8i64 = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8)); dag Li16 = (i32 (extloadi16 xoaddr:$src)); dag ZELi16 = (i32 (zextloadi16 xoaddr:$src)); dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src)); dag SELi16 = (i32 (sextloadi16 xoaddr:$src)); dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src)); dag Li32 = (i32 (load xoaddr:$src)); } // The following VSX instructions were introduced in Power ISA 2.07 /* FIXME: if the operands are v2i64, these patterns will not match. we should define new patterns or otherwise match the same patterns when the elements are larger than i32. */ def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">; def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">; def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">; let Predicates = [HasP8Vector] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
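// The ISA 2.07 definitions in this block follow the same conventions as the VSX block
// above: the logical ops are plain XX3Form instructions, the scalar loads/stores come
// with pseudos that are expanded after register allocation, and each single-precision
// FMA is defined as an A-form/M-form pair tied together through BaseName and
// AltVSXFMARel, which is intended to let a later pass trade one form for the other when
// that saves a register copy.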
let isCommutable = 1, UseVSXReg = 1 in { def XXLEQV : XX3Form<60, 186, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxleqv $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>; def XXLNAND : XX3Form<60, 178, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlnand $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA, v4i32:$XB)))]>; } // isCommutable, UseVSXReg def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), (XXLEQV $A, $B)>; let UseVSXReg = 1 in { def XXLORC : XX3Form<60, 170, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlorc $XT, $XA, $XB", IIC_VecGeneral, [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>; // VSX scalar loads introduced in ISA 2.07 let mayLoad = 1, mayStore = 0 in { let CodeSize = 3 in def LXSSPX : XX1Form_memOp<31, 524, (outs vssrc:$XT), (ins memrr:$src), "lxsspx $XT, $src", IIC_LdStLFD, []>; def LXSIWAX : XX1Form_memOp<31, 76, (outs vsfrc:$XT), (ins memrr:$src), "lxsiwax $XT, $src", IIC_LdStLFD, []>; def LXSIWZX : XX1Form_memOp<31, 12, (outs vsfrc:$XT), (ins memrr:$src), "lxsiwzx $XT, $src", IIC_LdStLFD, []>; // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it // would cause these Pseudos are not expanded in expandPostRAPseudos() let isPseudo = 1 in { // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later let CodeSize = 3 in def XFLOADf32 : PseudoXFormMemOp<(outs vssrc:$XT), (ins memrr:$src), "#XFLOADf32", [(set f32:$XT, (load xoaddr:$src))]>; // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later def LIWAX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), "#LIWAX", [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later def LIWZX : PseudoXFormMemOp<(outs vsfrc:$XT), (ins memrr:$src), "#LIWZX", [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; } } // mayLoad // VSX scalar stores introduced in ISA 2.07 let mayStore = 1, mayLoad = 0 in { let CodeSize = 3 in def STXSSPX : XX1Form_memOp<31, 652, (outs), (ins vssrc:$XT, memrr:$dst), "stxsspx $XT, $dst", IIC_LdStSTFD, []>; def STXSIWX : XX1Form_memOp<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst), "stxsiwx $XT, $dst", IIC_LdStSTFD, []>; // Please note let isPseudo = 1 is not part of class Pseudo<>. 
Missing it // would cause these Pseudos are not expanded in expandPostRAPseudos() let isPseudo = 1 in { // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later let CodeSize = 3 in def XFSTOREf32 : PseudoXFormMemOp<(outs), (ins vssrc:$XT, memrr:$dst), "#XFSTOREf32", [(store f32:$XT, xoaddr:$dst)]>; // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later def STIWX : PseudoXFormMemOp<(outs), (ins vsfrc:$XT, memrr:$dst), "#STIWX", [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; } } // mayStore } // UseVSXReg = 1 def : Pat<(f64 (extloadf32 xoaddr:$src)), (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>; def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))), (f32 (XFLOADf32 xoaddr:$src))>; def : Pat<(f64 (fpextend f32:$src)), (COPY_TO_REGCLASS $src, VSFRC)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)), (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)), (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)), (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)), (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)), (SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)), (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)), (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)), (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)), (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)), (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>; let UseVSXReg = 1 in { // VSX Elementary Scalar FP arithmetic (SP) let isCommutable = 1 in { def XSADDSP : XX3Form<60, 0, (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), "xsaddsp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fadd f32:$XA, f32:$XB))]>; def XSMULSP : XX3Form<60, 16, (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), "xsmulsp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fmul f32:$XA, f32:$XB))]>; } // isCommutable def XSDIVSP : XX3Form<60, 24, (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), "xsdivsp $XT, $XA, $XB", IIC_FPDivS, [(set f32:$XT, (fdiv f32:$XA, f32:$XB))]>; def XSRESP : XX2Form<60, 26, (outs vssrc:$XT), (ins vssrc:$XB), "xsresp $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfre f32:$XB))]>; def XSRSP : XX2Form<60, 281, (outs vssrc:$XT), (ins vsfrc:$XB), "xsrsp $XT, $XB", IIC_VecFP, []>; def XSSQRTSP : XX2Form<60, 11, (outs vssrc:$XT), (ins vssrc:$XB), "xssqrtsp $XT, $XB", IIC_FPSqrtS, [(set f32:$XT, (fsqrt f32:$XB))]>; def XSRSQRTESP : XX2Form<60, 10, (outs vssrc:$XT), (ins vssrc:$XB), "xsrsqrtesp $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfrsqrte f32:$XB))]>; def XSSUBSP : XX3Form<60, 8, (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB), "xssubsp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>; // FMA Instructions let BaseName = "XSMADDASP" in { let isCommutable = 1 in def XSMADDASP : XX3Form<60, 1, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmaddasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>, RegConstraint<"$XTi = 
$XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSMADDMSP : XX3Form<60, 9, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XSMSUBASP" in { let isCommutable = 1 in def XSMSUBASP : XX3Form<60, 17, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmsubasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fma f32:$XA, f32:$XB, (fneg f32:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSMSUBMSP : XX3Form<60, 25, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XSNMADDASP" in { let isCommutable = 1 in def XSNMADDASP : XX3Form<60, 129, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsnmaddasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB, f32:$XTi)))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSNMADDMSP : XX3Form<60, 137, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } let BaseName = "XSNMSUBASP" in { let isCommutable = 1 in def XSNMSUBASP : XX3Form<60, 145, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsnmsubasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB, (fneg f32:$XTi))))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSNMSUBMSP : XX3Form<60, 153, (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } // Single Precision Conversions (FP <-> INT) def XSCVSXDSP : XX2Form<60, 312, (outs vssrc:$XT), (ins vsfrc:$XB), "xscvsxdsp $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfcfids f64:$XB))]>; def XSCVUXDSP : XX2Form<60, 296, (outs vssrc:$XT), (ins vsfrc:$XB), "xscvuxdsp $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfcfidus f64:$XB))]>; // Conversions between vector and scalar single precision def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB), "xscvdpspn $XT, $XB", IIC_VecFP, []>; def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), "xscvspdpn $XT, $XB", IIC_VecFP, []>; } // UseVSXReg = 1 let Predicates = [IsLittleEndian] in { def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))), (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))), (f32 (XSCVSXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))), (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))), (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>; } let Predicates = [IsBigEndian] in { def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))), (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>; def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))), (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))), (f32 (XSCVUXDSP 
(COPY_TO_REGCLASS $S, VSFRC)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))), (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; } def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)), (v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>; // Instructions for converting float to i64 feeding a store. let Predicates = [NoP9Vector] in { def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8), (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8), (STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>; } // Instructions for converting float to i32 feeding a store. def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4), (STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4), (STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>; } // AddedComplexity = 400 } // HasP8Vector let UseVSXReg = 1, AddedComplexity = 400 in { let Predicates = [HasDirectMove] in { // VSX direct move instructions def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT), "mfvsrd $rA, $XT", IIC_VecGeneral, [(set i64:$rA, (PPCmfvsr f64:$XT))]>, Requires<[In64BitMode]>; let isCodeGenOnly = 1 in def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT), "mfvsrd $rA, $XT", IIC_VecGeneral, []>, Requires<[In64BitMode]>; def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT), "mfvsrwz $rA, $XT", IIC_VecGeneral, [(set i32:$rA, (PPCmfvsr f64:$XT))]>; def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA), "mtvsrd $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i64:$rA))]>, Requires<[In64BitMode]>; def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA), "mtvsrwa $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i32:$rA))]>; def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA), "mtvsrwz $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; } // HasDirectMove let Predicates = [IsISA3_0, HasDirectMove] in { def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA), "mtvsrws $XT, $rA", IIC_VecGeneral, []>; def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB), "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral, []>, Requires<[In64BitMode]>; def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT), "mfvsrld $rA, $XT", IIC_VecGeneral, []>, Requires<[In64BitMode]>; } // IsISA3_0, HasDirectMove } // UseVSXReg = 1 // We want to parse this from asm, but we don't want to emit this as it would // be emitted with a VSX reg. So leave Emit = 0 here. def : InstAlias<"mfvrd $rA, $XT", (MFVRD g8rc:$rA, vrrc:$XT), 0>; def : InstAlias<"mffprd $rA, $src", (MFVSRD g8rc:$rA, f8rc:$src)>; /* Direct moves of various widths from GPR's into VSR's. Each move lines the value up into element 0 (both BE and LE). Namely, entities smaller than a doubleword are shifted left and moved for BE. For LE, they're moved, then swapped to go into the least significant element of the VSR. 
*/ def MovesToVSR { dag BE_BYTE_0 = (MTVSRD (RLDICR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7)); dag BE_HALF_0 = (MTVSRD (RLDICR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15)); dag BE_WORD_0 = (MTVSRD (RLDICR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31)); dag BE_DWORD_0 = (MTVSRD $A); dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32)); dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), LE_MTVSRW, sub_64)); dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2); dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), BE_DWORD_0, sub_64)); dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2); } /* Patterns for extracting elements out of vectors. Integer elements are extracted using direct move operations. Patterns for extracting elements whose indices are not available at compile time are also provided with various _VARIABLE_ patterns. The numbering for the DAG's is for LE, but when used on BE, the correct LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13). */ def VectorExtractions { // Doubleword extraction dag LE_DWORD_0 = (MFVSRD (EXTRACT_SUBREG (XXPERMDI (COPY_TO_REGCLASS $S, VSRC), (COPY_TO_REGCLASS $S, VSRC), 2), sub_64)); dag LE_DWORD_1 = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); // Word extraction dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64)); dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64)); dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64)); // Halfword extraction dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32)); dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32)); dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32)); dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32)); dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32)); dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32)); dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32)); dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32)); // Byte extraction dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32)); dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32)); dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32)); dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32)); dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32)); dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32)); dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32)); dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32)); dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32)); dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32)); dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32)); dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32)); dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32)); dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32)); dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32)); dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32)); /* Variable element number (BE and LE patterns 
must be specified separately) This is a rather involved process. Conceptually, this is how the move is accomplished: 1. Identify which doubleword contains the element 2. Shift in the VMX register so that the correct doubleword is correctly lined up for the MFVSRD 3. Perform the move so that the element (along with some extra stuff) is in the GPR 4. Right shift within the GPR so that the element is right-justified Of course, the index is an element number which has a different meaning on LE/BE so the patterns have to be specified separately. Note: The final result will be the element right-justified with high order bits being arbitrarily defined (namely, whatever was in the vector register to the left of the value originally). */ /* LE variable byte Number 1. above: - For elements 0-7, we shift left by 8 bytes since they're on the right - For elements 8-15, we need not shift (shift left by zero bytes) This is accomplished by inverting the bits of the index and AND-ing with 0x8 (i.e. clearing all bits of the index and inverting bit 60). */ dag LE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDC8 (LI8 8), $Idx))); // Number 2. above: // - Now that we set up the shift amount, we shift in the VMX register dag LE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, LE_VBYTE_PERM_VEC)); // Number 3. above: // - The doubleword containing our element is moved to a GPR dag LE_MV_VBYTE = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)), sub_64)); /* Number 4. above: - Truncate the element number to the range 0-7 (8-15 are symmetrical and out of range values are truncated accordingly) - Multiply by 8 as we need to shift right by the number of bits, not bytes - Shift right in the GPR by the calculated value */ dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60), sub_32); dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT), sub_32); /* LE variable halfword Number 1. above: - For elements 0-3, we shift left by 8 since they're on the right - For elements 4-7, we need not shift (shift left by zero bytes) Similarly to the byte pattern, we invert the bits of the index, but we AND with 0x4 (i.e. clear all bits of the index and invert bit 61). Of course, the shift is still by 8 bytes, so we must multiply by 2. */ dag LE_VHALF_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62))); // Number 2. above: // - Now that we set up the shift amount, we shift in the VMX register dag LE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, LE_VHALF_PERM_VEC)); // Number 3. above: // - The doubleword containing our element is moved to a GPR dag LE_MV_VHALF = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)), sub_64)); /* Number 4. above: - Truncate the element number to the range 0-3 (4-7 are symmetrical and out of range values are truncated accordingly) - Multiply by 16 as we need to shift right by the number of bits - Shift right in the GPR by the calculated value */ dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59), sub_32); dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT), sub_32); /* LE variable word Number 1. above: - For elements 0-1, we shift left by 8 since they're on the right - For elements 2-3, we need not shift */ dag LE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61))); // Number 2. above: // - Now that we set up the shift amount, we shift in the VMX register dag LE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VWORD_PERM_VEC)); // Number 3. 
above: // - The doubleword containing our element is moved to a GPR dag LE_MV_VWORD = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)), sub_64)); /* Number 4. above: - Truncate the element number to the range 0-1 (2-3 are symmetrical and out of range values are truncated accordingly) - Multiply by 32 as we need to shift right by the number of bits - Shift right in the GPR by the calculated value */ dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58), sub_32); dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT), sub_32); /* LE variable doubleword Number 1. above: - For element 0, we shift left by 8 since it's on the right - For element 1, we need not shift */ dag LE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60))); // Number 2. above: // - Now that we set up the shift amount, we shift in the VMX register dag LE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, LE_VDWORD_PERM_VEC)); // Number 3. above: // - The doubleword containing our element is moved to a GPR // - Number 4. is not needed for the doubleword as the value is 64-bits dag LE_VARIABLE_DWORD = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)), sub_64)); /* LE variable float - Shift the vector to line up the desired element to BE Word 0 - Convert 32-bit float to a 64-bit single precision float */ dag LE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61))); dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC); dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE); /* LE variable double Same as the LE doubleword except there is no move. */ dag LE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)), (v16i8 (COPY_TO_REGCLASS $S, VRRC)), LE_VDWORD_PERM_VEC)); dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC); /* BE variable byte The algorithm here is the same as the LE variable byte except: - The shift in the VMX register is by 0/8 for opposite element numbers so we simply AND the element number with 0x8 - The order of elements after the move to GPR is reversed, so we invert the bits of the index prior to truncating to the range 0-7 */ dag BE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDIo8 $Idx, 8))); dag BE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, BE_VBYTE_PERM_VEC)); dag BE_MV_VBYTE = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), sub_64)); dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), sub_32); dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), sub_32); /* BE variable halfword The algorithm here is the same as the LE variable halfword except: - The shift in the VMX register is by 0/8 for opposite element numbers so we simply AND the element number with 0x4 and multiply by 2 - The order of elements after the move to GPR is reversed, so we invert the bits of the index prior to truncating to the range 0-3 */ dag BE_VHALF_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62))); dag BE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, BE_VHALF_PERM_VEC)); dag BE_MV_VHALF = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)), sub_64)); dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59), sub_32); dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT), sub_32); /* BE variable word The algorithm is the same as the LE variable word except: - The shift in the VMX register happens for opposite element numbers - The order of elements 
after the move to GPR is reversed, so we invert the bits of the index prior to truncating to the range 0-1 */ dag BE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61))); dag BE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VWORD_PERM_VEC)); dag BE_MV_VWORD = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)), sub_64)); dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58), sub_32); dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT), sub_32); /* BE variable doubleword Same as the LE doubleword except we shift in the VMX register for opposite element indices. */ dag BE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60))); dag BE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VDWORD_PERM_VEC)); dag BE_VARIABLE_DWORD = (MFVSRD (EXTRACT_SUBREG (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)), sub_64)); /* BE variable float - Shift the vector to line up the desired element to BE Word 0 - Convert 32-bit float to a 64-bit single precision float */ dag BE_VFLOAT_PERM_VEC = (v16i8 (LVSL ZERO8, (RLDICR $Idx, 2, 61))); dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC); dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE); /* BE variable double Same as the BE doubleword except there is no move. */ dag BE_VDOUBLE_PERMUTE = (v16i8 (VPERM (v16i8 (COPY_TO_REGCLASS $S, VRRC)), (v16i8 (COPY_TO_REGCLASS $S, VRRC)), BE_VDWORD_PERM_VEC)); dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); } def NoP9Altivec : Predicate<"!PPCSubTarget->hasP9Altivec()">; let AddedComplexity = 400 in { // v4f32 scalar <-> vector conversions (BE) let Predicates = [IsBigEndian, HasP8Vector] in { def : Pat<(v4f32 (scalar_to_vector f32:$A)), (v4f32 (XSCVDPSPN $A))>; def : Pat<(f32 (vector_extract v4f32:$S, 0)), (f32 (XSCVSPDPN $S))>; def : Pat<(f32 (vector_extract v4f32:$S, 1)), (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; def : Pat<(f32 (vector_extract v4f32:$S, 2)), (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>; def : Pat<(f32 (vector_extract v4f32:$S, 3)), (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), (f32 VectorExtractions.BE_VARIABLE_FLOAT)>; } // IsBigEndian, HasP8Vector // Variable index vector_extract for v2f64 does not require P8Vector let Predicates = [IsBigEndian, HasVSX] in def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>; let Predicates = [IsBigEndian, HasDirectMove] in { // v16i8 scalar <-> vector conversions (BE) def : Pat<(v16i8 (scalar_to_vector i32:$A)), (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>; def : Pat<(v8i16 (scalar_to_vector i32:$A)), (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>; def : Pat<(v4i32 (scalar_to_vector i32:$A)), (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; // v2i64 scalar <-> vector conversions (BE) def : Pat<(i64 (vector_extract v2i64:$S, 0)), (i64 VectorExtractions.LE_DWORD_1)>; def : Pat<(i64 (vector_extract v2i64:$S, 1)), (i64 VectorExtractions.LE_DWORD_0)>; def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), (i64 VectorExtractions.BE_VARIABLE_DWORD)>; } // IsBigEndian, HasDirectMove let Predicates = [IsBigEndian, HasDirectMove, NoP9Altivec] in { def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 VectorExtractions.LE_BYTE_15)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), (i32 VectorExtractions.LE_BYTE_14)>; def : Pat<(i32 
(vector_extract v16i8:$S, 2)), (i32 VectorExtractions.LE_BYTE_13)>; def : Pat<(i32 (vector_extract v16i8:$S, 3)), (i32 VectorExtractions.LE_BYTE_12)>; def : Pat<(i32 (vector_extract v16i8:$S, 4)), (i32 VectorExtractions.LE_BYTE_11)>; def : Pat<(i32 (vector_extract v16i8:$S, 5)), (i32 VectorExtractions.LE_BYTE_10)>; def : Pat<(i32 (vector_extract v16i8:$S, 6)), (i32 VectorExtractions.LE_BYTE_9)>; def : Pat<(i32 (vector_extract v16i8:$S, 7)), (i32 VectorExtractions.LE_BYTE_8)>; def : Pat<(i32 (vector_extract v16i8:$S, 8)), (i32 VectorExtractions.LE_BYTE_7)>; def : Pat<(i32 (vector_extract v16i8:$S, 9)), (i32 VectorExtractions.LE_BYTE_6)>; def : Pat<(i32 (vector_extract v16i8:$S, 10)), (i32 VectorExtractions.LE_BYTE_5)>; def : Pat<(i32 (vector_extract v16i8:$S, 11)), (i32 VectorExtractions.LE_BYTE_4)>; def : Pat<(i32 (vector_extract v16i8:$S, 12)), (i32 VectorExtractions.LE_BYTE_3)>; def : Pat<(i32 (vector_extract v16i8:$S, 13)), (i32 VectorExtractions.LE_BYTE_2)>; def : Pat<(i32 (vector_extract v16i8:$S, 14)), (i32 VectorExtractions.LE_BYTE_1)>; def : Pat<(i32 (vector_extract v16i8:$S, 15)), (i32 VectorExtractions.LE_BYTE_0)>; def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), (i32 VectorExtractions.BE_VARIABLE_BYTE)>; // v8i16 scalar <-> vector conversions (BE) def : Pat<(i32 (vector_extract v8i16:$S, 0)), (i32 VectorExtractions.LE_HALF_7)>; def : Pat<(i32 (vector_extract v8i16:$S, 1)), (i32 VectorExtractions.LE_HALF_6)>; def : Pat<(i32 (vector_extract v8i16:$S, 2)), (i32 VectorExtractions.LE_HALF_5)>; def : Pat<(i32 (vector_extract v8i16:$S, 3)), (i32 VectorExtractions.LE_HALF_4)>; def : Pat<(i32 (vector_extract v8i16:$S, 4)), (i32 VectorExtractions.LE_HALF_3)>; def : Pat<(i32 (vector_extract v8i16:$S, 5)), (i32 VectorExtractions.LE_HALF_2)>; def : Pat<(i32 (vector_extract v8i16:$S, 6)), (i32 VectorExtractions.LE_HALF_1)>; def : Pat<(i32 (vector_extract v8i16:$S, 7)), (i32 VectorExtractions.LE_HALF_0)>; def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), (i32 VectorExtractions.BE_VARIABLE_HALF)>; // v4i32 scalar <-> vector conversions (BE) def : Pat<(i32 (vector_extract v4i32:$S, 0)), (i32 VectorExtractions.LE_WORD_3)>; def : Pat<(i32 (vector_extract v4i32:$S, 1)), (i32 VectorExtractions.LE_WORD_2)>; def : Pat<(i32 (vector_extract v4i32:$S, 2)), (i32 VectorExtractions.LE_WORD_1)>; def : Pat<(i32 (vector_extract v4i32:$S, 3)), (i32 VectorExtractions.LE_WORD_0)>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.BE_VARIABLE_WORD)>; } // IsBigEndian, HasDirectMove, NoP9Altivec // v4f32 scalar <-> vector conversions (LE) let Predicates = [IsLittleEndian, HasP8Vector] in { def : Pat<(v4f32 (scalar_to_vector f32:$A)), (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>; def : Pat<(f32 (vector_extract v4f32:$S, 0)), (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; def : Pat<(f32 (vector_extract v4f32:$S, 1)), (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>; def : Pat<(f32 (vector_extract v4f32:$S, 2)), (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; def : Pat<(f32 (vector_extract v4f32:$S, 3)), (f32 (XSCVSPDPN $S))>; def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), (f32 VectorExtractions.LE_VARIABLE_FLOAT)>; } // IsLittleEndian, HasP8Vector // Variable index vector_extract for v2f64 does not require P8Vector let Predicates = [IsLittleEndian, HasVSX] in def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X 
xoaddr:$src)>; // Variable index unsigned vector_extract on Power9 let Predicates = [HasP9Altivec, IsLittleEndian] in { def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), (VEXTUBRX $Idx, $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))), (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))), (VEXTUHRX (LI8 0), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))), (VEXTUHRX (LI8 2), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))), (VEXTUHRX (LI8 4), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))), (VEXTUHRX (LI8 6), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))), (VEXTUHRX (LI8 8), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))), (VEXTUHRX (LI8 10), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))), (VEXTUHRX (LI8 12), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))), (VEXTUHRX (LI8 14), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))), (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), (VEXTUWRX (LI8 0), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), (VEXTUWRX (LI8 4), $S)>; // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (i32 VectorExtractions.LE_WORD_2), sub_32)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), (VEXTUWRX (LI8 12), $S)>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))), (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), (EXTSW (VEXTUWRX (LI8 0), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), (EXTSW (VEXTUWRX (LI8 4), $S))>; // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (i32 VectorExtractions.LE_WORD_2), sub_32))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), (EXTSW (VEXTUWRX (LI8 12), $S))>; def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), (i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 2)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 3)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 4)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 5)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 6)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 7)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 8)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 9)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 10)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 11)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 
12)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 13)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 14)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 15)), (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), (i32 (EXTRACT_SUBREG (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 0)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 1)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 2)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 3)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 4)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 5)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 6)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 7)), (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 (EXTRACT_SUBREG (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>; def : Pat<(i32 (vector_extract v4i32:$S, 0)), (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 0), $S), sub_32))>; def : Pat<(i32 (vector_extract v4i32:$S, 1)), (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>; // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i32 (vector_extract v4i32:$S, 2)), (i32 VectorExtractions.LE_WORD_2)>; def : Pat<(i32 (vector_extract v4i32:$S, 3)), (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>; } let Predicates = [HasP9Altivec, IsBigEndian] in { def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), (VEXTUBLX $Idx, $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))), (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))), (VEXTUHLX (LI8 0), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))), (VEXTUHLX (LI8 2), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))), (VEXTUHLX (LI8 4), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))), (VEXTUHLX (LI8 6), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))), (VEXTUHLX (LI8 8), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))), (VEXTUHLX (LI8 10), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))), (VEXTUHLX (LI8 12), $S)>; def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))), (VEXTUHLX (LI8 14), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))), (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), (VEXTUWLX (LI8 0), $S)>; // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (i32 VectorExtractions.LE_WORD_2), sub_32)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), (VEXTUWLX (LI8 8), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), (VEXTUWLX (LI8 12), $S)>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))), (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>; def : Pat<(i64 (sext (i32 
(vector_extract v4i32:$S, 0)))), (EXTSW (VEXTUWLX (LI8 0), $S))>; // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (i32 VectorExtractions.LE_WORD_2), sub_32))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), (EXTSW (VEXTUWLX (LI8 8), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), (EXTSW (VEXTUWLX (LI8 12), $S))>; def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), (i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 2)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 3)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 4)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 5)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 6)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 7)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 8)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 9)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 10)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 11)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 12)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 13)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 14)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>; def : Pat<(i32 (vector_extract v16i8:$S, 15)), (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), (i32 (EXTRACT_SUBREG (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 0)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 1)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 2)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 3)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 4)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 5)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 6)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>; def : Pat<(i32 (vector_extract v8i16:$S, 7)), (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 (EXTRACT_SUBREG (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>; def : Pat<(i32 (vector_extract v4i32:$S, 0)), (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>; // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i32 (vector_extract v4i32:$S, 1)), (i32 VectorExtractions.LE_WORD_2)>; def : Pat<(i32 
(vector_extract v4i32:$S, 2)), (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>; def : Pat<(i32 (vector_extract v4i32:$S, 3)), (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>; } let Predicates = [IsLittleEndian, HasDirectMove] in { // v16i8 scalar <-> vector conversions (LE) def : Pat<(v16i8 (scalar_to_vector i32:$A)), (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; def : Pat<(v8i16 (scalar_to_vector i32:$A)), (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; def : Pat<(v4i32 (scalar_to_vector i32:$A)), (v4i32 MovesToVSR.LE_WORD_0)>; def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 MovesToVSR.LE_DWORD_0)>; // v2i64 scalar <-> vector conversions (LE) def : Pat<(i64 (vector_extract v2i64:$S, 0)), (i64 VectorExtractions.LE_DWORD_0)>; def : Pat<(i64 (vector_extract v2i64:$S, 1)), (i64 VectorExtractions.LE_DWORD_1)>; def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), (i64 VectorExtractions.LE_VARIABLE_DWORD)>; } // IsLittleEndian, HasDirectMove let Predicates = [IsLittleEndian, HasDirectMove, NoP9Altivec] in { def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 VectorExtractions.LE_BYTE_0)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), (i32 VectorExtractions.LE_BYTE_1)>; def : Pat<(i32 (vector_extract v16i8:$S, 2)), (i32 VectorExtractions.LE_BYTE_2)>; def : Pat<(i32 (vector_extract v16i8:$S, 3)), (i32 VectorExtractions.LE_BYTE_3)>; def : Pat<(i32 (vector_extract v16i8:$S, 4)), (i32 VectorExtractions.LE_BYTE_4)>; def : Pat<(i32 (vector_extract v16i8:$S, 5)), (i32 VectorExtractions.LE_BYTE_5)>; def : Pat<(i32 (vector_extract v16i8:$S, 6)), (i32 VectorExtractions.LE_BYTE_6)>; def : Pat<(i32 (vector_extract v16i8:$S, 7)), (i32 VectorExtractions.LE_BYTE_7)>; def : Pat<(i32 (vector_extract v16i8:$S, 8)), (i32 VectorExtractions.LE_BYTE_8)>; def : Pat<(i32 (vector_extract v16i8:$S, 9)), (i32 VectorExtractions.LE_BYTE_9)>; def : Pat<(i32 (vector_extract v16i8:$S, 10)), (i32 VectorExtractions.LE_BYTE_10)>; def : Pat<(i32 (vector_extract v16i8:$S, 11)), (i32 VectorExtractions.LE_BYTE_11)>; def : Pat<(i32 (vector_extract v16i8:$S, 12)), (i32 VectorExtractions.LE_BYTE_12)>; def : Pat<(i32 (vector_extract v16i8:$S, 13)), (i32 VectorExtractions.LE_BYTE_13)>; def : Pat<(i32 (vector_extract v16i8:$S, 14)), (i32 VectorExtractions.LE_BYTE_14)>; def : Pat<(i32 (vector_extract v16i8:$S, 15)), (i32 VectorExtractions.LE_BYTE_15)>; def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), (i32 VectorExtractions.LE_VARIABLE_BYTE)>; // v8i16 scalar <-> vector conversions (LE) def : Pat<(i32 (vector_extract v8i16:$S, 0)), (i32 VectorExtractions.LE_HALF_0)>; def : Pat<(i32 (vector_extract v8i16:$S, 1)), (i32 VectorExtractions.LE_HALF_1)>; def : Pat<(i32 (vector_extract v8i16:$S, 2)), (i32 VectorExtractions.LE_HALF_2)>; def : Pat<(i32 (vector_extract v8i16:$S, 3)), (i32 VectorExtractions.LE_HALF_3)>; def : Pat<(i32 (vector_extract v8i16:$S, 4)), (i32 VectorExtractions.LE_HALF_4)>; def : Pat<(i32 (vector_extract v8i16:$S, 5)), (i32 VectorExtractions.LE_HALF_5)>; def : Pat<(i32 (vector_extract v8i16:$S, 6)), (i32 VectorExtractions.LE_HALF_6)>; def : Pat<(i32 (vector_extract v8i16:$S, 7)), (i32 VectorExtractions.LE_HALF_7)>; def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), (i32 VectorExtractions.LE_VARIABLE_HALF)>; // v4i32 scalar <-> vector conversions (LE) def : Pat<(i32 (vector_extract v4i32:$S, 0)), (i32 VectorExtractions.LE_WORD_0)>; def : Pat<(i32 (vector_extract v4i32:$S, 1)), (i32 VectorExtractions.LE_WORD_1)>; def : Pat<(i32 (vector_extract v4i32:$S, 2)), (i32 
VectorExtractions.LE_WORD_2)>; def : Pat<(i32 (vector_extract v4i32:$S, 3)), (i32 VectorExtractions.LE_WORD_3)>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.LE_VARIABLE_WORD)>; } // IsLittleEndian, HasDirectMove, NoP9Altivec let Predicates = [HasDirectMove, HasVSX] in { // bitconvert f32 -> i32 // (convert to 32-bit fp single, shift right 1 word, move to GPR) def : Pat<(i32 (bitconvert f32:$S)), (i32 (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3), sub_64)))>; // bitconvert i32 -> f32 // (move to FPR, shift left 1 word, convert to 64-bit fp single) def : Pat<(f32 (bitconvert i32:$A)), (f32 (XSCVSPDPN (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>; // bitconvert f64 -> i64 // (move to GPR, nothing else needed) def : Pat<(i64 (bitconvert f64:$S)), (i64 (MFVSRD $S))>; // bitconvert i64 -> f64 // (move to FPR, nothing else needed) def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; } // Materialize a zero-vector of long long def : Pat<(v2i64 immAllZerosV), (v2i64 (XXLXORz))>; } def AlignValues { dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3)); dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC); } // The following VSX instructions were introduced in Power ISA 3.0 def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">; let AddedComplexity = 400, Predicates = [HasP9Vector] in { // [PO VRT XO VRB XO /] class X_VT5_XO5_VB5 opcode, bits<5> xo2, bits<10> xo, string opc, list pattern> : X_RD5_XO5_RS5; // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /] class X_VT5_XO5_VB5_Ro opcode, bits<5> xo2, bits<10> xo, string opc, list pattern> : X_VT5_XO5_VB5, isDOT; // [PO VRT XO VRB XO /], but the VRB is only used the left 64 bits (or less), // So we use different operand class for VRB class X_VT5_XO5_VB5_TyVB opcode, bits<5> xo2, bits<10> xo, string opc, RegisterOperand vbtype, list pattern> : X_RD5_XO5_RS5; // [PO VRT XO VRB XO /] class X_VT5_XO5_VB5_VSFR opcode, bits<5> xo2, bits<10> xo, string opc, list pattern> : X_RD5_XO5_RS5; // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /] class X_VT5_XO5_VB5_VSFR_Ro opcode, bits<5> xo2, bits<10> xo, string opc, list pattern> : X_VT5_XO5_VB5_VSFR, isDOT; let UseVSXReg = 1 in { // [PO T XO B XO BX /] class XX2_RT5_XO5_XB6 opcode, bits<5> xo2, bits<9> xo, string opc, list pattern> : XX2_RD5_XO5_RS6; // [PO T XO B XO BX TX] class XX2_XT6_XO5_XB6 opcode, bits<5> xo2, bits<9> xo, string opc, RegisterOperand vtype, list pattern> : XX2_RD6_XO5_RS6; // [PO T A B XO AX BX TX], src and dest register use different operand class class XX3_XT5_XA5_XB5 opcode, bits<8> xo, string opc, RegisterOperand xty, RegisterOperand aty, RegisterOperand bty, InstrItinClass itin, list pattern> : XX3Form; } // UseVSXReg = 1 // [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5 opcode, bits<10> xo, string opc, list pattern> : XForm_1; // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_Ro opcode, bits<10> xo, string opc, list pattern> : X_VT5_VA5_VB5, isDOT; // [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_FMA opcode, bits<10> xo, string opc, list pattern> : XForm_1, RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">; // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_FMA_Ro opcode, bits<10> xo, string opc, list pattern> : X_VT5_VA5_VB5_FMA, isDOT; //===--------------------------------------------------------------------===// // Quad-Precision Scalar Move Instructions: // Copy Sign def 
XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp", [(set f128:$vT, (fcopysign f128:$vB, f128:$vA))]>; // Absolute/Negative-Absolute/Negate def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp", [(set f128:$vT, (fabs f128:$vB))]>; def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp", [(set f128:$vT, (fneg (fabs f128:$vB)))]>; def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp", [(set f128:$vT, (fneg f128:$vB))]>; //===--------------------------------------------------------------------===// // Quad-Precision Scalar Floating-Point Arithmetic Instructions: // Add/Divide/Multiply/Subtract let isCommutable = 1 in { def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp", [(set f128:$vT, (fadd f128:$vA, f128:$vB))]>; def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo", [(set f128:$vT, (int_ppc_addf128_round_to_odd f128:$vA, f128:$vB))]>; def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp", [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>; def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo", [(set f128:$vT, (int_ppc_mulf128_round_to_odd f128:$vA, f128:$vB))]>; } def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" , [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>; def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", [(set f128:$vT, (int_ppc_subf128_round_to_odd f128:$vA, f128:$vB))]>; def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp", [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>; def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", [(set f128:$vT, (int_ppc_divf128_round_to_odd f128:$vA, f128:$vB))]>; // Square-Root def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp", [(set f128:$vT, (fsqrt f128:$vB))]>; def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", [(set f128:$vT, (int_ppc_sqrtf128_round_to_odd f128:$vB))]>; // (Negative) Multiply-{Add/Subtract} def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp", [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>; def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo", [(set f128:$vT, (int_ppc_fmaf128_round_to_odd f128:$vA,f128:$vB,f128:$vTi))]>; def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" , [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>; def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" , [(set f128:$vT, (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, (fneg f128:$vTi)))]>; def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp", [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>; def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo", [(set f128:$vT, (fneg (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, f128:$vTi)))]>; def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp", [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>; def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo", [(set f128:$vT, (fneg (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, (fneg f128:$vTi))))]>; // Additional fnmsub patterns: -a*c + b == -(a*c - b) def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>; def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>; //===--------------------------------------------------------------------===// // Quad/Double-Precision Compare Instructions: // [PO BF // VRA VRB XO /] class X_BF3_VA5_VB5 opcode, bits<10> xo, string opc, list pattern> : XForm_17 { let Pattern = pattern; } // QP Compare Ordered/Unordered def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>; def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>; // DP/QP Compare Exponents def XSCMPEXPDP : XX3Form_1<60, 59, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), "xscmpexpdp $crD, $XA, $XB", 
IIC_FPCompare, []>, UseVSXReg; def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>; // DP Compare ==, >=, >, != // Use vsrc for XT, because the entire register of XT is set. // XT.dword[1] = 0x0000_0000_0000_0000 def XSCMPEQDP : XX3_XT5_XA5_XB5<60, 3, "xscmpeqdp", vsrc, vsfrc, vsfrc, IIC_FPCompare, []>; def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc, IIC_FPCompare, []>; def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc, IIC_FPCompare, []>; //===--------------------------------------------------------------------===// // Quad-Precision Floating-Point Conversion Instructions: // Convert DP -> QP def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc, [(set f128:$vT, (fpextend f64:$vB))]>; // Round & Convert QP -> DP (dword[1] is set to zero) def XSCVQPDP : X_VT5_XO5_VB5_VSFR<63, 20, 836, "xscvqpdp" , []>; def XSCVQPDPO : X_VT5_XO5_VB5_VSFR_Ro<63, 20, 836, "xscvqpdpo", [(set f64:$vT, (int_ppc_truncf128_round_to_odd f128:$vB))]>; // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero) def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>; def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>; def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>; def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>; // Convert (Un)Signed DWord -> QP. def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>; def : Pat<(f128 (sint_to_fp i64:$src)), (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>; def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))), (f128 (XSCVSDQP $src))>; def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))), (f128 (XSCVSDQP (VEXTSW2Ds $src)))>; def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>; def : Pat<(f128 (uint_to_fp i64:$src)), (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>; def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))), (f128 (XSCVUDQP $src))>; // Convert (Un)Signed Word -> QP. def : Pat<(f128 (sint_to_fp i32:$src)), (f128 (XSCVSDQP (MTVSRWA $src)))>; def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))), (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>; def : Pat<(f128 (uint_to_fp i32:$src)), (f128 (XSCVUDQP (MTVSRWZ $src)))>; def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))), (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>; let UseVSXReg = 1 in { //===--------------------------------------------------------------------===// // Round to Floating-Point Integer Instructions // (Round &) Convert DP <-> HP // Note! xscvdphp's src and dest register both use the left 64 bits, so we use // vsfrc for src and dest register. xscvhpdp's src only use the left 16 bits, // but we still use vsfrc for it. def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>; def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>; // Vector HP -> SP def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>; def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc, [(set v4f32:$XT, (int_ppc_vsx_xvcvsphp v4f32:$XB))]>; } // UseVSXReg = 1 // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a // separate pattern so that it can convert the input register class from // VRRC(v8i16) to VSRC. 
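// The COPY_TO_REGCLASS below normally lowers to a plain register copy (the
// Altivec VRs are the upper half of the VSX register file), so the only real
// work in this expansion is the XVCVHPSP itself.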
def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)), (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>; class Z23_VT5_R1_VB5_RMC2_EX1 opcode, bits<8> xo, bit ex, string opc, list pattern> : Z23Form_8 { let RC = ex; } // Round to Quad-Precision Integer [with Inexact] def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>; def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>; // Use current rounding mode def : Pat<(f128 (fnearbyint f128:$vB)), (f128 (XSRQPI 0, $vB, 3))>; // Round to nearest, ties away from zero def : Pat<(f128 (fround f128:$vB)), (f128 (XSRQPI 0, $vB, 0))>; // Round towards Zero def : Pat<(f128 (ftrunc f128:$vB)), (f128 (XSRQPI 1, $vB, 1))>; // Round towards +Inf def : Pat<(f128 (fceil f128:$vB)), (f128 (XSRQPI 1, $vB, 2))>; // Round towards -Inf def : Pat<(f128 (ffloor f128:$vB)), (f128 (XSRQPI 1, $vB, 3))>; // Use current rounding mode, [with Inexact] def : Pat<(f128 (frint f128:$vB)), (f128 (XSRQPIX 0, $vB, 3))>; // Round Quad-Precision to Double-Extended Precision (fp80) def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>; //===--------------------------------------------------------------------===// // Insert/Extract Instructions // Insert Exponent DP/QP // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB), "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>, UseVSXReg; // vB NOTE: only vB.dword[0] is used, that's why we don't use // X_VT5_VA5_VB5 form def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB), "xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>; // Extract Exponent/Significand DP/QP def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>; def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>; def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>; def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>; // Vector Insert Word let UseVSXReg = 1 in { // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB. 
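// (The 4-bit UIM operand of XXINSERTW/XXEXTRACTUW below is a byte offset into
// the 16-byte vector, so word element N maps to UIM = 4*N on big-endian
// subtargets and UIM = 12 - 4*N on little-endian ones; the endian-specific
// patterns later in the file encode exactly that mapping.)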
def XXINSERTW : XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM), "xxinsertw $XT, $XB, $UIM", IIC_VecFP, [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB, imm32SExt16:$UIM))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; // Vector Extract Unsigned Word def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165, (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM), "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>; } // UseVSXReg = 1 // Vector Insert Exponent DP/SP def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc, IIC_VecFP, [(set v2f64: $XT,(int_ppc_vsx_xviexpdp v2i64:$XA, v2i64:$XB))]>; def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc, IIC_VecFP, [(set v4f32: $XT,(int_ppc_vsx_xviexpsp v4i32:$XA, v4i32:$XB))]>; // Vector Extract Exponent/Significand DP/SP def XVXEXPDP : XX2_XT6_XO5_XB6<60, 0, 475, "xvxexpdp", vsrc, [(set v2i64: $XT, (int_ppc_vsx_xvxexpdp v2f64:$XB))]>; def XVXEXPSP : XX2_XT6_XO5_XB6<60, 8, 475, "xvxexpsp", vsrc, [(set v4i32: $XT, (int_ppc_vsx_xvxexpsp v4f32:$XB))]>; def XVXSIGDP : XX2_XT6_XO5_XB6<60, 1, 475, "xvxsigdp", vsrc, [(set v2i64: $XT, (int_ppc_vsx_xvxsigdp v2f64:$XB))]>; def XVXSIGSP : XX2_XT6_XO5_XB6<60, 9, 475, "xvxsigsp", vsrc, [(set v4i32: $XT, (int_ppc_vsx_xvxsigsp v4f32:$XB))]>; let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Extra patterns expanding to vector Extract Word/Insert Word def : Pat<(v4i32 (int_ppc_vsx_xxinsertw v4i32:$A, v2i64:$B, imm:$IMM)), (v4i32 (XXINSERTW $A, $B, imm:$IMM))>; def : Pat<(v2i64 (int_ppc_vsx_xxextractuw v2i64:$A, imm:$IMM)), (v2i64 (COPY_TO_REGCLASS (XXEXTRACTUW $A, imm:$IMM), VSRC))>; } // AddedComplexity = 400, HasP9Vector //===--------------------------------------------------------------------===// // Test Data Class SP/DP/QP let UseVSXReg = 1 in { def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298, (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB), "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>; def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362, (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB), "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>; } // UseVSXReg = 1 def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708, (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB), "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>; // Vector Test Data Class SP/DP let UseVSXReg = 1 in { def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5, (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP, [(set v4i32: $XT, (int_ppc_vsx_xvtstdcsp v4f32:$XB, imm:$DCMX))]>; def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5, (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP, [(set v2i64: $XT, (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>; } // UseVSXReg = 1 //===--------------------------------------------------------------------===// // Maximum/Minimum Type-C/Type-J DP // XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU, so we use vsrc for XT def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; //===--------------------------------------------------------------------===// // Vector Byte-Reverse H/W/D/Q Word def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>; def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, []>; def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>; def XXBRQ : 
XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>; // Vector Reverse def : Pat<(v8i16 (PPCxxreverse v8i16 :$A)), (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; def : Pat<(v4i32 (PPCxxreverse v4i32 :$A)), (v4i32 (XXBRW $A))>; def : Pat<(v2i64 (PPCxxreverse v2i64 :$A)), (v2i64 (XXBRD $A))>; def : Pat<(v1i128 (PPCxxreverse v1i128 :$A)), (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; // Vector Permute def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc, IIC_VecPerm, []>; def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc, IIC_VecPerm, []>; // Vector Splat Immediate Byte def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8), "xxspltib $XT, $IMM8", IIC_VecPerm, []>, UseVSXReg; //===--------------------------------------------------------------------===// // Vector/Scalar Load/Store Instructions // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging. let mayLoad = 1, mayStore = 0 in { // Load Vector def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src), "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg; // Load DWord def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src), "lxsd $vD, $src", IIC_LdStLFD, []>; // Load SP from src, convert it to DP, and place in dword[0] def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src), "lxssp $vD, $src", IIC_LdStLFD, []>; // [PO T RA RB XO TX] almost equal to [PO S RA RB XO SX], but has different // "out" and "in" dag class X_XT6_RA5_RB5 opcode, bits<10> xo, string opc, RegisterOperand vtype, list pattern> : XX1Form_memOp, UseVSXReg; // Load as Integer Byte/Halfword & Zero Indexed def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc, [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>; def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc, [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>; // Load Vector Halfword*8/Byte*16 Indexed def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>; def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>; // Load Vector Indexed def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, [(set v2f64:$XT, (load xaddr:$src))]>; // Load Vector (Left-justified) with Length def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvl $XT, $src, $rB", IIC_LdStLoad, [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>, UseVSXReg; def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvll $XT, $src, $rB", IIC_LdStLoad, [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>, UseVSXReg; // Load Vector Word & Splat Indexed def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>; } // mayLoad // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging. 
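// (As with the loads above, the stores below mix addressing forms: LXV/STXV
// take the DQ-Form memrix16 operand, whose displacement must be a multiple of
// 16, while LXSD/LXSSP and STXSD/STXSSP take the DS-Form memrix operand, whose
// displacement must be a multiple of 4.)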
let mayStore = 1, mayLoad = 0 in { // Store Vector def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst), "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg; // Store DWord def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst), "stxsd $vS, $dst", IIC_LdStSTFD, []>; // Convert DP of dword[0] to SP, and Store to dst def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst), "stxssp $vS, $dst", IIC_LdStSTFD, []>; // [PO S RA RB XO SX] class X_XS6_RA5_RB5 opcode, bits<10> xo, string opc, RegisterOperand vtype, list pattern> : XX1Form_memOp, UseVSXReg; // Store as Integer Byte/Halfword Indexed def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc, [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>; def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc, [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>; let isCodeGenOnly = 1 in { def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vrrc, []>; def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vrrc, []>; } // Store Vector Halfword*8/Byte*16 Indexed def STXVH8X : X_XS6_RA5_RB5<31, 940, "stxvh8x" , vsrc, []>; def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>; // Store Vector Indexed def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, [(store v2f64:$XT, xaddr:$dst)]>; // Store Vector (Left-justified) with Length def STXVL : XX1Form_memOp<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), "stxvl $XT, $dst, $rB", IIC_LdStLoad, [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst, i64:$rB)]>, UseVSXReg; def STXVLL : XX1Form_memOp<31, 429, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), "stxvll $XT, $dst, $rB", IIC_LdStLoad, [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst, i64:$rB)]>, UseVSXReg; } // mayStore let Predicates = [IsLittleEndian] in { def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>; def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>; def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>; def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>; } let Predicates = [IsBigEndian] in { def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>; def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>; def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>; def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))), (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP 
(XXSPLTW $A, 1)), VSFRC))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>; def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))), (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>; } // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead // of f64 def : Pat<(v8i16 (PPCmtvsrz i32:$A)), (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>; def : Pat<(v16i8 (PPCmtvsrz i32:$A)), (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>; // Patterns for which instructions from ISA 3.0 are a better match let Predicates = [IsLittleEndian, HasP9Vector] in { def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>; } // IsLittleEndian, HasP9Vector let Predicates = [IsBigEndian, HasP9Vector] in { def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>; def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 
3)))))), (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
} // IsBigEndian, HasP9Vector
// D-Form Load/Store
def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(f128 (quadwOffsetLoad iqaddr:$src)), (COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>;
def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(quadwOffsetStore f128:$rS, iqaddr:$dst), (STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)), (COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst), (STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), (v4f32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (fpround (f64 (extloadf32 xoaddr:$src)))))),
(v4f32 (LXVWSX xoaddr:$src))>; // Build vectors from i8 loads def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)), (v16i8 (VSPLTBs 7, (LXSIBZX xoaddr:$src)))>; def : Pat<(v8i16 (scalar_to_vector ScalarLoads.ZELi8)), (v8i16 (VSPLTHs 3, (LXSIBZX xoaddr:$src)))>; def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)), (v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>; def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)), (v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>; def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)), (v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>; def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)), (v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>; // Build vectors from i16 loads def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)), (v8i16 (VSPLTHs 3, (LXSIHZX xoaddr:$src)))>; def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)), (v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>; def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)), (v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>; def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)), (v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>; def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)), (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>; let Predicates = [IsBigEndian, HasP9Vector] in { // Scalar stores of i8 def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst), (STXSIBXv $S, xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; // Scalar stores of i16 def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 
(vector_extract v8i16:$S, 1)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst), (STXSIHXv $S, xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; } // IsBigEndian, HasP9Vector let Predicates = [IsLittleEndian, HasP9Vector] in { // Scalar stores of i8 def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 7)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 5)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 3)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 1)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst), (STXSIBXv $S, xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 15)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 13)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 11)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst), (STXSIBXv (v16i8 (VSLDOI $S, $S, 9)), xoaddr:$dst)>; // Scalar stores of i16 def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 8)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 6)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 4)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 2)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract 
v8i16:$S, 4)), xoaddr:$dst), (STXSIHXv $S, xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 14)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 12)), xoaddr:$dst)>; def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst), (STXSIHXv (v16i8 (VSLDOI $S, $S, 10)), xoaddr:$dst)>; } // IsLittleEndian, HasP9Vector // Vector sign extensions def : Pat<(f64 (PPCVexts f64:$A, 1)), (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>; def : Pat<(f64 (PPCVexts f64:$A, 2)), (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>; let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", [(store f64:$XT, ixaddr:$dst)]>; } def : Pat<(f64 (extloadf32 ixaddr:$src)), (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))), (f32 (DFLOADf32 ixaddr:$src))>; let Predicates = [IsBigEndian, HasP9Vector] in { // (Un)Signed DWord vector extract -> QP def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))), (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>; def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>; def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))), (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>; def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))), (f128 (XSCVUDQP (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>; // (Un)Signed Word vector extract -> QP def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 1)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>; foreach Idx = [0,2,3] in { def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, Idx)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D (VSPLTW Idx, $src)), sub_64)))>; } foreach Idx = 0-3 in { def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, Idx)))), (f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>; } // (Un)Signed HWord vector extract -> QP foreach Idx = 0-7 in { def : Pat<(f128 (sint_to_fp (i32 (sext_inreg (vector_extract v8i16:$src, Idx), i16)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)), sub_64)))>; // The SDAG adds the `and` since an `i16` is being extracted as an `i32`. 
def : Pat<(f128 (uint_to_fp (and (i32 (vector_extract v8i16:$src, Idx)), 65535))), (f128 (XSCVUDQP (EXTRACT_SUBREG (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
}
// (Un)Signed Byte vector extract -> QP
foreach Idx = 0-15 in {
def : Pat<(f128 (sint_to_fp (i32 (sext_inreg (vector_extract v16i8:$src, Idx), i8)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSB2D (VEXTRACTUB Idx, $src)), sub_64)))>;
def : Pat<(f128 (uint_to_fp (and (i32 (vector_extract v16i8:$src, Idx)), 255))), (f128 (XSCVUDQP (EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
}
// Unsigned int in vsx register -> QP
def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))), (f128 (XSCVUDQP (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
} // IsBigEndian, HasP9Vector
let Predicates = [IsLittleEndian, HasP9Vector] in {
// (Un)Signed DWord vector extract -> QP
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))), (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))), (f128 (XSCVUDQP (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))), (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
// (Un)Signed Word vector extract -> QP
foreach Idx = [[0,3],[1,2],[3,0]] in {
def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D (VSPLTW !head(!tail(Idx)), $src)), sub_64)))>;
}
def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 2)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
foreach Idx = [[0,12],[1,8],[2,4],[3,0]] in {
def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))), (f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
}
// (Un)Signed HWord vector extract -> QP
// The nested foreach lists identify the vector element and corresponding
// register byte location.
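// For example, the pair [0,14] below says that halfword element 0 of a
// little-endian v8i16 lives in register bytes 14-15, so VEXTRACTUH is given
// byte offset 14 for it.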
foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
def : Pat<(f128 (sint_to_fp (i32 (sext_inreg (vector_extract v8i16:$src, !head(Idx)), i16)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSH2D (VEXTRACTUH !head(!tail(Idx)), $src)), sub_64)))>;
def : Pat<(f128 (uint_to_fp (and (i32 (vector_extract v8i16:$src, !head(Idx))), 65535))), (f128 (XSCVUDQP (EXTRACT_SUBREG (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
}
// (Un)Signed Byte vector extract -> QP
foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
               [9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
def : Pat<(f128 (sint_to_fp (i32 (sext_inreg (vector_extract v16i8:$src, !head(Idx)), i8)))), (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSB2D (VEXTRACTUB !head(!tail(Idx)), $src)), sub_64)))>;
def : Pat<(f128 (uint_to_fp (and (i32 (vector_extract v16i8:$src, !head(Idx))), 255))), (f128 (XSCVUDQP (EXTRACT_SUBREG (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
}
// Unsigned int in vsx register -> QP
def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))), (f128 (XSCVUDQP (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 8)))>;
} // IsLittleEndian, HasP9Vector
// Convert (Un)Signed DWord in memory -> QP
def : Pat<(f128 (sint_to_fp (i64 (load xaddr:$src)))), (f128 (XSCVSDQP (LXSDX xaddr:$src)))>;
def : Pat<(f128 (sint_to_fp (i64 (load ixaddr:$src)))), (f128 (XSCVSDQP (LXSD ixaddr:$src)))>;
def : Pat<(f128 (uint_to_fp (i64 (load xaddr:$src)))), (f128 (XSCVUDQP (LXSDX xaddr:$src)))>;
def : Pat<(f128 (uint_to_fp (i64 (load ixaddr:$src)))), (f128 (XSCVUDQP (LXSD ixaddr:$src)))>;
// Convert Unsigned HWord in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)), (f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
// Convert Unsigned Byte in memory -> QP
def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)), (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
// Truncate & Convert QP -> (Un)Signed (D)Word.
def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
def : Pat<(i32 (fp_to_sint f128:$src)), (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
def : Pat<(i32 (fp_to_uint f128:$src)), (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
// Instructions for store(fptosi).
// The 8-byte version is repeated here due to availability of D-Form STXSD.
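// (The X-Form STXSDX covers the register+register (xaddr) case and the
// D-Form STXSD covers register+displacement (ixaddr), so both 8-byte store
// patterns are listed.)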
def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddr:$dst, 8), (STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), xaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), ixaddr:$dst, 8), (STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), ixaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4), (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2), (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1), (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddr:$dst, 8), (STXSDX (XSCVDPSXDS f64:$src), xaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), ixaddr:$dst, 8), (STXSD (XSCVDPSXDS f64:$src), ixaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2), (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1), (STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>; // Instructions for store(fptoui). def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddr:$dst, 8), (STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), xaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), ixaddr:$dst, 8), (STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), ixaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4), (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2), (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1), (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddr:$dst, 8), (STXSDX (XSCVDPUXDS f64:$src), xaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), ixaddr:$dst, 8), (STXSD (XSCVDPUXDS f64:$src), ixaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2), (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>; def : Pat<(PPCstore_scal_int_from_vsr (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1), (STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>; // Round & Convert QP -> DP/SP def : Pat<(f64 (fpround f128:$src)), (f64 (XSCVQPDP $src))>; def : Pat<(f32 (fpround f128:$src)), (f32 (XSRSP (XSCVQPDPO $src)))>; // Convert SP -> QP def : Pat<(f128 (fpextend f32:$src)), (f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>; } // end HasP9Vector, AddedComplexity let AddedComplexity = 400 in { let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsBigEndian] in { def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)), (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>; } let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsLittleEndian] in { def : Pat<(f128 (PPCbuild_fp128 i64:$rA, i64:$rB)), (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>; } } let 
Predicates = [HasP9Vector] in { let isPseudo = 1 in { let mayStore = 1 in { def SPILLTOVSR_STX : PseudoXFormMemOp<(outs), (ins spilltovsrrc:$XT, memrr:$dst), "#SPILLTOVSR_STX", []>; def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst), "#SPILLTOVSR_ST", []>; } let mayLoad = 1 in { def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT), (ins memrr:$src), "#SPILLTOVSR_LDX", []>; def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src), "#SPILLTOVSR_LD", []>; } } } // Integer extend helper dags 32 -> 64 def AnyExts { dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32); dag B = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $B, sub_32); dag C = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $C, sub_32); dag D = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $D, sub_32); } def DblToFlt { dag A0 = (f32 (fpround (f64 (extractelt v2f64:$A, 0)))); dag A1 = (f32 (fpround (f64 (extractelt v2f64:$A, 1)))); dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0)))); dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1)))); } +def ExtDbl { + dag A0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 0)))))); + dag A1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 1)))))); + dag B0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 0)))))); + dag B1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 1)))))); + dag A0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 0)))))); + dag A1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 1)))))); + dag B0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 0)))))); + dag B1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 1)))))); +} + def ByteToWord { dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8)); dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8)); dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8)); dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8)); dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8)); dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8)); } def ByteToDWord { dag LE_A0 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8)); dag LE_A1 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8)); dag BE_A0 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8)); dag BE_A1 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8)); } def HWordToWord { dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16)); dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16)); dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16)); dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16)); dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16)); dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16)); dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16)); dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16)); } def HWordToDWord { dag LE_A0 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16)); dag LE_A1 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16)); dag BE_A0 = (i64 (sext_inreg (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16)); dag BE_A1 = (i64 (sext_inreg 
(i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16)); } def WordToDWord { dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0)))); dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2)))); dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1)))); dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3)))); } def FltToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A))))); } def FltToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } def FltToULongLoadP9 { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr (f64 (PPCfctidz (fpextend f32:$A))))); } def FltToULong { dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz (fpextend f32:$A))))); } def DblToInt { dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A)))); + dag B = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$B)))); + dag C = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$C)))); + dag D = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$D)))); } def DblToUInt { dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A)))); + dag B = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$B)))); + dag C = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$C)))); + dag D = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$D)))); } def DblToLong { dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A)))); } def DblToULong { dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz f64:$A)))); } def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); } def DblToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A))))); } // FP merge dags (for f32 -> v4f32) def MrgFP { dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), (COPY_TO_REGCLASS $C, VSRC), 0)); dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), (COPY_TO_REGCLASS $D, VSRC), 0)); dag ABhToFlt = (XVCVDPSP (XXPERMDI $A, $B, 0)); dag ABlToFlt = (XVCVDPSP (XXPERMDI $A, $B, 3)); dag BAhToFlt = (XVCVDPSP (XXPERMDI $B, $A, 0)); dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3)); } +// Word-element merge dags - conversions from f64 to i32 merged into vectors. +def MrgWords { + // For big endian, we merge low and hi doublewords (A, B). + dag A0B0 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 0)); + dag A1B1 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 3)); + dag CVA1B1S = (v4i32 (XVCVDPSXWS A1B1)); + dag CVA0B0S = (v4i32 (XVCVDPSXWS A0B0)); + dag CVA1B1U = (v4i32 (XVCVDPUXWS A1B1)); + dag CVA0B0U = (v4i32 (XVCVDPUXWS A0B0)); + + // For little endian, we merge low and hi doublewords (B, A). + dag B1A1 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 0)); + dag B0A0 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 3)); + dag CVB1A1S = (v4i32 (XVCVDPSXWS B1A1)); + dag CVB0A0S = (v4i32 (XVCVDPSXWS B0A0)); + dag CVB1A1U = (v4i32 (XVCVDPUXWS B1A1)); + dag CVB0A0U = (v4i32 (XVCVDPUXWS B0A0)); + + // For big endian, we merge hi doublewords of (A, C) and (B, D), convert + // then merge. 
+ dag AC = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$A, VSRC), + (COPY_TO_REGCLASS f64:$C, VSRC), 0)); + dag BD = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$B, VSRC), + (COPY_TO_REGCLASS f64:$D, VSRC), 0)); + dag CVACS = (v4i32 (XVCVDPSXWS AC)); + dag CVBDS = (v4i32 (XVCVDPSXWS BD)); + dag CVACU = (v4i32 (XVCVDPUXWS AC)); + dag CVBDU = (v4i32 (XVCVDPUXWS BD)); + + // For little endian, we merge hi doublewords of (D, B) and (C, A), convert + // then merge. + dag DB = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$D, VSRC), + (COPY_TO_REGCLASS f64:$B, VSRC), 0)); + dag CA = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$C, VSRC), + (COPY_TO_REGCLASS f64:$A, VSRC), 0)); + dag CVDBS = (v4i32 (XVCVDPSXWS DB)); + dag CVCAS = (v4i32 (XVCVDPSXWS CA)); + dag CVDBU = (v4i32 (XVCVDPUXWS DB)); + dag CVCAU = (v4i32 (XVCVDPUXWS CA)); +} + // Patterns for BUILD_VECTOR nodes. let AddedComplexity = 400 in { let Predicates = [HasVSX] in { // Build vectors of floating point converted to i32. def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A, DblToInt.A, DblToInt.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>; def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A, DblToUInt.A, DblToUInt.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>; def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)), (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>; def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)), (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>; def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; // Build vectors of floating point converted to i64. def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)), (v2i64 (XXPERMDIs (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>; def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)), (v2i64 (XXPERMDIs (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>; def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)), (v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>; def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)), (v2i64 (XVCVDPUXDS (LXVDSX xoaddr:$A)))>; } let Predicates = [HasVSX, NoP9Vector] in { // Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads). 
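// On pre-Power9 subtargets the scalar is brought in with the X-Form
// XFLOADf64/XFLOADf32 pseudos, converted with XSCVDP[SU]XWS, and splatted
// with XXSPLTW; the HasP9Vector block further down does largely the same job
// with the D-Form DFLOADf64/DFLOADf32 pseudos instead.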
def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A), VSFRC)), 0))>; } // Big endian, available on all targets with VSX let Predicates = [IsBigEndian, HasVSX] in { def : Pat<(v2f64 (build_vector f64:$A, f64:$B)), (v2f64 (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), (COPY_TO_REGCLASS $B, VSRC), 0))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)), (VMRGEW MrgFP.AC, MrgFP.BD)>; def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, DblToFlt.B0, DblToFlt.B1)), (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>; + + // Convert 4 doubles to a vector of ints. + def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B, + DblToInt.C, DblToInt.D)), + (v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>; + def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B, + DblToUInt.C, DblToUInt.D)), + (v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>; + def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S, + ExtDbl.B0S, ExtDbl.B1S)), + (v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>; + def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U, + ExtDbl.B0U, ExtDbl.B1U)), + (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>; } let Predicates = [IsLittleEndian, HasVSX] in { // Little endian, available on all targets with VSX def : Pat<(v2f64 (build_vector f64:$A, f64:$B)), (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), (COPY_TO_REGCLASS $A, VSRC), 0))>; def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)), (VMRGEW MrgFP.AC, MrgFP.BD)>; def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, DblToFlt.B0, DblToFlt.B1)), (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>; + + // Convert 4 doubles to a vector of ints. + def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B, + DblToInt.C, DblToInt.D)), + (v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>; + def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B, + DblToUInt.C, DblToUInt.D)), + (v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>; + def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S, + ExtDbl.B0S, ExtDbl.B1S)), + (v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>; + def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U, + ExtDbl.B0U, ExtDbl.B1U)), + (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>; } let Predicates = [HasDirectMove] in { // Endianness-neutral constant splat on P8 and newer targets. The reason // for this pattern is that on targets with direct moves, we don't expand // BUILD_VECTOR nodes for v4i32. def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A, immSExt5NonZero:$A, immSExt5NonZero:$A)), (v4i32 (VSPLTISW imm:$A))>; } let Predicates = [IsBigEndian, HasDirectMove, NoP9Vector] in { // Big endian integer vectors using direct moves. 
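// Each scalar is first moved into a VSX register with MTVSRD (64-bit) or
// MTVSRWZ (32-bit), and the halves are then merged with XXPERMDI, plus VMRGOW
// for the four-word case.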
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)), (v2i64 (XXPERMDI (COPY_TO_REGCLASS (MTVSRD $A), VSRC), (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), 0), (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), 0))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; } let Predicates = [IsLittleEndian, HasDirectMove, NoP9Vector] in { // Little endian integer vectors using direct moves. def : Pat<(v2i64 (build_vector i64:$A, i64:$B)), (v2i64 (XXPERMDI (COPY_TO_REGCLASS (MTVSRD $B), VSRC), (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), 0), (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 0))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; } let Predicates = [HasP9Vector] in { // Endianness-neutral patterns for const splats with ISA 3.0 instructions. def : Pat<(v4i32 (scalar_to_vector i32:$A)), (v4i32 (MTVSRWS $A))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), (v4i32 (MTVSRWS $A))>; def : Pat<(v16i8 (build_vector immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A)), (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>; def : Pat<(v16i8 immAllOnesV), (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; def : Pat<(v8i16 immAllOnesV), (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; def : Pat<(v4i32 immAllOnesV), (v4i32 (XXSPLTIB 255))>; def : Pat<(v2i64 immAllOnesV), (v2i64 (XXSPLTIB 255))>; def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in { def : Pat<(i64 (extractelt v2i64:$A, 1)), (i64 (MFVSRLD $A))>; // Better way to build integer vectors if we have MTVSRDD. Big endian. def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)), (v2i64 (MTVSRDD $rB, $rA))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), (VMRGOW (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC)), (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC)))>; } let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in { def : Pat<(i64 (extractelt v2i64:$A, 0)), (i64 (MFVSRLD $A))>; // Better way to build integer vectors if we have MTVSRDD. Little endian. 
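// (MTVSRDD packs two GPRs into a single VSX register, so a v2i64 build needs
// one move instead of the two MTVSRDs plus an XXPERMDI used on the pre-ISA 3.0
// direct-move targets above.)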
def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)), (v2i64 (MTVSRDD $rB, $rA))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), (VMRGOW (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC)), (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC)))>; } // P9 Altivec instructions that can be used to build vectors. // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete // with complexities of existing build vector patterns in this file. let Predicates = [HasP9Altivec, IsLittleEndian] in { def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)), (v2i64 (VEXTSW2D $A))>; def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)), (v2i64 (VEXTSH2D $A))>; def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1, HWordToWord.LE_A2, HWordToWord.LE_A3)), (v4i32 (VEXTSH2W $A))>; def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1, ByteToWord.LE_A2, ByteToWord.LE_A3)), (v4i32 (VEXTSB2W $A))>; def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)), (v2i64 (VEXTSB2D $A))>; } let Predicates = [HasP9Altivec, IsBigEndian] in { def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)), (v2i64 (VEXTSW2D $A))>; def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)), (v2i64 (VEXTSH2D $A))>; def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1, HWordToWord.BE_A2, HWordToWord.BE_A3)), (v4i32 (VEXTSH2W $A))>; def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1, ByteToWord.BE_A2, ByteToWord.BE_A3)), (v4i32 (VEXTSB2W $A))>; def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)), (v2i64 (VEXTSB2D $A))>; } let Predicates = [HasP9Altivec] in { def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)), (v2i64 (VEXTSB2D $A))>; def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)), (v2i64 (VEXTSH2D $A))>; def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)), (v2i64 (VEXTSW2D $A))>; def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)), (v4i32 (VEXTSB2W $A))>; def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)), (v4i32 (VEXTSH2W $A))>; } } Index: vendor/llvm/dist-release_70/lib/Target/X86/X86FastISel.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Target/X86/X86FastISel.cpp (revision 337298) +++ vendor/llvm/dist-release_70/lib/Target/X86/X86FastISel.cpp (revision 337299) @@ -1,4050 +1,4054 @@ //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the X86-specific support for the FastISel class. Much // of the target-specific code is generated by tablegen in the file // X86GenFastISel.inc, which is #included here. 
// //===----------------------------------------------------------------------===// #include "X86.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; namespace { class X86FastISel final : public FastISel { /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf64; bool X86ScalarSSEf32; public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo) { Subtarget = &funcInfo.MF->getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); } bool fastSelectInstruction(const Instruction *I) override; /// The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if /// possible. 
bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) override; bool fastLowerArguments() override; bool fastLowerCall(CallLoweringInfo &CLI) override; bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; #include "X86GenFastISel.inc" private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, const DebugLoc &DL); bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment = 1); bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); bool X86SelectAddress(const Value *V, X86AddressMode &AM); bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); bool X86SelectLoad(const Instruction *I); bool X86SelectStore(const Instruction *I); bool X86SelectRet(const Instruction *I); bool X86SelectCmp(const Instruction *I); bool X86SelectZExt(const Instruction *I); bool X86SelectSExt(const Instruction *I); bool X86SelectBranch(const Instruction *I); bool X86SelectShift(const Instruction *I); bool X86SelectDivRem(const Instruction *I); bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc, const TargetRegisterClass *RC); bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); bool X86SelectSIToFP(const Instruction *I); bool X86SelectUIToFP(const Instruction *I); bool X86SelectIntToFP(const Instruction *I, bool IsSigned); const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } const X86TargetMachine *getTargetMachine() const { return static_cast(&TM); } bool handleConstantAddresses(const Value *V, X86AddressMode &AM); unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); unsigned fastMaterializeConstant(const Constant *C) override; unsigned fastMaterializeAlloca(const AllocaInst *C) override; unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); bool IsMemcpySmall(uint64_t Len); bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, const Value *Cond); const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB, X86AddressMode &AM); unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, unsigned Op2, bool Op2IsKill, unsigned Op3, bool Op3IsKill); }; } // end anonymous namespace. 
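// Illustrative note (not in the original source): getX86SSEConditionCode
// below maps an IR fcmp predicate to the immediate used by the SSE/AVX
// scalar compare instructions, setting NeedSwap when only the mirrored
// predicate is encodable. For example FCMP_OGT yields {1 /*LT*/, true}, so
// the caller is expected to compare the operands in swapped order. The
// values 8 and 12 chosen for FCMP_UEQ and FCMP_ONE fall outside the 0-7
// range listed in the mapping comment and correspond to the extended AVX
// compare predicates.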
static std::pair getX86SSEConditionCode(CmpInst::Predicate Predicate) { unsigned CC; bool NeedSwap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (Predicate) { default: llvm_unreachable("Unexpected predicate"); case CmpInst::FCMP_OEQ: CC = 0; break; case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OLT: CC = 1; break; case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OLE: CC = 2; break; case CmpInst::FCMP_UNO: CC = 3; break; case CmpInst::FCMP_UNE: CC = 4; break; case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGE: CC = 5; break; case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGT: CC = 6; break; case CmpInst::FCMP_ORD: CC = 7; break; case CmpInst::FCMP_UEQ: CC = 8; break; case CmpInst::FCMP_ONE: CC = 12; break; } return std::make_pair(CC, NeedSwap); } /// Adds a complex addressing mode to the given machine instr builder. /// Note, this will constrain the index register. If its not possible to /// constrain the given index register, then a new one will be created. The /// IndexReg field of the addressing mode will be updated to match in this case. const MachineInstrBuilder & X86FastISel::addFullAddress(const MachineInstrBuilder &MIB, X86AddressMode &AM) { // First constrain the index register. It needs to be a GR64_NOSP. AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg, MIB->getNumOperands() + X86::AddrIndexReg); return ::addFullAddress(MIB, AM); } /// Check if it is possible to fold the condition from the XALU intrinsic /// into the user. The condition code will only be updated on success. bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, const Value *Cond) { if (!isa(Cond)) return false; const auto *EV = cast(Cond); if (!isa(EV->getAggregateOperand())) return false; const auto *II = cast(EV->getAggregateOperand()); MVT RetVT; const Function *Callee = II->getCalledFunction(); Type *RetTy = cast(Callee->getReturnType())->getTypeAtIndex(0U); if (!isTypeLegal(RetTy, RetVT)) return false; if (RetVT != MVT::i32 && RetVT != MVT::i64) return false; X86::CondCode TmpCC; switch (II->getIntrinsicID()) { default: return false; case Intrinsic::sadd_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; case Intrinsic::uadd_with_overflow: case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; } // Check if both instructions are in the same basic block. if (II->getParent() != I->getParent()) return false; // Make sure nothing is in the way BasicBlock::const_iterator Start(I); BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. if (!isa(Itr)) return false; // Check that the extractvalue operand comes from the intrinsic. const auto *EVI = cast(Itr); if (EVI->getAggregateOperand() != II) return false; } CC = TmpCC; return true; } bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) // Unhandled type. Halt "fast" selection and bail. return false; VT = evt.getSimpleVT(); // For now, require SSE/SSE2 for performing floating-point operations, // since x87 requires additional work. 
if (VT == MVT::f64 && !X86ScalarSSEf64) return false; if (VT == MVT::f32 && !X86ScalarSSEf32) return false; // Similarly, no f80 support yet. if (VT == MVT::f80) return false; // We only handle legal types. For example, on x86-32 the instruction // selector contains all of the 64-bit instructions from x86-64, // under the assumption that i64 won't be used if the target doesn't // support it. return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); } #include "X86GenCallingConv.inc" /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment) { bool HasSSE41 = Subtarget->hasSSE41(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); bool HasAVX512 = Subtarget->hasAVX512(); bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: Opc = X86::MOV8rm; RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; case MVT::f32: if (X86ScalarSSEf32) { Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; RC = &X86::RFP64RegClass; } break; case MVT::f80: // No f80 support yet. return false; case MVT::v4f32: if (IsNonTemporal && Alignment >= 16 && HasSSE41) Opc = HasVLX ? X86::VMOVNTDQAZ128rm : HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) Opc = HasVLX ? X86::VMOVAPSZ128rm : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm; else Opc = HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v2f64: if (IsNonTemporal && Alignment >= 16 && HasSSE41) Opc = HasVLX ? X86::VMOVNTDQAZ128rm : HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) Opc = HasVLX ? X86::VMOVAPDZ128rm : HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm; else Opc = HasVLX ? X86::VMOVUPDZ128rm : HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: if (IsNonTemporal && Alignment >= 16) Opc = HasVLX ? X86::VMOVNTDQAZ128rm : HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) Opc = HasVLX ? X86::VMOVDQA64Z128rm : HasAVX ? X86::VMOVDQArm : X86::MOVDQArm; else Opc = HasVLX ? X86::VMOVDQU64Z128rm : HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v8f32: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = HasVLX ? 
X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; else if (IsNonTemporal && Alignment >= 16) return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm; RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; else if (IsNonTemporal && Alignment >= 16) return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm; RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v8i32: case MVT::v4i64: case MVT::v16i16: case MVT::v32i8: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; else if (IsNonTemporal && Alignment >= 16) return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm; RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v16f32: assert(HasAVX512); if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm; RC = &X86::VR512RegClass; break; case MVT::v8f64: assert(HasAVX512); if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm; RC = &X86::VR512RegClass; break; case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: assert(HasAVX512); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the load isn't masked. if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm; RC = &X86::VR512RegClass; break; } ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addFullAddress(MIB, AM); if (MMO) MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } /// X86FastEmitStore - Emit a machine instruction to store a value Val of /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { bool HasSSE1 = Subtarget->hasSSE1(); bool HasSSE2 = Subtarget->hasSSE2(); bool HasSSE4A = Subtarget->hasSSE4A(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX512 = Subtarget->hasAVX512(); bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { case MVT::f80: // No f80 support yet. default: return false; case MVT::i1: { // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::AND8ri), AndResult) .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; LLVM_FALLTHROUGH; // handle i1 as i8. } case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; case MVT::i32: Opc = (IsNonTemporal && HasSSE2) ? 
X86::MOVNTImr : X86::MOV32mr; break; case MVT::i64: // Must be in x86-64 mode. Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; break; case MVT::f32: if (X86ScalarSSEf32) { if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSS; else Opc = HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; } else Opc = X86::ST_Fp32m; break; case MVT::f64: if (X86ScalarSSEf32) { if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSD; else Opc = HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; } else Opc = X86::ST_Fp64m; break; case MVT::x86mmx: Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr; break; case MVT::v4f32: if (Aligned) { if (IsNonTemporal) Opc = HasVLX ? X86::VMOVNTPSZ128mr : HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; else Opc = HasVLX ? X86::VMOVAPSZ128mr : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; } else Opc = HasVLX ? X86::VMOVUPSZ128mr : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: if (Aligned) { if (IsNonTemporal) Opc = HasVLX ? X86::VMOVNTPDZ128mr : HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; else Opc = HasVLX ? X86::VMOVAPDZ128mr : HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; } else Opc = HasVLX ? X86::VMOVUPDZ128mr : HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: if (Aligned) { if (IsNonTemporal) Opc = HasVLX ? X86::VMOVNTDQZ128mr : HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; else Opc = HasVLX ? X86::VMOVDQA64Z128mr : HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; } else Opc = HasVLX ? X86::VMOVDQU64Z128mr : HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr; break; case MVT::v8f32: assert(HasAVX); if (Aligned) { if (IsNonTemporal) Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr; else Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr; } else Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr; break; case MVT::v4f64: assert(HasAVX); if (Aligned) { if (IsNonTemporal) Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr; else Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr; } else Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr; break; case MVT::v8i32: case MVT::v4i64: case MVT::v16i16: case MVT::v32i8: assert(HasAVX); if (Aligned) { if (IsNonTemporal) Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr; else Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr; } else Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr; break; case MVT::v16f32: assert(HasAVX512); if (Aligned) Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr; else Opc = X86::VMOVUPSZmr; break; case MVT::v8f64: assert(HasAVX512); if (Aligned) { Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr; } else Opc = X86::VMOVUPDZmr; break; case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: assert(HasAVX512); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the store isn't masked. if (Aligned) Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr; else Opc = X86::VMOVDQU64Zmr; break; } const MCInstrDesc &Desc = TII.get(Opc); // Some of the instructions in the previous switch use FR128 instead // of FR32 for ValReg. Make sure the register we feed the instruction // matches its register class constraints. // Note: This is fine to do a copy from FR32 to FR128, this is the // same registers behind the scene and actually why it did not trigger // any bugs before. 
ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc); addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); if (MMO) MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa(Val)) Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); // If this is a store of a simple constant, fold the constant into the store. if (const ConstantInt *CI = dyn_cast(Val)) { unsigned Opc = 0; bool Signed = true; switch (VT.getSimpleVT().SimpleTy) { default: break; case MVT::i1: Signed = false; LLVM_FALLTHROUGH; // Handle as i8. case MVT::i8: Opc = X86::MOV8mi; break; case MVT::i16: Opc = X86::MOV16mi; break; case MVT::i32: Opc = X86::MOV32mi; break; case MVT::i64: // Must be a 32-bit sign extended value. if (isInt<32>(CI->getSExtValue())) Opc = X86::MOV64mi32; break; } if (Opc) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() : CI->getZExtValue()); if (MMO) MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } } unsigned ValReg = getRegForValue(Val); if (ValReg == 0) return false; bool ValKill = hasTrivialKill(Val); return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of /// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. /// ISD::SIGN_EXTEND). bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg) { unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src, /*TODO: Kill=*/false); if (RR == 0) return false; ResultReg = RR; return true; } bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Handle constant address. if (const GlobalValue *GV = dyn_cast(V)) { // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return false; // Can't handle TLS yet. if (GV->isThreadLocal()) return false; + // Can't handle !absolute_symbol references yet. + if (GV->isAbsoluteSymbolRef()) + return false; + // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the // global value into its own register, which we can use as the basereg. if (!Subtarget->isPICStyleRIPRel() || (AM.Base.Reg == 0 && AM.IndexReg == 0)) { // Okay, we've committed to selecting this global. Set up the address. AM.GV = GV; // Allow the subtarget to classify the global. unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); // If this reference is relative to the pic base, set it now. if (isGlobalRelativeToPICBase(GVFlags)) { // FIXME: How do we know Base.Reg is free?? AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } // Unless the ABI requires an extra load, return a direct reference to // the global. if (!isGlobalStubReference(GVFlags)) { if (Subtarget->isPICStyleRIPRel()) { // Use rip-relative addressing if we can. Above we verified that the // base and index registers are unused. assert(AM.Base.Reg == 0 && AM.IndexReg == 0); AM.Base.Reg = X86::RIP; } AM.GVOpFlags = GVFlags; return true; } // Ok, we need to do a load from a stub. 
If we've already loaded from // this stub, reuse the loaded pointer, otherwise emit the load now. DenseMap::iterator I = LocalValueMap.find(V); unsigned LoadReg; if (I != LocalValueMap.end() && I->second != 0) { LoadReg = I->second; } else { // Issue load from stub. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; X86AddressMode StubAM; StubAM.Base.Reg = AM.Base.Reg; StubAM.GV = GV; StubAM.GVOpFlags = GVFlags; // Prepare for inserting code in the local-value area. SavePoint SaveInsertPt = enterLocalValueArea(); if (TLI.getPointerTy(DL) == MVT::i64) { Opc = X86::MOV64rm; RC = &X86::GR64RegClass; if (Subtarget->isPICStyleRIPRel()) StubAM.Base.Reg = X86::RIP; } else { Opc = X86::MOV32rm; RC = &X86::GR32RegClass; } LoadReg = createResultReg(RC); MachineInstrBuilder LoadMI = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); addFullAddress(LoadMI, StubAM); // Ok, back to normal mode. leaveLocalValueArea(SaveInsertPt); // Prevent loading GV stub multiple times in same MBB. LocalValueMap[V] = LoadReg; } // Now construct the final address. Note that the Disp, Scale, // and Index values may already be set here. AM.Base.Reg = LoadReg; AM.GV = nullptr; return true; } } // If all else fails, try to materialize the value in a register. if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { if (AM.Base.Reg == 0) { AM.Base.Reg = getRegForValue(V); return AM.Base.Reg != 0; } if (AM.IndexReg == 0) { assert(AM.Scale == 1 && "Scale with no index!"); AM.IndexReg = getRegForValue(V); return AM.IndexReg != 0; } } return false; } /// X86SelectAddress - Attempt to fill in an address from the given value. /// bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { SmallVector GEPs; redo_gep: const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast(V)) { // Don't walk into other basic blocks; it's possible we haven't // visited them yet, so the instructions may not yet be assigned // virtual registers. if (FuncInfo.StaticAllocaMap.count(static_cast(V)) || FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { Opcode = I->getOpcode(); U = I; } } else if (const ConstantExpr *C = dyn_cast(V)) { Opcode = C->getOpcode(); U = C; } if (PointerType *Ty = dyn_cast(V->getType())) if (Ty->getAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. return false; switch (Opcode) { default: break; case Instruction::BitCast: // Look past bitcasts. return X86SelectAddress(U->getOperand(0), AM); case Instruction::IntToPtr: // Look past no-op inttoptrs. if (TLI.getValueType(DL, U->getOperand(0)->getType()) == TLI.getPointerTy(DL)) return X86SelectAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return X86SelectAddress(U->getOperand(0), AM); break; case Instruction::Alloca: { // Do static allocas. const AllocaInst *A = cast(V); DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(A); if (SI != FuncInfo.StaticAllocaMap.end()) { AM.BaseType = X86AddressMode::FrameIndexBase; AM.Base.FrameIndex = SI->second; return true; } break; } case Instruction::Add: { // Adds of constants are common and easy enough. if (const ConstantInt *CI = dyn_cast(U->getOperand(1))) { uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); // They have to fit in the 32-bit signed displacement field though. 
if (isInt<32>(Disp)) { AM.Disp = (uint32_t)Disp; return X86SelectAddress(U->getOperand(0), AM); } } break; } case Instruction::GetElementPtr: { X86AddressMode SavedAM = AM; // Pattern-match simple GEPs. uint64_t Disp = (int32_t)AM.Disp; unsigned IndexReg = AM.IndexReg; unsigned Scale = AM.Scale; gep_type_iterator GTI = gep_type_begin(U); // Iterate through the indices, folding what we can. Constants can be // folded, and one dynamic index can be handled, if the scale is supported. for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i, ++GTI) { const Value *Op = *i; if (StructType *STy = GTI.getStructTypeOrNull()) { const StructLayout *SL = DL.getStructLayout(STy); Disp += SL->getElementOffset(cast(Op)->getZExtValue()); continue; } // A array/variable index is always of the form i*S where S is the // constant scale size. See if we can push the scale into immediates. uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); for (;;) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. Disp += CI->getSExtValue() * S; break; } if (canFoldAddIntoGEP(U, Op)) { // A compatible add with a constant operand. Fold the constant. ConstantInt *CI = cast(cast(Op)->getOperand(1)); Disp += CI->getSExtValue() * S; // Iterate on the other operand. Op = cast(Op)->getOperand(0); continue; } if (IndexReg == 0 && (!AM.GV || !Subtarget->isPICStyleRIPRel()) && (S == 1 || S == 2 || S == 4 || S == 8)) { // Scaled-index addressing. Scale = S; IndexReg = getRegForGEPIndex(Op).first; if (IndexReg == 0) return false; break; } // Unsupported. goto unsupported_gep; } } // Check for displacement overflow. if (!isInt<32>(Disp)) break; AM.IndexReg = IndexReg; AM.Scale = Scale; AM.Disp = (uint32_t)Disp; GEPs.push_back(V); if (const GetElementPtrInst *GEP = dyn_cast(U->getOperand(0))) { // Ok, the GEP indices were covered by constant-offset and scaled-index // addressing. Update the address state and move on to examining the base. V = GEP; goto redo_gep; } else if (X86SelectAddress(U->getOperand(0), AM)) { return true; } // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing. AM = SavedAM; for (const Value *I : reverse(GEPs)) if (handleConstantAddresses(I, AM)) return true; return false; unsupported_gep: // Ok, the GEP indices weren't all covered. break; } } return handleConstantAddresses(V, AM); } /// X86SelectCallAddress - Attempt to fill in an address from the given value. /// bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; const Instruction *I = dyn_cast(V); // Record if the value is defined in the same basic block. // // This information is crucial to know whether or not folding an // operand is valid. // Indeed, FastISel generates or reuses a virtual register for all // operands of all instructions it selects. Obviously, the definition and // its uses must use the same virtual register otherwise the produced // code is incorrect. // Before instruction selection, FunctionLoweringInfo::set sets the virtual // registers for values that are alive across basic blocks. This ensures // that the values are consistently set between across basic block, even // if different instruction selection mechanisms are used (e.g., a mix of // SDISel and FastISel). 
// For values local to a basic block, the instruction selection process // generates these virtual registers with whatever method is appropriate // for its needs. In particular, FastISel and SDISel do not share the way // local virtual registers are set. // Therefore, this is impossible (or at least unsafe) to share values // between basic blocks unless they use the same instruction selection // method, which is not guarantee for X86. // Moreover, things like hasOneUse could not be used accurately, if we // allow to reference values across basic blocks whereas they are not // alive across basic blocks initially. bool InMBB = true; if (I) { Opcode = I->getOpcode(); U = I; InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); } else if (const ConstantExpr *C = dyn_cast(V)) { Opcode = C->getOpcode(); U = C; } switch (Opcode) { default: break; case Instruction::BitCast: // Look past bitcasts if its operand is in the same BB. if (InMBB) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && TLI.getValueType(DL, U->getOperand(0)->getType()) == TLI.getPointerTy(DL)) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return X86SelectCallAddress(U->getOperand(0), AM); break; } // Handle constant address. if (const GlobalValue *GV = dyn_cast(V)) { // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return false; // RIP-relative addresses can't have additional register operands. if (Subtarget->isPICStyleRIPRel() && (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; // Can't handle TLS. if (const GlobalVariable *GVar = dyn_cast(GV)) if (GVar->isThreadLocal()) return false; // Okay, we've committed to selecting this global. Set up the basic address. AM.GV = GV; // Return a direct reference to the global. Fastisel can handle calls to // functions that require loads, such as dllimport and nonlazybind // functions. if (Subtarget->isPICStyleRIPRel()) { // Use rip-relative addressing if we can. Above we verified that the // base and index registers are unused. assert(AM.Base.Reg == 0 && AM.IndexReg == 0); AM.Base.Reg = X86::RIP; } else { AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr); } return true; } // If all else fails, try to materialize the value in a register. if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { if (AM.Base.Reg == 0) { AM.Base.Reg = getRegForValue(V); return AM.Base.Reg != 0; } if (AM.IndexReg == 0) { assert(AM.Scale == 1 && "Scale with no index!"); AM.IndexReg = getRegForValue(V); return AM.IndexReg != 0; } } return false; } /// X86SelectStore - Select and emit code to implement store instructions. bool X86FastISel::X86SelectStore(const Instruction *I) { // Atomic stores need special handling. const StoreInst *S = cast(I); if (S->isAtomic()) return false; const Value *PtrV = I->getOperand(1); if (TLI.supportSwiftError()) { // Swifterror values can come from either a function parameter with // swifterror attribute or an alloca with swifterror attribute. 
if (const Argument *Arg = dyn_cast(PtrV)) { if (Arg->hasSwiftErrorAttr()) return false; } if (const AllocaInst *Alloca = dyn_cast(PtrV)) { if (Alloca->isSwiftError()) return false; } } const Value *Val = S->getValueOperand(); const Value *Ptr = S->getPointerOperand(); MVT VT; if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; unsigned Alignment = S->getAlignment(); unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = ABIAlignment; bool Aligned = Alignment >= ABIAlignment; X86AddressMode AM; if (!X86SelectAddress(Ptr, AM)) return false; return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. bool X86FastISel::X86SelectRet(const Instruction *I) { const ReturnInst *Ret = cast(I); const Function &F = *I->getParent()->getParent(); const X86MachineFunctionInfo *X86MFInfo = FuncInfo.MF->getInfo(); if (!FuncInfo.CanLowerReturn) return false; if (TLI.supportSwiftError() && F.getAttributes().hasAttrSomewhere(Attribute::SwiftError)) return false; if (TLI.supportSplitCSR(FuncInfo.MF)) return false; CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && CC != CallingConv::X86_64_SysV && CC != CallingConv::Win64) return false; // Don't handle popping bytes if they don't fit the ret's immediate. if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn())) return false; // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; // Let SDISel handle vararg functions. if (F.isVarArg()) return false; // Build a list of return value registers. SmallVector RetRegs; if (Ret->getNumOperands() > 0) { SmallVector Outs; GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); unsigned Reg = getRegForValue(RV); if (Reg == 0) return false; // Only handle a single return value for now. if (ValLocs.size() != 1) return false; CCValAssign &VA = ValLocs[0]; // Don't bother handling odd stuff for now. if (VA.getLocInfo() != CCValAssign::Full) return false; // Only handle register returns for now. if (!VA.isRegLoc()) return false; // The calling-convention tables for x87 returns don't tell // the whole story. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; unsigned SrcReg = Reg + VA.getValNo(); EVT SrcVT = TLI.getValueType(DL, RV->getType()); EVT DstVT = VA.getValVT(); // Special handling for extended integers. if (SrcVT != DstVT) { if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) return false; if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) return false; assert(DstVT == MVT::i32 && "X86 should always ext to i32"); if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; } unsigned Op = Outs[0].Flags.isZExt() ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg, /*TODO: Kill=*/false); } // Make the copy. unsigned DstReg = VA.getLocReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); // Add register to return instruction. RetRegs.push_back(VA.getLocReg()); } // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. if (F.hasStructRetAttr() && CC != CallingConv::Swift) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); RetRegs.push_back(RetReg); } // Now emit the RET. MachineInstrBuilder MIB; if (X86MFInfo->getBytesToPopOnReturn()) { MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL)) .addImm(X86MFInfo->getBytesToPopOnReturn()); } else { MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); } for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) MIB.addReg(RetRegs[i], RegState::Implicit); return true; } /// X86SelectLoad - Select and emit code to implement load instructions. /// bool X86FastISel::X86SelectLoad(const Instruction *I) { const LoadInst *LI = cast(I); // Atomic loads need special handling. if (LI->isAtomic()) return false; const Value *SV = I->getOperand(0); if (TLI.supportSwiftError()) { // Swifterror values can come from either a function parameter with // swifterror attribute or an alloca with swifterror attribute. if (const Argument *Arg = dyn_cast(SV)) { if (Arg->hasSwiftErrorAttr()) return false; } if (const AllocaInst *Alloca = dyn_cast(SV)) { if (Alloca->isSwiftError()) return false; } } MVT VT; if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) return false; const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; if (!X86SelectAddress(Ptr, AM)) return false; unsigned Alignment = LI->getAlignment(); unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = ABIAlignment; unsigned ResultReg = 0; if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, Alignment)) return false; updateValueMap(I, ResultReg); return true; } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { bool HasAVX512 = Subtarget->hasAVX512(); bool HasAVX = Subtarget->hasAVX(); bool X86ScalarSSEf32 = Subtarget->hasSSE1(); bool X86ScalarSSEf64 = Subtarget->hasSSE2(); switch (VT.getSimpleVT().SimpleTy) { default: return 0; case MVT::i8: return X86::CMP8rr; case MVT::i16: return X86::CMP16rr; case MVT::i32: return X86::CMP32rr; case MVT::i64: return X86::CMP64rr; case MVT::f32: return X86ScalarSSEf32 ? (HasAVX512 ? X86::VUCOMISSZrr : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; case MVT::f64: return X86ScalarSSEf64 ? 
(HasAVX512 ? X86::VUCOMISDZrr : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; } } /// If we have a comparison with RHS as the RHS of the comparison, return an /// opcode that works for the compare (e.g. CMP32ri) otherwise return 0. static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { int64_t Val = RHSC->getSExtValue(); switch (VT.getSimpleVT().SimpleTy) { // Otherwise, we can't fold the immediate into this comparison. default: return 0; case MVT::i8: return X86::CMP8ri; case MVT::i16: if (isInt<8>(Val)) return X86::CMP16ri8; return X86::CMP16ri; case MVT::i32: if (isInt<8>(Val)) return X86::CMP32ri8; return X86::CMP32ri; case MVT::i64: if (isInt<8>(Val)) return X86::CMP64ri8; // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext // field. if (isInt<32>(Val)) return X86::CMP64ri32; return 0; } } bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT, const DebugLoc &CurDbgLoc) { unsigned Op0Reg = getRegForValue(Op0); if (Op0Reg == 0) return false; // Handle 'null' like i32/i64 0. if (isa(Op1)) Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext())); // We have two options: compare with register or immediate. If the RHS of // the compare is an immediate that we can fold into this compare, use // CMPri, otherwise use CMPrr. if (const ConstantInt *Op1C = dyn_cast(Op1)) { if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) .addReg(Op0Reg) .addImm(Op1C->getSExtValue()); return true; } } unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); if (CompareOpc == 0) return false; unsigned Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) .addReg(Op0Reg) .addReg(Op1Reg); return true; } bool X86FastISel::X86SelectCmp(const Instruction *I) { const CmpInst *CI = cast(I); MVT VT; if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: { ResultReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), ResultReg); ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; break; } case CmpInst::FCMP_TRUE: { ResultReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), ResultReg).addImm(1); break; } } if (ResultReg) { updateValueMap(I, ResultReg); return true; } const Value *LHS = CI->getOperand(0); const Value *RHS = CI->getOperand(1); // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. // We don't have to materialize a zero constant for this case and can just use // %x again on the RHS. if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { const auto *RHSC = dyn_cast(RHS); if (RHSC && RHSC->isNullValue()) RHS = LHS; } // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
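  // Illustrative note (not in the original source): after UCOMISS/UCOMISD,
  // "ordered equal" is ZF set with PF clear and "unordered not-equal" is ZF
  // clear or PF set, so the table below combines SETE+SETNP with AND8rr for
  // FCMP_OEQ and SETNE+SETP with OR8rr for FCMP_UNE to fold both flag tests
  // into a single i8 result.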
static const uint16_t SETFOpcTable[2][3] = { { X86::SETEr, X86::SETNPr, X86::AND8rr }, { X86::SETNEr, X86::SETPr, X86::OR8rr } }; const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; } ResultReg = createResultReg(&X86::GR8RegClass); if (SETFOpc) { if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), FlagReg1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), FlagReg2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), ResultReg).addReg(FlagReg1).addReg(FlagReg2); updateValueMap(I, ResultReg); return true; } X86::CondCode CC; bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) std::swap(LHS, RHS); // Emit a compare of LHS/RHS. if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectZExt(const Instruction *I) { EVT DstVT = TLI.getValueType(DL, I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; unsigned ResultReg = getRegForValue(I->getOperand(0)); if (ResultReg == 0) return false; // Handle zero-extension from i1 to i8, which is common. MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT == MVT::i1) { // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; if (ResultReg == 0) return false; } if (DstVT == MVT::i64) { // Handle extension to 64-bits via sub-register shenanigans. unsigned MovInst; switch (SrcVT.SimpleTy) { case MVT::i8: MovInst = X86::MOVZX32rr8; break; case MVT::i16: MovInst = X86::MOVZX32rr16; break; case MVT::i32: MovInst = X86::MOV32rr; break; default: llvm_unreachable("Unexpected zext to i64 source type"); } unsigned Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) .addReg(ResultReg); ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(Result32).addImm(X86::sub_32bit); } else if (DstVT == MVT::i16) { // i8->i16 doesn't exist in the autogenerated isel table. Need to zero // extend to 32-bits and then extract down to 16-bits. unsigned Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8), Result32).addReg(ResultReg); ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true, X86::sub_16bit); } else if (DstVT != MVT::i8) { ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) return false; } updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectSExt(const Instruction *I) { EVT DstVT = TLI.getValueType(DL, I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; unsigned ResultReg = getRegForValue(I->getOperand(0)); if (ResultReg == 0) return false; // Handle sign-extension from i1 to i8. 
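  // Illustrative note (not in the original source): the i1 operand is first
  // zero-extended so the register holds 0 or 1, and the NEG8r below turns
  // that into 0 or 0xFF, i.e. the 8-bit sign-extended value, before any
  // wider extension is applied.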
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT == MVT::i1) { // Set the high bits to zero. unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); if (ZExtReg == 0) return false; // Negate the result to make an 8-bit sign extended value. ResultReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r), ResultReg).addReg(ZExtReg); SrcVT = MVT::i8; } if (DstVT == MVT::i16) { // i8->i16 doesn't exist in the autogenerated isel table. Need to sign // extend to 32-bits and then extract down to 16-bits. unsigned Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8), Result32).addReg(ResultReg); ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true, X86::sub_16bit); } else if (DstVT != MVT::i8) { ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) return false; } updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectBranch(const Instruction *I) { // Unconditional branches are selected by tablegen-generated code. // Handle a conditional branch. const BranchInst *BI = cast(I); MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; // Fold the common case of a conditional branch with a comparison // in the same block (values defined on other blocks may not have // initialized registers). X86::CondCode CC; if (const CmpInst *CI = dyn_cast(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType()); // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; } const Value *CmpLHS = CI->getOperand(0); const Value *CmpRHS = CI->getOperand(1); // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, // 0.0. // We don't have to materialize a zero constant for this case and can just // use %x again on the RHS. if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { const auto *CmpRHSC = dyn_cast(CmpRHS); if (CmpRHSC && CmpRHSC->isNullValue()) CmpRHS = CmpLHS; } // Try to take advantage of fallthrough opportunities. if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); Predicate = CmpInst::getInversePredicate(Predicate); } // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition // code check. Instead two branch instructions are required to check all // the flags. First we change the predicate to a supported condition code, // which will be the first branch. Later one we will emit the second // branch. bool NeedExtraBranch = false; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: std::swap(TrueMBB, FalseMBB); LLVM_FALLTHROUGH; case CmpInst::FCMP_UNE: NeedExtraBranch = true; Predicate = CmpInst::FCMP_ONE; break; } bool SwapArgs; unsigned BranchOpc; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) std::swap(CmpLHS, CmpRHS); // Emit a compare of the LHS and RHS, setting the flags. 
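      // Illustrative note (not in the original source): when NeedExtraBranch
      // was set above (FCMP_UNE, or FCMP_OEQ remapped to it), the conditional
      // jump emitted from the primary condition code is followed by an extra
      // JP_1 to TrueMBB, because the parity flag carries the "unordered"
      // outcome that a single condition code cannot express.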
if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); // X86 requires a second branch to handle UNE (and OEQ, which is mapped // to UNE above). if (NeedExtraBranch) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) .addMBB(TrueMBB); } finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which // typically happen for _Bool and C++ bools. MVT SourceVT; if (TI->hasOneUse() && TI->getParent() == I->getParent() && isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { unsigned TestOpc = 0; switch (SourceVT.SimpleTy) { default: break; case MVT::i8: TestOpc = X86::TEST8ri; break; case MVT::i16: TestOpc = X86::TEST16ri; break; case MVT::i32: TestOpc = X86::TEST32ri; break; case MVT::i64: TestOpc = X86::TEST64ri32; break; } if (TestOpc) { unsigned OpReg = getRegForValue(TI->getOperand(0)); if (OpReg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); unsigned JmpOpc = X86::JNE_1; if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); JmpOpc = X86::JE_1; } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. unsigned TmpReg = getRegForValue(BI->getCondition()); if (TmpReg == 0) return false; unsigned BranchOpc = X86::GetCondBranchFromCond(CC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } // Otherwise do a clumsy setcc and re-test it. // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used // in an explicit cast, so make sure to handle that correctly. 
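  // Illustrative note (not in the original source): the fallback below
  // materializes the condition, masks it with TEST8ri against 1, and
  // branches with JNE_1; a condition held in an AVX-512 mask (K) register is
  // first copied into a GPR so that its 8-bit subregister can be tested.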
unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; // In case OpReg is a K register, COPY to a GPR if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) { unsigned KOpReg = OpReg; OpReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), OpReg) .addReg(KOpReg); OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true, X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(OpReg) .addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = nullptr; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; switch (I->getOpcode()) { case Instruction::LShr: OpReg = X86::SHR8rCL; break; case Instruction::AShr: OpReg = X86::SAR8rCL; break; case Instruction::Shl: OpReg = X86::SHL8rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { default: llvm_unreachable("Unexpected shift opcode"); case Instruction::LShr: OpReg = X86::SHR16rCL; break; case Instruction::AShr: OpReg = X86::SAR16rCL; break; case Instruction::Shl: OpReg = X86::SHL16rCL; break; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { default: llvm_unreachable("Unexpected shift opcode"); case Instruction::LShr: OpReg = X86::SHR32rCL; break; case Instruction::AShr: OpReg = X86::SAR32rCL; break; case Instruction::Shl: OpReg = X86::SHL32rCL; break; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { default: llvm_unreachable("Unexpected shift opcode"); case Instruction::LShr: OpReg = X86::SHR64rCL; break; case Instruction::AShr: OpReg = X86::SAR64rCL; break; case Instruction::Shl: OpReg = X86::SHL64rCL; break; } } else { return false; } MVT VT; if (!isTypeLegal(I->getType(), VT)) return false; unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CReg).addReg(Op1Reg); // The shift instruction uses X86::CL. If we defined a super-register // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. if (CReg != X86::CL) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::KILL), X86::CL) .addReg(CReg, RegState::Kill); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) .addReg(Op0Reg); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectDivRem(const Instruction *I) { const static unsigned NumTypes = 4; // i8, i16, i32, i64 const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem const static bool S = true; // IsSigned const static bool U = false; // !IsSigned const static unsigned Copy = TargetOpcode::COPY; // For the X86 DIV/IDIV instruction, in most cases the dividend // (numerator) must be in a specific register pair highreg:lowreg, // producing the quotient in lowreg and the remainder in highreg. 
// For most data types, to set up the instruction, the dividend is // copied into lowreg, and lowreg is sign-extended or zero-extended // into highreg. The exception is i8, where the dividend is defined // as a single register rather than a register pair, and we // therefore directly sign-extend or zero-extend the dividend into // lowreg, instead of copying, and ignore the highreg. const static struct DivRemEntry { // The following portion depends only on the data type. const TargetRegisterClass *RC; unsigned LowInReg; // low part of the register pair unsigned HighInReg; // high part of the register pair // The following portion depends on both the data type and the operation. struct DivRemResult { unsigned OpDivRem; // The specific DIV/IDIV opcode to use. unsigned OpSignExtend; // Opcode for sign-extending lowreg into // highreg, or copying a zero into highreg. unsigned OpCopy; // Opcode for copying dividend into lowreg, or // zero/sign-extending into lowreg for i8. unsigned DivRemResultReg; // Register containing the desired result. bool IsOpSigned; // Whether to use signed or unsigned form. } ResultTable[NumOps]; } OpTable[NumTypes] = { { &X86::GR8RegClass, X86::AX, 0, { { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem } }, // i8 { &X86::GR16RegClass, X86::AX, X86::DX, { { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem } }, // i16 { &X86::GR32RegClass, X86::EAX, X86::EDX, { { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem } }, // i32 { &X86::GR64RegClass, X86::RAX, X86::RDX, { { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem } }, // i64 }; MVT VT; if (!isTypeLegal(I->getType(), VT)) return false; unsigned TypeIndex, OpIndex; switch (VT.SimpleTy) { default: return false; case MVT::i8: TypeIndex = 0; break; case MVT::i16: TypeIndex = 1; break; case MVT::i32: TypeIndex = 2; break; case MVT::i64: TypeIndex = 3; if (!Subtarget->is64Bit()) return false; break; } switch (I->getOpcode()) { default: llvm_unreachable("Unexpected div/rem opcode"); case Instruction::SDiv: OpIndex = 0; break; case Instruction::SRem: OpIndex = 1; break; case Instruction::UDiv: OpIndex = 2; break; case Instruction::URem: OpIndex = 3; break; } const DivRemEntry &TypeEntry = OpTable[TypeIndex]; const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; // Move op0 into low-order input register. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); // Zero-extend or sign-extend into high-order input register. 
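  // For example, an i32 sdiv is emitted roughly as
  //   COPY %eax, %op0 ; CDQ ; IDIV32r %op1   (quotient in %eax, remainder in %edx)
  // while an i32 udiv zeroes %edx via MOV32r0 and uses DIV32r instead.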
if (OpEntry.OpSignExtend) { if (OpEntry.IsOpSigned) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpEntry.OpSignExtend)); else { unsigned Zero32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), Zero32); // Copy the zero into the appropriate sub/super/identical physical // register. Unfortunately the operations needed are not uniform enough // to fit neatly into the table above. if (VT == MVT::i16) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) .addReg(Zero32, 0, X86::sub_16bit); } else if (VT == MVT::i32) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) .addReg(Zero32); } else if (VT == MVT::i64) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); } } } // Generate the DIV/IDIV instruction. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); // For i8 remainder, we can't reference ah directly, as we'll end // up with bogus copies like %r9b = COPY %ah. Reference ax // instead to prevent ah references in a rex instruction. // // The current assumption of the fast register allocator is that isel // won't generate explicit references to the GR8_NOREX registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. unsigned ResultReg = 0; if ((I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem) && OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), SourceSuperReg).addReg(X86::AX); // Shift AX right by 8 bits instead of using AH. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri), ResultSuperReg).addReg(SourceSuperReg).addImm(8); // Now reference the 8-bit subreg of the result. ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, /*Kill=*/true, X86::sub_8bit); } // Copy the result out of the physreg if we haven't already. if (!ResultReg) { ResultReg = createResultReg(TypeEntry.RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) .addReg(OpEntry.DivRemResultReg); } updateValueMap(I, ResultReg); return true; } /// Emit a conditional move instruction (if the are supported) to lower /// the select. bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // Check if the subtarget supports these instructions. if (!Subtarget->hasCMov()) return false; // FIXME: Add support for i8. if (RetVT < MVT::i16 || RetVT > MVT::i64) return false; const Value *Cond = I->getOperand(0); const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); bool NeedTest = true; X86::CondCode CC = X86::COND_NE; // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have // initialized registers). const auto *CI = dyn_cast(Cond); if (CI && (CI->getParent() == I->getParent())) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
static const uint16_t SETFOpcTable[2][3] = { { X86::SETNPr, X86::SETEr , X86::TEST8rr }, { X86::SETPr, X86::SETNEr, X86::OR8rr } }; const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; Predicate = CmpInst::ICMP_NE; break; case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; Predicate = CmpInst::ICMP_NE; break; } bool NeedSwap; std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); const Value *CmpLHS = CI->getOperand(0); const Value *CmpRHS = CI->getOperand(1); if (NeedSwap) std::swap(CmpLHS, CmpRHS); EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags. if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; if (SETFOpc) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), FlagReg1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), FlagReg2); auto const &II = TII.get(SETFOpc[2]); if (II.getNumDefs()) { unsigned TmpReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) .addReg(FlagReg2).addReg(FlagReg1); } else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addReg(FlagReg2).addReg(FlagReg1); } } NeedTest = false; } else if (foldX86XALUIntrinsic(CC, I, Cond)) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. unsigned TmpReg = getRegForValue(Cond); if (TmpReg == 0) return false; NeedTest = false; } if (NeedTest) { // Selects operate on i1, however, CondReg is 8 bits width and may contain // garbage. Indeed, only the less significant bit is supposed to be // accurate. If we read more than the lsb, we may see non-zero values // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for // the select. This is achieved by performing TEST against 1. unsigned CondReg = getRegForValue(Cond); if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); // In case OpReg is a K register, COPY to a GPR if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { unsigned KCondReg = CondReg; CondReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CondReg) .addReg(KCondReg, getKillRegState(CondIsKill)); CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true, X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(CondReg, getKillRegState(CondIsKill)) .addImm(1); } const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); unsigned RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); unsigned LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); if (!LHSReg || !RHSReg) return false; const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8); unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); updateValueMap(I, ResultReg); return true; } /// Emit SSE or AVX instructions to lower the select. /// /// Try to use SSE1/SSE2 instructions to simulate a select without branches. /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary /// SSE instructions are available. If AVX is available, try to use a VBLENDV. 
bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
  // Optimize conditions coming from a compare if both instructions are in the
  // same basic block (values defined in other basic blocks may not have
  // initialized registers).
  const auto *CI = dyn_cast<CmpInst>(I->getOperand(0));
  if (!CI || (CI->getParent() != I->getParent()))
    return false;

  if (I->getType() != CI->getOperand(0)->getType() ||
      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
    return false;

  const Value *CmpLHS = CI->getOperand(0);
  const Value *CmpRHS = CI->getOperand(1);
  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);

  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
  // We don't have to materialize a zero constant for this case and can just
  // use %x again on the RHS.
  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
    if (CmpRHSC && CmpRHSC->isNullValue())
      CmpRHS = CmpLHS;
  }

  unsigned CC;
  bool NeedSwap;
  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
  if (CC > 7 && !Subtarget->hasAVX())
    return false;

  if (NeedSwap)
    std::swap(CmpLHS, CmpRHS);

  // Choose the SSE instruction sequence based on data type (float or double).
  static const uint16_t OpcTable[2][4] = {
    { X86::CMPSSrr,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
    { X86::CMPSDrr,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
  };

  const uint16_t *Opc = nullptr;
  switch (RetVT.SimpleTy) {
  default: return false;
  case MVT::f32: Opc = &OpcTable[0][0]; break;
  case MVT::f64: Opc = &OpcTable[1][0]; break;
  }

  const Value *LHS = I->getOperand(1);
  const Value *RHS = I->getOperand(2);

  unsigned LHSReg = getRegForValue(LHS);
  bool LHSIsKill = hasTrivialKill(LHS);

  unsigned RHSReg = getRegForValue(RHS);
  bool RHSIsKill = hasTrivialKill(RHS);

  unsigned CmpLHSReg = getRegForValue(CmpLHS);
  bool CmpLHSIsKill = hasTrivialKill(CmpLHS);

  unsigned CmpRHSReg = getRegForValue(CmpRHS);
  bool CmpRHSIsKill = hasTrivialKill(CmpRHS);

  if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
    return false;

  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
  unsigned ResultReg;

  if (Subtarget->hasAVX512()) {
    // If we have AVX512 we can use a mask compare and masked movss/sd.
    const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
    const TargetRegisterClass *VK1 = &X86::VK1RegClass;

    unsigned CmpOpcode =
      (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
                                       CmpRHSReg, CmpRHSIsKill, CC);

    // Need an IMPLICIT_DEF for the input that is used to generate the upper
    // bits of the result register since it's not based on any of the inputs.
    unsigned ImplicitDefReg = createResultReg(VR128X);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);

    // Place RHSReg in the passthru of the masked movss/sd operation and put
    // LHS in the input. The mask input comes from the compare.
    unsigned MovOpcode =
      (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
    unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
                                        CmpReg, true, ImplicitDefReg, true,
                                        LHSReg, LHSIsKill);

    ResultReg = createResultReg(RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);

  } else if (Subtarget->hasAVX()) {
    const TargetRegisterClass *VR128 = &X86::VR128RegClass;

    // If we have AVX, create 1 blendv instead of 3 logic instructions.
// Blendv was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. unsigned CmpOpcode = (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; unsigned BlendOpcode = (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr; unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CmpReg, true); ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { const TargetRegisterClass *VR128 = &X86::VR128RegClass; unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true, AndReg, /*IsKill=*/true); ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg); } updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { // These are pseudo CMOV instructions and will be later expanded into control- // flow. unsigned Opc; switch (RetVT.SimpleTy) { default: return false; case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; case MVT::i32: Opc = X86::CMOV_GR32; break; case MVT::f32: Opc = X86::CMOV_FR32; break; case MVT::f64: Opc = X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); X86::CondCode CC = X86::COND_NE; // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have // initialized registers). 
const auto *CI = dyn_cast(Cond); if (CI && (CI->getParent() == I->getParent())) { bool NeedSwap; std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate()); if (CC > X86::LAST_VALID_COND) return false; const Value *CmpLHS = CI->getOperand(0); const Value *CmpRHS = CI->getOperand(1); if (NeedSwap) std::swap(CmpLHS, CmpRHS); EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; } else { unsigned CondReg = getRegForValue(Cond); if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); // In case OpReg is a K register, COPY to a GPR if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { unsigned KCondReg = CondReg; CondReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CondReg) .addReg(KCondReg, getKillRegState(CondIsKill)); CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true, X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(CondReg, getKillRegState(CondIsKill)) .addImm(1); } const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); unsigned LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); unsigned RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); if (!LHSReg || !RHSReg) return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectSelect(const Instruction *I) { MVT RetVT; if (!isTypeLegal(I->getType(), RetVT)) return false; // Check if we can fold the select. if (const auto *CI = dyn_cast(I->getOperand(0))) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); const Value *Opnd = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; } // No need for a select anymore - this is an unconditional move. if (Opnd) { unsigned OpReg = getRegForValue(Opnd); if (OpReg == 0) return false; bool OpIsKill = hasTrivialKill(Opnd); const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(OpReg, getKillRegState(OpIsKill)); updateValueMap(I, ResultReg); return true; } } // First try to use real conditional move instructions. if (X86FastEmitCMoveSelect(RetVT, I)) return true; // Try to use a sequence of SSE instructions to simulate a conditional move. if (X86FastEmitSSESelect(RetVT, I)) return true; // Fall-back to pseudo conditional move instructions, which will be later // converted to control-flow. if (X86FastEmitPseudoSelect(RetVT, I)) return true; return false; } // Common code for X86SelectSIToFP and X86SelectUIToFP. bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) { // The target-independent selection algorithm in FastISel already knows how // to select a SINT_TO_FP if the target is SSE but not AVX. // Early exit if the subtarget doesn't have AVX. // Unsigned conversion requires avx512. bool HasAVX512 = Subtarget->hasAVX512(); if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512)) return false; // TODO: We could sign extend narrower types. 
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT != MVT::i32 && SrcVT != MVT::i64) return false; // Select integer to float/double conversion. unsigned OpReg = getRegForValue(I->getOperand(0)); if (OpReg == 0) return false; unsigned Opcode; static const uint16_t SCvtOpc[2][2][2] = { { { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr }, { X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } }, { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr }, { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } }, }; static const uint16_t UCvtOpc[2][2] = { { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr }, { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr }, }; bool Is64Bit = SrcVT == MVT::i64; if (I->getType()->isDoubleTy()) { // s/uitofp int -> double Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit]; } else if (I->getType()->isFloatTy()) { // s/uitofp int -> float Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit]; } else return false; MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT(); const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT); unsigned ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); unsigned ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectSIToFP(const Instruction *I) { return X86SelectIntToFP(I, /*IsSigned*/true); } bool X86FastISel::X86SelectUIToFP(const Instruction *I) { return X86SelectIntToFP(I, /*IsSigned*/false); } // Helper method used by X86SelectFPExt and X86SelectFPTrunc. bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned TargetOpc, const TargetRegisterClass *RC) { assert((I->getOpcode() == Instruction::FPExt || I->getOpcode() == Instruction::FPTrunc) && "Instruction must be an FPExt or FPTrunc!"); unsigned OpReg = getRegForValue(I->getOperand(0)); if (OpReg == 0) return false; unsigned ImplicitDefReg; if (Subtarget->hasAVX()) { ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); } unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), ResultReg); if (Subtarget->hasAVX()) MIB.addReg(ImplicitDefReg); MIB.addReg(OpReg); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectFPExt(const Instruction *I) { if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && I->getOperand(0)->getType()->isFloatTy()) { bool HasAVX512 = Subtarget->hasAVX512(); // fpext from float to double. unsigned Opc = HasAVX512 ? X86::VCVTSS2SDZrr : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; return X86SelectFPExtOrFPTrunc( I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass); } return false; } bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { if (X86ScalarSSEf64 && I->getType()->isFloatTy() && I->getOperand(0)->getType()->isDoubleTy()) { bool HasAVX512 = Subtarget->hasAVX512(); // fptrunc from double to float. unsigned Opc = HasAVX512 ? X86::VCVTSD2SSZrr : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; return X86SelectFPExtOrFPTrunc( I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass); } return false; } bool X86FastISel::X86SelectTrunc(const Instruction *I) { EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(DL, I->getType()); // This code only handles truncation to byte. 
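  // For example, "%t = trunc i32 %x to i8" needs no arithmetic at all: the
  // result below is just the sub_8bit subregister extract of the source
  // register.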
if (DstVT != MVT::i8 && DstVT != MVT::i1) return false; if (!TLI.isTypeLegal(SrcVT)) return false; unsigned InputReg = getRegForValue(I->getOperand(0)); if (!InputReg) // Unhandled operand. Halt "fast" selection and bail. return false; if (SrcVT == MVT::i8) { // Truncate from i8 to i1; no code needed. updateValueMap(I, InputReg); return true; } // Issue an extract_subreg. unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg, false, X86::sub_8bit); if (!ResultReg) return false; updateValueMap(I, ResultReg); return true; } bool X86FastISel::IsMemcpySmall(uint64_t Len) { return Len <= (Subtarget->is64Bit() ? 32 : 16); } bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len) { // Make sure we don't bloat code by inlining very large memcpy's. if (!IsMemcpySmall(Len)) return false; bool i64Legal = Subtarget->is64Bit(); // We don't care about alignment here since we just emit integer accesses. while (Len) { MVT VT; if (Len >= 8 && i64Legal) VT = MVT::i64; else if (Len >= 4) VT = MVT::i32; else if (Len >= 2) VT = MVT::i16; else VT = MVT::i8; unsigned Reg; bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; Len -= Size; DestAM.Disp += Size; SrcAM.Disp += Size; } return true; } bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. switch (II->getIntrinsicID()) { default: return false; case Intrinsic::convert_from_fp16: case Intrinsic::convert_to_fp16: { if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) return false; const Value *Op = II->getArgOperand(0); unsigned InputReg = getRegForValue(Op); if (InputReg == 0) return false; // F16C only allows converting from float to half and from half to float. bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16; if (IsFloatToHalf) { if (!Op->getType()->isFloatTy()) return false; } else { if (!II->getType()->isFloatTy()) return false; } unsigned ResultReg = 0; const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16); if (IsFloatToHalf) { // 'InputReg' is implicitly promoted from register class FR32 to // register class VR128 by method 'constrainOperandRegClass' which is // directly called by 'fastEmitInst_ri'. // Instruction VCVTPS2PHrr takes an extra immediate operand which is // used to provide rounding control: use MXCSR.RC, encoded as 0b100. // It's consistent with the other FP instructions, which are usually // controlled by MXCSR. InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4); // Move the lower 32-bits of ResultReg to another register of class GR32. ResultReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::VMOVPDI2DIrr), ResultReg) .addReg(InputReg, RegState::Kill); // The result value is in the lower 16-bits of ResultReg. unsigned RegIdx = X86::sub_16bit; ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); } else { assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); // Explicitly sign-extend the input to 32-bit. InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, /*Kill=*/false); // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. 
      InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
                            InputReg, /*Kill=*/true);
      InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);

      // The result value is in the lower 32-bits of ResultReg.
      // Emit an explicit copy from register class VR128 to register class FR32.
      ResultReg = createResultReg(&X86::FR32RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), ResultReg)
          .addReg(InputReg, RegState::Kill);
    }

    updateValueMap(II, ResultReg);
    return true;
  }
  case Intrinsic::frameaddress: {
    MachineFunction *MF = FuncInfo.MF;
    if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
      return false;

    Type *RetTy = II->getCalledFunction()->getReturnType();

    MVT VT;
    if (!isTypeLegal(RetTy, VT))
      return false;

    unsigned Opc;
    const TargetRegisterClass *RC = nullptr;

    switch (VT.SimpleTy) {
    default: llvm_unreachable("Invalid result type for frameaddress.");
    case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
    case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
    }

    // This needs to be set before we call getPtrSizedFrameRegister, otherwise
    // we get the wrong frame register.
    MachineFrameInfo &MFI = MF->getFrameInfo();
    MFI.setFrameAddressIsTaken(true);

    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
    unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
    assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
            (FrameReg == X86::EBP && VT == MVT::i32)) &&
           "Invalid Frame Register!");

    // Always make a copy of the frame register to a vreg first, so that we
    // never directly reference the frame register (the TwoAddressInstruction-
    // Pass doesn't like that).
    unsigned SrcReg = createResultReg(RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);

    // Now recursively load from the frame address.
    // movq (%rbp), %rax
    // movq (%rax), %rax
    // movq (%rax), %rax
    // ...
    unsigned DestReg;
    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
    while (Depth--) {
      DestReg = createResultReg(RC);
      addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                           TII.get(Opc), DestReg), SrcReg);
      SrcReg = DestReg;
    }

    updateValueMap(II, SrcReg);
    return true;
  }
  case Intrinsic::memcpy: {
    const MemCpyInst *MCI = cast<MemCpyInst>(II);
    // Don't handle volatile or variable length memcpys.
    if (MCI->isVolatile())
      return false;

    if (isa<ConstantInt>(MCI->getLength())) {
      // Small memcpy's are common enough that we want to do them
      // without a call if possible.
      uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
      if (IsMemcpySmall(Len)) {
        X86AddressMode DestAM, SrcAM;
        if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
            !X86SelectAddress(MCI->getRawSource(), SrcAM))
          return false;
        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
        return true;
      }
    }

    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
    if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
      return false;

    if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
      return false;

    return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1);
  }
  case Intrinsic::memset: {
    const MemSetInst *MSI = cast<MemSetInst>(II);

    if (MSI->isVolatile())
      return false;

    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
    if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
      return false;

    if (MSI->getDestAddressSpace() > 255)
      return false;

    return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
  }
  case Intrinsic::stackprotector: {
    // Emit code to store the stack guard onto the stack.
EVT PtrTy = TLI.getPointerTy(DL); const Value *Op1 = II->getArgOperand(0); // The guard's value. const AllocaInst *Slot = cast(II->getArgOperand(1)); MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; return true; } case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast(II); X86AddressMode AM; assert(DI->getAddress() && "Null address should be checked earlier!"); if (!X86SelectAddress(DI->getAddress(), AM)) return false; const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); // FIXME may need to add RegState::Debug to any registers produced, // although ESP/EBP should be the only ones at the moment. assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) && "Expected inlined-at fields to agree"); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) .addImm(0) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); return true; } case Intrinsic::trap: { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); return true; } case Intrinsic::sqrt: { if (!Subtarget->hasSSE1()) return false; Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT // is not generated by FastISel yet. // FIXME: Update this code once tablegen can handle it. static const uint16_t SqrtOpc[3][2] = { { X86::SQRTSSr, X86::SQRTSDr }, { X86::VSQRTSSr, X86::VSQRTSDr }, { X86::VSQRTSSZr, X86::VSQRTSDZr }, }; unsigned AVXLevel = Subtarget->hasAVX512() ? 2 : Subtarget->hasAVX() ? 1 : 0; unsigned Opc; switch (VT.SimpleTy) { default: return false; case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break; case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break; } const Value *SrcVal = II->getArgOperand(0); unsigned SrcReg = getRegForValue(SrcVal); if (SrcReg == 0) return false; const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned ImplicitDefReg = 0; if (AVXLevel > 0) { ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); } unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); if (ImplicitDefReg) MIB.addReg(ImplicitDefReg); MIB.addReg(SrcReg); updateValueMap(II, ResultReg); return true; } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::usub_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: { // This implements the basic lowering of the xalu with overflow intrinsics // into add/sub/mul followed by either seto or setb. const Function *Callee = II->getCalledFunction(); auto *Ty = cast(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); assert(Ty->getTypeAtIndex(1)->isIntegerTy() && Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 && "Overflow value expected to be an i1"); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; if (VT < MVT::i8 || VT > MVT::i64) return false; const Value *LHS = II->getArgOperand(0); const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. 
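    // (Keeping a constant operand on the RHS lets the fastEmit_ri path below
    // pick an immediate form, e.g. ADD32ri for @llvm.uadd.with.overflow.i32
    // with a constant argument.)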
    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
        isCommutativeIntrinsic(II))
      std::swap(LHS, RHS);

    bool UseIncDec = false;
    if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
      UseIncDec = true;

    unsigned BaseOpc, CondOpc;
    switch (II->getIntrinsicID()) {
    default: llvm_unreachable("Unexpected intrinsic!");
    case Intrinsic::sadd_with_overflow:
      BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
      CondOpc = X86::SETOr;
      break;
    case Intrinsic::uadd_with_overflow:
      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
    case Intrinsic::ssub_with_overflow:
      BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
      CondOpc = X86::SETOr;
      break;
    case Intrinsic::usub_with_overflow:
      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
    case Intrinsic::smul_with_overflow:
      BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
    case Intrinsic::umul_with_overflow:
      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
    }

    unsigned LHSReg = getRegForValue(LHS);
    if (LHSReg == 0)
      return false;
    bool LHSIsKill = hasTrivialKill(LHS);

    unsigned ResultReg = 0;
    // Check if we have an immediate version.
    if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
      static const uint16_t Opc[2][4] = {
        { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
        { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
      };

      if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
        ResultReg = createResultReg(TLI.getRegClassFor(VT));
        bool IsDec = BaseOpc == X86ISD::DEC;
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
          .addReg(LHSReg, getKillRegState(LHSIsKill));
      } else
        ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
                                CI->getZExtValue());
    }

    unsigned RHSReg;
    bool RHSIsKill;
    if (!ResultReg) {
      RHSReg = getRegForValue(RHS);
      if (RHSReg == 0)
        return false;
      RHSIsKill = hasTrivialKill(RHS);
      ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
                              RHSIsKill);
    }

    // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
    // it manually.
    if (BaseOpc == X86ISD::UMUL && !ResultReg) {
      static const uint16_t MULOpc[] =
        { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
      static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
      // First copy the first operand into RAX, which is an implicit input to
      // the X86::MUL*r instruction.
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
        .addReg(LHSReg, getKillRegState(LHSIsKill));
      ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
    } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
      static const uint16_t MULOpc[] =
        { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
      if (VT == MVT::i8) {
        // Copy the first operand into AL, which is an implicit input to the
        // X86::IMUL8r instruction.
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(TargetOpcode::COPY), X86::AL)
          .addReg(LHSReg, getKillRegState(LHSIsKill));
        ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
                                   RHSIsKill);
      } else
        ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
                                    TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
                                    RHSReg, RHSIsKill);
    }

    if (!ResultReg)
      return false;

    // Assign to a GPR since the overflow return value is lowered to a SETcc.
unsigned ResultReg2 = createResultReg(&X86::GR8RegClass); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); updateValueMap(II, ResultReg, 2); return true; } case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: { bool IsInputDouble; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic."); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: if (!Subtarget->hasSSE1()) return false; IsInputDouble = false; break; case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: if (!Subtarget->hasSSE2()) return false; IsInputDouble = true; break; } Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; static const uint16_t CvtOpc[3][2][2] = { { { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr }, { X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } }, { { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr }, { X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } }, { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr }, { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } }, }; unsigned AVXLevel = Subtarget->hasAVX512() ? 2 : Subtarget->hasAVX() ? 1 : 0; unsigned Opc; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected result type."); case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break; case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break; } // Check if we can fold insertelement instructions into the convert. const Value *Op = II->getArgOperand(0); while (auto *IE = dyn_cast(Op)) { const Value *Index = IE->getOperand(2); if (!isa(Index)) break; unsigned Idx = cast(Index)->getZExtValue(); if (Idx == 0) { Op = IE->getOperand(1); break; } Op = IE->getOperand(0); } unsigned Reg = getRegForValue(Op); if (Reg == 0) return false; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Reg); updateValueMap(II, ResultReg); return true; } } } bool X86FastISel::fastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; const Function *F = FuncInfo.Fn; if (F->isVarArg()) return false; CallingConv::ID CC = F->getCallingConv(); if (CC != CallingConv::C) return false; if (Subtarget->isCallingConvWin64(CC)) return false; if (!Subtarget->is64Bit()) return false; if (Subtarget->useSoftFloat()) return false; // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. 
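  // The registers assumed below follow the SysV x86-64 convention: integer
  // arguments in RDI, RSI, RDX, RCX, R8, R9 (or their 32-bit halves) and
  // floating-point arguments in XMM0-XMM7.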
unsigned GPRCnt = 0; unsigned FPRCnt = 0; for (auto const &Arg : F->args()) { if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || Arg.hasAttribute(Attribute::StructRet) || Arg.hasAttribute(Attribute::SwiftSelf) || Arg.hasAttribute(Attribute::SwiftError) || Arg.hasAttribute(Attribute::Nest)) return false; Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; EVT ArgVT = TLI.getValueType(DL, ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { default: return false; case MVT::i32: case MVT::i64: ++GPRCnt; break; case MVT::f32: case MVT::f64: if (!Subtarget->hasSSE1()) return false; ++FPRCnt; break; } if (GPRCnt > 6) return false; if (FPRCnt > 8) return false; } static const MCPhysReg GPR32ArgRegs[] = { X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D }; static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned SrcReg; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type."); case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; case MVT::f32: LLVM_FALLTHROUGH; case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); updateValueMap(&Arg, ResultReg); } return true; } static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, CallingConv::ID CC, ImmutableCallSite *CS) { if (Subtarget->is64Bit()) return 0; if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE) return 0; if (CS) if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) || CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU()) return 0; return 4; } bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { auto &OutVals = CLI.OutVals; auto &OutFlags = CLI.OutFlags; auto &OutRegs = CLI.OutRegs; auto &Ins = CLI.Ins; auto &InRegs = CLI.InRegs; CallingConv::ID CC = CLI.CallConv; bool &IsTailCall = CLI.IsTailCall; bool IsVarArg = CLI.IsVarArg; const Value *Callee = CLI.Callee; MCSymbol *Symbol = CLI.Symbol; bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CC); const CallInst *CI = CLI.CS ? dyn_cast(CLI.CS->getInstruction()) : nullptr; const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr; // Call / invoke instructions with NoCfCheck attribute require special // handling. const auto *II = CLI.CS ? dyn_cast(CLI.CS->getInstruction()) : nullptr; if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck())) return false; // Functions with no_caller_saved_registers that need special handling. 
  if ((CI && CI->hasFnAttr("no_caller_saved_registers")) ||
      (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
    return false;

  // Functions using retpoline should use SDISel for calls.
  if (Subtarget->useRetpoline())
    return false;

  // Handle only C, fastcc, and webkit_js calling conventions for now.
  switch (CC) {
  default: return false;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::WebKit_JS:
  case CallingConv::Swift:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
    break;
  }

  // Allow SelectionDAG isel to handle tail calls.
  if (IsTailCall)
    return false;

  // fastcc with -tailcallopt is intended to provide a guaranteed
  // tail call optimization. Fastisel doesn't know how to do that.
  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
    return false;

  // Don't know how to handle Win64 varargs yet. Nothing special needed for
  // x86-32. Special handling for x86-64 is implemented.
  if (IsVarArg && IsWin64)
    return false;

  // Don't know about inalloca yet.
  if (CLI.CS && CLI.CS->hasInAllocaArgument())
    return false;

  for (auto Flag : CLI.OutFlags)
    if (Flag.isSwiftError())
      return false;

  SmallVector<MVT, 16> OutVTs;
  SmallVector<unsigned, 16> ArgRegs;

  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
  // instruction. This is safe because it is common to all FastISel supported
  // calling conventions on x86.
  for (int i = 0, e = OutVals.size(); i != e; ++i) {
    Value *&Val = OutVals[i];
    ISD::ArgFlagsTy Flags = OutFlags[i];
    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
      if (CI->getBitWidth() < 32) {
        if (Flags.isSExt())
          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
        else
          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
      }
    }

    // Passing bools around ends up doing a trunc to i1 and passing it.
    // Codegen this as an argument + "and 1".
    MVT VT;
    auto *TI = dyn_cast<TruncInst>(Val);
    unsigned ResultReg;
    if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
        (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
        TI->hasOneUse()) {
      Value *PrevVal = TI->getOperand(0);
      ResultReg = getRegForValue(PrevVal);

      if (!ResultReg)
        return false;

      if (!isTypeLegal(PrevVal->getType(), VT))
        return false;

      ResultReg =
        fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
    } else {
      if (!isTypeLegal(Val->getType(), VT))
        return false;
      ResultReg = getRegForValue(Val);
    }

    if (!ResultReg)
      return false;

    ArgRegs.push_back(ResultReg);
    OutVTs.push_back(VT);
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());

  // Allocate shadow area for Win64
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();

  // Issue CALLSEQ_START
  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
    .addImm(NumBytes).addImm(0).addImm(0);

  // Walk the register/memloc assignments, inserting copies/loads.
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign const &VA = ArgLocs[i];
    const Value *ArgVal = OutVals[VA.getValNo()];
    MVT ArgVT = OutVTs[VA.getValNo()];

    if (ArgVT == MVT::x86mmx)
      return false;

    unsigned ArgReg = ArgRegs[VA.getValNo()];

    // Promote the value if needed.
switch (VA.getLocInfo()) { case CCValAssign::Full: break; case CCValAssign::SExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); if (ArgVT == MVT::i1) return false; bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); assert(Emitted && "Failed to emit a sext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::ZExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); // Handle zero-extension from i1 to i8, which is common. if (ArgVT == MVT::i1) { // Set the high bits to zero. ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false); ArgVT = MVT::i8; if (ArgReg == 0) return false; } bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); assert(Emitted && "Failed to emit a zext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::AExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); if (!Emitted) Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); if (!Emitted) Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); assert(Emitted && "Failed to emit a aext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::BCvt: { ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, /*TODO: Kill=*/false); assert(ArgReg && "Failed to emit a bitcast!"); ArgVT = VA.getLocVT(); break; } case CCValAssign::VExt: // VExt has not been implemented, so this should be impossible to reach // for now. However, fallback to Selection DAG isel once implemented. return false; case CCValAssign::AExtUpper: case CCValAssign::SExtUpper: case CCValAssign::ZExtUpper: case CCValAssign::FPExt: llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully // support this. return false; } if (VA.isRegLoc()) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); OutRegs.push_back(VA.getLocReg()); } else { assert(VA.isMemLoc()); // Don't emit stores for undef values. if (isa(ArgVal)) continue; unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; SrcAM.Base.Reg = ArgReg; if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) return false; } else if (isa(ArgVal) || isa(ArgVal)) { // If this is a really simple value, emit this with the Value* version // of X86FastEmitStore. If it isn't simple, we don't want to do this, // as it can cause us to reevaluate the argument. if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) return false; } else { bool ValIsKill = hasTrivialKill(ArgVal); if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) return false; } } } // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. 
if (Subtarget->isPICStyleGOT()) { unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); } if (Is64Bit && IsVarArg && !IsWin64) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), X86::AL).addImm(NumXMMRegs); } // Materialize callee address in a register. FIXME: GV address can be // handled with a CALLpcrel32 instead. X86AddressMode CalleeAM; if (!X86SelectCallAddress(Callee, CalleeAM)) return false; unsigned CalleeOp = 0; const GlobalValue *GV = nullptr; if (CalleeAM.GV != nullptr) { GV = CalleeAM.GV; } else if (CalleeAM.Base.Reg != 0) { CalleeOp = CalleeAM.Base.Reg; } else return false; // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { // Register-indirect call. unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) .addReg(CalleeOp); } else { // Direct call. assert(GV && "Not a direct call"); // See if we need any target-specific flags on the GV operand. unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV); // This will be a direct call, or an indirect call through memory for // NonLazyBind calls or dllimport calls. bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL; unsigned CallOpc = NeedLoad ? (Is64Bit ? X86::CALL64m : X86::CALL32m) : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); if (NeedLoad) MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0); if (Symbol) MIB.addSym(Symbol, OpFlags); else MIB.addGlobalAddress(GV, 0, OpFlags); if (NeedLoad) MIB.addReg(0); } // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX, RegState::Implicit); if (Is64Bit && IsVarArg && !IsWin64) MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. for (auto Reg : OutRegs) MIB.addReg(Reg, RegState::Implicit); // Issue CALLSEQ_END unsigned NumBytesForCalleeToPop = X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, TM.Options.GuaranteedTailCallOpt) ? NumBytes // Callee pops everything. : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS); unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(NumBytesForCalleeToPop); // Now handle call return values. 
SmallVector RVLocs; CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, CLI.RetTy->getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; unsigned SrcReg = VA.getLocReg(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; CopyReg = createResultReg(&X86::RFP80RegClass); } // Copy out the result. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg); InRegs.push_back(VA.getLocReg()); // Round the f80 to the right size, which also moves it to the appropriate // xmm register. This is accomplished by storing the f80 value in memory // and then loading it back. if (CopyVT != VA.getValVT()) { EVT ResVT = VA.getValVT(); unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)), FI) .addReg(CopyReg); Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg + i), FI); } } CLI.ResultReg = ResultReg; CLI.NumResultRegs = RVLocs.size(); CLI.Call = MIB; return true; } bool X86FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; case Instruction::Load: return X86SelectLoad(I); case Instruction::Store: return X86SelectStore(I); case Instruction::Ret: return X86SelectRet(I); case Instruction::ICmp: case Instruction::FCmp: return X86SelectCmp(I); case Instruction::ZExt: return X86SelectZExt(I); case Instruction::SExt: return X86SelectSExt(I); case Instruction::Br: return X86SelectBranch(I); case Instruction::LShr: case Instruction::AShr: case Instruction::Shl: return X86SelectShift(I); case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: case Instruction::URem: return X86SelectDivRem(I); case Instruction::Select: return X86SelectSelect(I); case Instruction::Trunc: return X86SelectTrunc(I); case Instruction::FPExt: return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); case Instruction::SIToFP: return X86SelectSIToFP(I); case Instruction::UIToFP: return X86SelectUIToFP(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(DL, I->getType()); if (DstVT.bitsGT(SrcVT)) return X86SelectZExt(I); if (DstVT.bitsLT(SrcVT)) return X86SelectTrunc(I); unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; updateValueMap(I, Reg); return true; } case Instruction::BitCast: { // Select SSE2/AVX bitcasts between 128/256 bit vector types. 
if (!Subtarget->hasSSE2()) return false; EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(DL, I->getType()); if (!SrcVT.isSimple() || !DstVT.isSimple()) return false; MVT SVT = SrcVT.getSimpleVT(); MVT DVT = DstVT.getSimpleVT(); if (!SVT.is128BitVector() && !(Subtarget->hasAVX() && SVT.is256BitVector()) && !(Subtarget->hasAVX512() && SVT.is512BitVector() && (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 && DVT.getScalarSizeInBits() >= 32)))) return false; unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; // No instruction is needed for conversion. Reuse the register used by // the fist operand. updateValueMap(I, Reg); return true; } } return false; } unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { if (VT > MVT::i64) return 0; uint64_t Imm = CI->getZExtValue(); if (Imm == 0) { unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: case MVT::i8: return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, X86::sub_8bit); case MVT::i16: return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, X86::sub_16bit); case MVT::i32: return SrcReg; case MVT::i64: { unsigned ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); return ResultReg; } } } unsigned Opc = 0; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: // TODO: Support this properly. if (Subtarget->hasAVX512()) return 0; VT = MVT::i8; LLVM_FALLTHROUGH; case MVT::i8: Opc = X86::MOV8ri; break; case MVT::i16: Opc = X86::MOV16ri; break; case MVT::i32: Opc = X86::MOV32ri; break; case MVT::i64: { if (isUInt<32>(Imm)) Opc = X86::MOV32ri; else if (isInt<32>(Imm)) Opc = X86::MOV64ri32; else Opc = X86::MOV64ri; break; } } if (VT == MVT::i64 && Opc == X86::MOV32ri) { unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); unsigned ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); return ResultReg; } return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { if (CFP->isNullValue()) return fastMaterializeFloatZero(CFP); // Can't handle alternate code models yet. CodeModel::Model CM = TM.getCodeModel(); if (CM != CodeModel::Small && CM != CodeModel::Large) return 0; // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX512() ? X86::VMOVSSZrm : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX512() ? X86::VMOVSDZrm : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; RC = &X86::RFP64RegClass; } break; case MVT::f80: // No f80 support yet. return 0; } // MachineConstantPool wants an explicit alignment. 
unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); if (Align == 0) { // Alignment of vector types. FIXME! Align = DL.getTypeAllocSize(CFP->getType()); } // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr); if (OpFlag == X86II::MO_PIC_BASE_OFFSET) PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); else if (OpFlag == X86II::MO_GOTOFF) PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small) PICBase = X86::RIP; // Create the load from the constant pool. unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); unsigned ResultReg = createResultReg(RC); if (CM == CodeModel::Large) { unsigned AddrReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), AddrReg) .addConstantPoolIndex(CPI, 0, OpFlag); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(*FuncInfo.MF), MachineMemOperand::MOLoad, DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), CPI, PICBase, OpFlag); return ResultReg; } unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return 0; // Materialize addresses with LEA/MOV instructions. X86AddressMode AM; if (X86SelectAddress(GV, AM)) { // If the expression is just a basereg, then we're done, otherwise we need // to emit an LEA. if (AM.BaseType == X86AddressMode::RegBase && AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); if (TM.getRelocationModel() == Reloc::Static && TLI.getPointerTy(DL) == MVT::i64) { // The displacement code could be more than 32 bits away so we need to use // an instruction with a 64 bit immediate BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), ResultReg) .addGlobalAddress(GV); } else { unsigned Opc = TLI.getPointerTy(DL) == MVT::i32 ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) : X86::LEA64r; addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); } return ResultReg; } return 0; } unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); if (const auto *CI = dyn_cast(C)) return X86MaterializeInt(CI, VT); else if (const ConstantFP *CFP = dyn_cast(C)) return X86MaterializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast(C)) return X86MaterializeGV(GV, VT); return 0; } unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { // Fail on dynamic allocas. At this point, getRegForValue has already // checked its CSE maps, so if we're here trying to handle a dynamic // alloca, we're not going to succeed. X86SelectAddress has a // check for dynamic allocas, because it's called directly from // various places, but targetMaterializeAlloca also needs a check // in order to avoid recursion between getRegForValue, // X86SelectAddrss, and targetMaterializeAlloca. 
if (!FuncInfo.StaticAllocaMap.count(C)) return 0; assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; unsigned Opc = TLI.getPointerTy(DL) == MVT::i32 ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) : X86::LEA64r; const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); return ResultReg; } unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { MVT VT; if (!isTypeLegal(CF->getType(), VT)) return 0; // Get opcode and regclass for the given zero. bool HasAVX512 = Subtarget->hasAVX512(); unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: if (X86ScalarSSEf32) { Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS; RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; } else { Opc = X86::LD_Fp032; RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD; RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; } else { Opc = X86::LD_Fp064; RC = &X86::RFP64RegClass; } break; case MVT::f80: // No f80 support yet. return 0; } unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); return ResultReg; } bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) { const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; if (!X86SelectAddress(Ptr, AM)) return false; const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = DL.getABITypeAlignment(LI->getType()); SmallVector AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl( *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment, /*AllowCommute=*/true); if (!Result) return false; // The index register could be in the wrong register class. Unfortunately, // foldMemoryOperandImpl could have commuted the instruction so its not enough // to just look at OpNo + the offset to the index reg. We actually need to // scan the instruction to find the index reg and see if its the correct reg // class. unsigned OperandNo = 0; for (MachineInstr::mop_iterator I = Result->operands_begin(), E = Result->operands_end(); I != E; ++I, ++OperandNo) { MachineOperand &MO = *I; if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg) continue; // Found the index reg, now try to rewrite it. 
unsigned IndexReg = constrainOperandRegClass(Result->getDesc(), MO.getReg(), OperandNo); if (IndexReg == MO.getReg()) continue; MO.setReg(IndexReg); } Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); MI->eraseFromParent(); return true; } unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, unsigned Op2, bool Op2IsKill, unsigned Op3, bool Op3IsKill) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2); Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3); if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, getKillRegState(Op0IsKill)) .addReg(Op1, getKillRegState(Op1IsKill)) .addReg(Op2, getKillRegState(Op2IsKill)) .addReg(Op3, getKillRegState(Op3IsKill)); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addReg(Op0, getKillRegState(Op0IsKill)) .addReg(Op1, getKillRegState(Op1IsKill)) .addReg(Op2, getKillRegState(Op2IsKill)) .addReg(Op3, getKillRegState(Op3IsKill)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } namespace llvm { FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) { return new X86FastISel(funcInfo, libInfo); } } Index: vendor/llvm/dist-release_70/test/CodeGen/AArch64/f16-instructions.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/AArch64/f16-instructions.ll (revision 337298) +++ vendor/llvm/dist-release_70/test/CodeGen/AArch64/f16-instructions.ll (revision 337299) @@ -1,1157 +1,1187 @@ ; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -disable-fp-elim | FileCheck %s --check-prefix=CHECK-CVT --check-prefix=CHECK-COMMON ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fullfp16 -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -disable-fp-elim | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-FP16 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" ; CHECK-CVT-LABEL: test_fadd: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fadd: ; CHECK-FP16-NEXT: fadd h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_fadd(half %a, half %b) #0 { %r = fadd half %a, %b ret half %r } ; CHECK-CVT-LABEL: test_fsub: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fsub s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fsub: ; CHECK-FP16-NEXT: fsub h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_fsub(half %a, half %b) #0 { %r = fsub half %a, %b ret half %r } ; CHECK-CVT-LABEL: test_fmul: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fmul s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fmul: ; CHECK-FP16-NEXT: fmul h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_fmul(half %a, half %b) #0 { %r = fmul half %a, %b ret half %r } ; CHECK-CVT-LABEL: test_fdiv: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; 
CHECK-CVT-NEXT: fdiv s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fdiv: ; CHECK-FP16-NEXT: fdiv h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_fdiv(half %a, half %b) #0 { %r = fdiv half %a, %b ret half %r } ; CHECK-COMMON-LABEL: test_frem: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: fcvt s1, h1 ; CHECK-COMMON-NEXT: bl {{_?}}fmodf ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_frem(half %a, half %b) #0 { %r = frem half %a, %b ret half %r } ; CHECK-COMMON-LABEL: test_store: ; CHECK-COMMON-NEXT: str h0, [x0] ; CHECK-COMMON-NEXT: ret define void @test_store(half %a, half* %b) #0 { store half %a, half* %b ret void } ; CHECK-COMMON-LABEL: test_load: ; CHECK-COMMON-NEXT: ldr h0, [x0] ; CHECK-COMMON-NEXT: ret define half @test_load(half* %a) #0 { %r = load half, half* %a ret half %r } declare half @test_callee(half %a, half %b) #0 ; CHECK-COMMON-LABEL: test_call: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: bl {{_?}}test_callee ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_call(half %a, half %b) #0 { %r = call half @test_callee(half %a, half %b) ret half %r } ; CHECK-COMMON-LABEL: test_call_flipped: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: mov.16b v2, v0 ; CHECK-COMMON-NEXT: mov.16b v0, v1 ; CHECK-COMMON-NEXT: mov.16b v1, v2 ; CHECK-COMMON-NEXT: bl {{_?}}test_callee ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_call_flipped(half %a, half %b) #0 { %r = call half @test_callee(half %b, half %a) ret half %r } ; CHECK-COMMON-LABEL: test_tailcall_flipped: ; CHECK-COMMON-NEXT: mov.16b v2, v0 ; CHECK-COMMON-NEXT: mov.16b v0, v1 ; CHECK-COMMON-NEXT: mov.16b v1, v2 ; CHECK-COMMON-NEXT: b {{_?}}test_callee define half @test_tailcall_flipped(half %a, half %b) #0 { %r = tail call half @test_callee(half %b, half %a) ret half %r } ; CHECK-CVT-LABEL: test_select: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: cmp w0, #0 ; CHECK-CVT-NEXT: fcsel s0, s0, s1, ne ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_select: ; CHECK-FP16-NEXT: cmp w0, #0 ; CHECK-FP16-NEXT: fcsel h0, h0, h1, ne ; CHECK-FP16-NEXT: ret define half @test_select(half %a, half %b, i1 zeroext %c) #0 { %r = select i1 %c, half %a, half %b ret half %r } ; CHECK-CVT-LABEL: test_select_cc: ; CHECK-CVT-DAG: fcvt s3, h3 ; CHECK-CVT-DAG: fcvt s2, h2 ; CHECK-CVT-DAG: fcvt s1, h1 ; CHECK-CVT-DAG: fcvt s0, h0 ; CHECK-CVT-DAG: fcmp s2, s3 ; CHECK-CVT-DAG: cset [[CC:w[0-9]+]], ne ; CHECK-CVT-DAG: cmp [[CC]], #0 ; CHECK-CVT-NEXT: fcsel s0, s0, s1, ne ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_select_cc: ; CHECK-FP16-NEXT: fcmp h2, h3 ; CHECK-FP16-NEXT: fcsel h0, h0, h1, ne ; CHECK-FP16-NEXT: ret define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { %cc = fcmp une half %c, %d %r = select i1 %cc, half %a, half %b ret half %r } ; CHECK-CVT-LABEL: test_select_cc_f32_f16: ; CHECK-CVT-DAG: fcvt s2, h2 ; CHECK-CVT-DAG: fcvt s3, h3 ; CHECK-CVT-NEXT: fcmp s2, s3 ; CHECK-CVT-NEXT: fcsel s0, s0, s1, ne ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_select_cc_f32_f16: ; CHECK-FP16-NEXT: fcmp h2, h3 ; CHECK-FP16-NEXT: fcsel s0, s0, s1, ne ; CHECK-FP16-NEXT: ret 
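; In test_select_cc_f32_f16 the selected values are already floats, so on the
; CHECK-CVT path only the half compare operands (h2, h3) are extended before
; the fcmp while s0/s1 are used directly; with +fullfp16 the compare runs on
; the h registers themselves.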
define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { %cc = fcmp une half %c, %d %r = select i1 %cc, float %a, float %b ret float %r } ; CHECK-CVT-LABEL: test_select_cc_f16_f32: ; CHECK-CVT-DAG: fcvt s0, h0 ; CHECK-CVT-DAG: fcvt s1, h1 ; CHECK-CVT-DAG: fcmp s2, s3 ; CHECK-CVT-DAG: cset w8, ne ; CHECK-CVT-NEXT: cmp w8, #0 ; CHECK-CVT-NEXT: fcsel s0, s0, s1, ne ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_select_cc_f16_f32: ; CHECK-FP16-NEXT: fcmp s2, s3 ; CHECK-FP16-NEXT: fcsel h0, h0, h1, ne ; CHECK-FP16-NEXT: ret define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { %cc = fcmp une float %c, %d %r = select i1 %cc, half %a, half %b ret half %r } ; CHECK-CVT-LABEL: test_fcmp_une: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, ne ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_une: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, ne ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_une(half %a, half %b) #0 { %r = fcmp une half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ueq: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset [[TRUE:w[0-9]+]], eq ; CHECK-CVT-NEXT: csinc w0, [[TRUE]], wzr, vc ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ueq: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset [[TRUE:w[0-9]+]], eq ; CHECK-FP16-NEXT: csinc w0, [[TRUE]], wzr, vc ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ueq(half %a, half %b) #0 { %r = fcmp ueq half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ugt: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, hi ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ugt: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, hi ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ugt(half %a, half %b) #0 { %r = fcmp ugt half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_uge: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, pl ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_uge: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, pl ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_uge(half %a, half %b) #0 { %r = fcmp uge half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ult: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, lt ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ult: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, lt ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ult(half %a, half %b) #0 { %r = fcmp ult half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ule: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, le ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ule: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, le ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ule(half %a, half %b) #0 { %r = fcmp ule half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_uno: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, vs ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_uno: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, vs ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_uno(half %a, half %b) #0 { %r = fcmp uno half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: 
test_fcmp_one: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset [[TRUE:w[0-9]+]], mi ; CHECK-CVT-NEXT: csinc w0, [[TRUE]], wzr, le ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_one: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset [[TRUE:w[0-9]+]], mi ; CHECK-FP16-NEXT: csinc w0, [[TRUE]], wzr, le ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_one(half %a, half %b) #0 { %r = fcmp one half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_oeq: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, eq ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_oeq: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, eq ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_oeq(half %a, half %b) #0 { %r = fcmp oeq half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ogt: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, gt ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ogt: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, gt ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ogt(half %a, half %b) #0 { %r = fcmp ogt half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_oge: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, ge ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_oge: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, ge ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_oge(half %a, half %b) #0 { %r = fcmp oge half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_olt: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, mi ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_olt: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, mi ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_olt(half %a, half %b) #0 { %r = fcmp olt half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ole: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, ls ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ole: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, ls ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ole(half %a, half %b) #0 { %r = fcmp ole half %a, %b ret i1 %r } ; CHECK-CVT-LABEL: test_fcmp_ord: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w0, vc ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fcmp_ord: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: cset w0, vc ; CHECK-FP16-NEXT: ret define i1 @test_fcmp_ord(half %a, half %b) #0 { %r = fcmp ord half %a, %b ret i1 %r } +; CHECK-COMMON-LABEL: test_fccmp: +; CHECK-CVT: fcvt s0, h0 +; CHECK-CVT-NEXT: fmov s1, #8.00000000 +; CHECK-CVT-NEXT: fmov s2, #5.00000000 +; CHECK-CVT-NEXT: fcmp s0, s1 +; CHECK-CVT-NEXT: cset w8, gt +; CHECK-CVT-NEXT: fcmp s0, s2 +; CHECK-CVT-NEXT: cset w9, mi +; CHECK-CVT-NEXT: tst w8, w9 +; CHECK-CVT-NEXT: fcsel s0, s0, s2, ne +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: str h0, [x0] +; CHECK-CVT-NEXT: ret +; CHECK-FP16: fmov h1, #5.00000000 +; CHECK-FP16-NEXT: fcmp h0, h1 +; CHECK-FP16-NEXT: fmov h2, #8.00000000 +; CHECK-FP16-NEXT: fccmp h0, h2, #4, mi +; CHECK-FP16-NEXT: fcsel h0, h0, h1, gt +; CHECK-FP16-NEXT: str h0, [x0] +; CHECK-FP16-NEXT: ret + +define void @test_fccmp(half %in, half* %out) { + %cmp1 = fcmp ogt half %in, 0xH4800 + %cmp2 = fcmp olt 
half %in, 0xH4500 + %cond = and i1 %cmp1, %cmp2 + %result = select i1 %cond, half %in, half 0xH4500 + store half %result, half* %out + ret void +} + ; CHECK-CVT-LABEL: test_br_cc: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: b.mi [[BRCC_ELSE:.?LBB[0-9_]+]] ; CHECK-CVT-NEXT: str wzr, [x0] ; CHECK-CVT-NEXT: ret ; CHECK-CVT-NEXT: [[BRCC_ELSE]]: ; CHECK-CVT-NEXT: str wzr, [x1] ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_br_cc: ; CHECK-FP16-NEXT: fcmp h0, h1 ; CHECK-FP16-NEXT: b.mi [[BRCC_ELSE:.?LBB[0-9_]+]] ; CHECK-FP16-NEXT: str wzr, [x0] ; CHECK-FP16-NEXT: ret ; CHECK-FP16-NEXT: [[BRCC_ELSE]]: ; CHECK-FP16-NEXT: str wzr, [x1] ; CHECK-FP16-NEXT: ret define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 { %c = fcmp uge half %a, %b br i1 %c, label %then, label %else then: store i32 0, i32* %p1 ret void else: store i32 0, i32* %p2 ret void } ; CHECK-COMMON-LABEL: test_phi: ; CHECK-COMMON: mov x[[PTR:[0-9]+]], x0 ; CHECK-COMMON: ldr h[[AB:[0-9]+]], [x0] ; CHECK-COMMON: [[LOOP:LBB[0-9_]+]]: ; CHECK-COMMON: mov.16b v[[R:[0-9]+]], v[[AB]] ; CHECK-COMMON: ldr h[[AB]], [x[[PTR]]] ; CHECK-COMMON: mov x0, x[[PTR]] ; CHECK-COMMON: bl {{_?}}test_dummy ; CHECK-COMMON: mov.16b v0, v[[R]] ; CHECK-COMMON: ret define half @test_phi(half* %p1) #0 { entry: %a = load half, half* %p1 br label %loop loop: %r = phi half [%a, %entry], [%b, %loop] %b = load half, half* %p1 %c = call i1 @test_dummy(half* %p1) br i1 %c, label %loop, label %return return: ret half %r } declare i1 @test_dummy(half* %p1) #0 ; CHECK-CVT-LABEL: test_fptosi_i32: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvtzs w0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fptosi_i32: ; CHECK-FP16-NEXT: fcvtzs w0, h0 ; CHECK-FP16-NEXT: ret define i32 @test_fptosi_i32(half %a) #0 { %r = fptosi half %a to i32 ret i32 %r } ; CHECK-CVT-LABEL: test_fptosi_i64: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvtzs x0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fptosi_i64: ; CHECK-FP16-NEXT: fcvtzs x0, h0 ; CHECK-FP16-NEXT: ret define i64 @test_fptosi_i64(half %a) #0 { %r = fptosi half %a to i64 ret i64 %r } ; CHECK-CVT-LABEL: test_fptoui_i32: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvtzu w0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fptoui_i32: ; CHECK-FP16-NEXT: fcvtzu w0, h0 ; CHECK-FP16-NEXT: ret define i32 @test_fptoui_i32(half %a) #0 { %r = fptoui half %a to i32 ret i32 %r } ; CHECK-CVT-LABEL: test_fptoui_i64: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvtzu x0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fptoui_i64: ; CHECK-FP16-NEXT: fcvtzu x0, h0 ; CHECK-FP16-NEXT: ret define i64 @test_fptoui_i64(half %a) #0 { %r = fptoui half %a to i64 ret i64 %r } ; CHECK-CVT-LABEL: test_uitofp_i32: ; CHECK-CVT-NEXT: ucvtf s0, w0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_uitofp_i32: ; CHECK-FP16-NEXT: ucvtf h0, w0 ; CHECK-FP16-NEXT: ret define half @test_uitofp_i32(i32 %a) #0 { %r = uitofp i32 %a to half ret half %r } ; CHECK-CVT-LABEL: test_uitofp_i64: ; CHECK-CVT-NEXT: ucvtf s0, x0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_uitofp_i64: ; CHECK-FP16-NEXT: ucvtf h0, x0 ; CHECK-FP16-NEXT: ret define half @test_uitofp_i64(i64 %a) #0 { %r = uitofp i64 %a to half ret half %r } ; CHECK-CVT-LABEL: test_sitofp_i32: ; CHECK-CVT-NEXT: scvtf s0, w0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_sitofp_i32: ; CHECK-FP16-NEXT: scvtf h0, w0 ; CHECK-FP16-NEXT: ret 
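; Without +fullfp16, the int-to-half conversions in these tests go through
; single precision (scvtf/ucvtf into an s register, then fcvt down to h);
; with +fullfp16 a single scvtf/ucvtf writing the h register is expected.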
define half @test_sitofp_i32(i32 %a) #0 { %r = sitofp i32 %a to half ret half %r } ; CHECK-CVT-LABEL: test_sitofp_i64: ; CHECK-CVT-NEXT: scvtf s0, x0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_sitofp_i64: ; CHECK-FP16-NEXT: scvtf h0, x0 ; CHECK-FP16-NEXT: ret define half @test_sitofp_i64(i64 %a) #0 { %r = sitofp i64 %a to half ret half %r } ; CHECK-CVT-LABEL: test_uitofp_i32_fadd: ; CHECK-CVT-NEXT: ucvtf s1, w0 ; CHECK-CVT-NEXT: fcvt h1, s1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_uitofp_i32_fadd: ; CHECK-FP16-NEXT: ucvtf h1, w0 ; CHECK-FP16-NEXT: fadd h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 { %c = uitofp i32 %a to half %r = fadd half %b, %c ret half %r } ; CHECK-CVT-LABEL: test_sitofp_i32_fadd: ; CHECK-CVT-NEXT: scvtf s1, w0 ; CHECK-CVT-NEXT: fcvt h1, s1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_sitofp_i32_fadd: ; CHECK-FP16-NEXT: scvtf h1, w0 ; CHECK-FP16-NEXT: fadd h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { %c = sitofp i32 %a to half %r = fadd half %b, %c ret half %r } ; CHECK-COMMON-LABEL: test_fptrunc_float: ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ret define half @test_fptrunc_float(float %a) #0 { %r = fptrunc float %a to half ret half %r } ; CHECK-COMMON-LABEL: test_fptrunc_double: ; CHECK-COMMON-NEXT: fcvt h0, d0 ; CHECK-COMMON-NEXT: ret define half @test_fptrunc_double(double %a) #0 { %r = fptrunc double %a to half ret half %r } ; CHECK-COMMON-LABEL: test_fpext_float: ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: ret define float @test_fpext_float(half %a) #0 { %r = fpext half %a to float ret float %r } ; CHECK-COMMON-LABEL: test_fpext_double: ; CHECK-COMMON-NEXT: fcvt d0, h0 ; CHECK-COMMON-NEXT: ret define double @test_fpext_double(half %a) #0 { %r = fpext half %a to double ret double %r } ; CHECK-COMMON-LABEL: test_bitcast_halftoi16: ; CHECK-COMMON-NEXT: fmov w0, s0 ; CHECK-COMMON-NEXT: ret define i16 @test_bitcast_halftoi16(half %a) #0 { %r = bitcast half %a to i16 ret i16 %r } ; CHECK-COMMON-LABEL: test_bitcast_i16tohalf: ; CHECK-COMMON-NEXT: fmov s0, w0 ; CHECK-COMMON-NEXT: ret define half @test_bitcast_i16tohalf(i16 %a) #0 { %r = bitcast i16 %a to half ret half %r } declare half @llvm.sqrt.f16(half %a) #0 declare half @llvm.powi.f16(half %a, i32 %b) #0 declare half @llvm.sin.f16(half %a) #0 declare half @llvm.cos.f16(half %a) #0 declare half @llvm.pow.f16(half %a, half %b) #0 declare half @llvm.exp.f16(half %a) #0 declare half @llvm.exp2.f16(half %a) #0 declare half @llvm.log.f16(half %a) #0 declare half @llvm.log10.f16(half %a) #0 declare half @llvm.log2.f16(half %a) #0 declare half @llvm.fma.f16(half %a, half %b, half %c) #0 declare half @llvm.fabs.f16(half %a) #0 declare half @llvm.minnum.f16(half %a, half %b) #0 declare half @llvm.maxnum.f16(half %a, half %b) #0 declare half @llvm.copysign.f16(half %a, half %b) #0 declare half @llvm.floor.f16(half %a) #0 declare half @llvm.ceil.f16(half %a) #0 declare half @llvm.trunc.f16(half %a) #0 declare half @llvm.rint.f16(half %a) #0 declare half @llvm.nearbyint.f16(half %a) #0 declare half @llvm.round.f16(half %a) #0 declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0 declare half @llvm.aarch64.neon.frecpe.f16(half %a) 
#0 declare half @llvm.aarch64.neon.frecpx.f16(half %a) #0 declare half @llvm.aarch64.neon.frsqrte.f16(half %a) #0 ; CHECK-CVT-LABEL: test_sqrt: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fsqrt s0, s0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_sqrt: ; CHECK-FP16-NEXT: fsqrt h0, h0 ; CHECK-FP16-NEXT: ret define half @test_sqrt(half %a) #0 { %r = call half @llvm.sqrt.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_powi: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}__powisf2 ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_powi(half %a, i32 %b) #0 { %r = call half @llvm.powi.f16(half %a, i32 %b) ret half %r } ; CHECK-COMMON-LABEL: test_sin: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}sinf ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_sin(half %a) #0 { %r = call half @llvm.sin.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_cos: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}cosf ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_cos(half %a) #0 { %r = call half @llvm.cos.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_pow: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: fcvt s1, h1 ; CHECK-COMMON-NEXT: bl {{_?}}powf ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_pow(half %a, half %b) #0 { %r = call half @llvm.pow.f16(half %a, half %b) ret half %r } ; CHECK-COMMON-LABEL: test_exp: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}expf ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_exp(half %a) #0 { %r = call half @llvm.exp.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_exp2: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}exp2f ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_exp2(half %a) #0 { %r = call half @llvm.exp2.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_log: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}logf ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_log(half %a) #0 { %r = call half @llvm.log.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_log10: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}log10f ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_log10(half %a) #0 { %r = call half @llvm.log10.f16(half %a) ret half %r } ; CHECK-COMMON-LABEL: test_log2: ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: fcvt s0, h0 ; CHECK-COMMON-NEXT: bl {{_?}}log2f ; CHECK-COMMON-NEXT: fcvt h0, s0 ; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 ; CHECK-COMMON-NEXT: ret define half @test_log2(half %a) #0 { %r = call half @llvm.log2.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_fma: ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fmadd s0, s0, s1, s2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fma: ; CHECK-FP16-NEXT: fmadd h0, h0, h1, h2 ; CHECK-FP16-NEXT: ret define half @test_fma(half %a, half %b, half %c) #0 { %r = call half @llvm.fma.f16(half %a, half %b, half %c) ret half %r } ; CHECK-CVT-LABEL: test_fabs: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fabs s0, s0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fabs: ; CHECK-FP16-NEXT: fabs h0, h0 ; CHECK-FP16-NEXT: ret define half @test_fabs(half %a) #0 { %r = call half @llvm.fabs.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_minnum: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fminnm s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_minnum: ; CHECK-FP16-NEXT: fminnm h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_minnum(half %a, half %b) #0 { %r = call half @llvm.minnum.f16(half %a, half %b) ret half %r } ; CHECK-CVT-LABEL: test_maxnum: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fmaxnm s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_maxnum: ; CHECK-FP16-NEXT: fmaxnm h0, h0, h1 ; CHECK-FP16-NEXT: ret define half @test_maxnum(half %a, half %b) #0 { %r = call half @llvm.maxnum.f16(half %a, half %b) ret half %r } ; CHECK-CVT-LABEL: test_copysign: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign: ; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: bit.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret define half @test_copysign(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) ret half %r } ; CHECK-CVT-LABEL: test_copysign_f32: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f32: ; CHECK-FP16-NEXT: fcvt h1, s1 ; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: bit.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret define half @test_copysign_f32(half %a, float %b) #0 { %tb = fptrunc float %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) ret half %r } ; CHECK-CVT-LABEL: test_copysign_f64: ; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f64: ; CHECK-FP16-NEXT: fcvt h1, d1 ; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 ; 
CHECK-FP16-NEXT: bit.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret define half @test_copysign_f64(half %a, double %b) #0 { %tb = fptrunc double %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) ret half %r } ; Check that the FP promotion will use a truncating FP_ROUND, so we can fold ; away the (fpext (fp_round )) here. ; CHECK-CVT-LABEL: test_copysign_extended: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_extended: ; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: bit.16b v0, v1, v2 ; CHECK-FP16-NEXT: fcvt s0, h0 ; CHECK-FP16-NEXT: ret define float @test_copysign_extended(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) %xr = fpext half %r to float ret float %xr } ; CHECK-CVT-LABEL: test_floor: ; CHECK-CVT-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 ; CHECK-CVT-NEXT: frintm [[INT32:s[0-9]+]], [[FLOAT32]] ; CHECK-CVT-NEXT: fcvt h0, [[INT32]] ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_floor: ; CHECK-FP16-NEXT: frintm h0, h0 ; CHECK-FP16-NEXT: ret define half @test_floor(half %a) #0 { %r = call half @llvm.floor.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_ceil: ; CHECK-CVT-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 ; CHECK-CVT-NEXT: frintp [[INT32:s[0-9]+]], [[FLOAT32]] ; CHECK-CVT-NEXT: fcvt h0, [[INT32]] ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_ceil: ; CHECK-FP16-NEXT: frintp h0, h0 ; CHECK-FP16-NEXT: ret define half @test_ceil(half %a) #0 { %r = call half @llvm.ceil.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_trunc: ; CHECK-CVT-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 ; CHECK-CVT-NEXT: frintz [[INT32:s[0-9]+]], [[FLOAT32]] ; CHECK-CVT-NEXT: fcvt h0, [[INT32]] ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_trunc: ; CHECK-FP16-NEXT: frintz h0, h0 ; CHECK-FP16-NEXT: ret define half @test_trunc(half %a) #0 { %r = call half @llvm.trunc.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_rint: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: frintx s0, s0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_rint: ; CHECK-FP16-NEXT: frintx h0, h0 ; CHECK-FP16-NEXT: ret define half @test_rint(half %a) #0 { %r = call half @llvm.rint.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_nearbyint: ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: frinti s0, s0 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_nearbyint: ; CHECK-FP16-NEXT: frinti h0, h0 ; CHECK-FP16-NEXT: ret define half @test_nearbyint(half %a) #0 { %r = call half @llvm.nearbyint.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_round: ; CHECK-CVT-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0 ; CHECK-CVT-NEXT: frinta [[INT32:s[0-9]+]], [[FLOAT32]] ; CHECK-CVT-NEXT: fcvt h0, [[INT32]] ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_round: ; CHECK-FP16-NEXT: frinta h0, h0 ; CHECK-FP16-NEXT: ret define half @test_round(half %a) #0 { %r = call half @llvm.round.f16(half %a) ret half %r } ; CHECK-CVT-LABEL: test_fmuladd: ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fmul s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: fcvt s1, h2 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_fmuladd: ; CHECK-FP16-NEXT: fmul h0, h0, h1 ; CHECK-FP16-NEXT: fadd h0, h0, h2 ; CHECK-FP16-NEXT: ret define half @test_fmuladd(half %a, half %b, half %c) #0 { %r = call half @llvm.fmuladd.f16(half %a, 
half %b, half %c) ret half %r } ; CHECK-FP16-LABEL: test_vrecpeh_f16: ; CHECK-FP16-NEXT: frecpe h0, h0 ; CHECK-FP16-NEXT: ret define half @test_vrecpeh_f16(half %a) #0 { %r = call half @llvm.aarch64.neon.frecpe.f16(half %a) ret half %r } ; CHECK-FP16-LABEL: test_vrecpxh_f16: ; CHECK-FP16-NEXT: frecpx h0, h0 ; CHECK-FP16-NEXT: ret define half @test_vrecpxh_f16(half %a) #0 { %r = call half @llvm.aarch64.neon.frecpx.f16(half %a) ret half %r } ; CHECK-FP16-LABEL: test_vrsqrteh_f16: ; CHECK-FP16-NEXT: frsqrte h0, h0 ; CHECK-FP16-NEXT: ret define half @test_vrsqrteh_f16(half %a) #0 { %r = call half @llvm.aarch64.neon.frsqrte.f16(half %a) ret half %r } attributes #0 = { nounwind } Index: vendor/llvm/dist-release_70/test/CodeGen/PowerPC/build-vector-tests.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/PowerPC/build-vector-tests.ll (revision 337298) +++ vendor/llvm/dist-release_70/test/CodeGen/PowerPC/build-vector-tests.ll (revision 337299) @@ -1,4858 +1,4807 @@ ; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck -allow-deprecated-dag-overlap %s \ ; RUN: -check-prefix=P9BE -implicit-check-not frsp ; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck -allow-deprecated-dag-overlap %s \ ; RUN: -check-prefix=P9LE -implicit-check-not frsp ; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck -allow-deprecated-dag-overlap %s \ ; RUN: -check-prefix=P8BE -implicit-check-not frsp ; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck -allow-deprecated-dag-overlap %s \ ; RUN: -check-prefix=P8LE -implicit-check-not frsp ; This test case comes from the following C test case (included as it may be ; slightly more readable than the LLVM IR. ;/* This test case provides various ways of building vectors to ensure we ; produce optimal code for all cases. 
The cases are (for each type): ; - All zeros ; - All ones ; - Splat of a constant ; - From different values already in registers ; - From different constants ; - From different values in memory ; - Splat of a value in register ; - Splat of a value in memory ; - Inserting element into existing vector ; - Inserting element from existing vector into existing vector ; ; With conversions (float <-> int) ; - Splat of a constant ; - From different values already in registers ; - From different constants ; - From different values in memory ; - Splat of a value in register ; - Splat of a value in memory ; - Inserting element into existing vector ; - Inserting element from existing vector into existing vector ;*/ ; ;/*=================================== int ===================================*/ ;// P8: xxlxor // ;// P9: xxlxor // ;vector int allZeroi() { // ; return (vector int)0; // ;} // ;// P8: vspltisb -1 // ;// P9: xxspltisb 255 // ;vector int allOnei() { // ; return (vector int)-1; // ;} // ;// P8: vspltisw 1 // ;// P9: vspltisw 1 // ;vector int spltConst1i() { // ; return (vector int)1; // ;} // ;// P8: vspltisw -15; vsrw // ;// P9: vspltisw -15; vsrw // ;vector int spltConst16ki() { // ; return (vector int)((1<<15) - 1); // ;} // ;// P8: vspltisw -16; vsrw // ;// P9: vspltisw -16; vsrw // ;vector int spltConst32ki() { // ; return (vector int)((1<<16) - 1); // ;} // ;// P8: 4 x mtvsrwz, 2 x xxmrgh, vmrgow // ;// P9: 2 x mtvsrdd, vmrgow // ;vector int fromRegsi(int a, int b, int c, int d) { // ; return (vector int){ a, b, c, d }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (or even lxv) // ;vector int fromDiffConstsi() { // ; return (vector int) { 242, -113, 889, 19 }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx // ;vector int fromDiffMemConsAi(int *arr) { // ; return (vector int) { arr[0], arr[1], arr[2], arr[3] }; // ;} // ;// P8: 2 x lxvd2x, 2 x xxswapd, vperm // ;// P9: 2 x lxvx, vperm // ;vector int fromDiffMemConsDi(int *arr) { // ; return (vector int) { arr[3], arr[2], arr[1], arr[0] }; // ;} // ;// P8: sldi 2, lxvd2x, xxswapd // ;// P9: sldi 2, lxvx // ;vector int fromDiffMemVarAi(int *arr, int elem) { // ; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; // ;} // ;// P8: sldi 2, 2 x lxvd2x, 2 x xxswapd, vperm // ;// P9: sldi 2, 2 x lxvx, vperm // ;vector int fromDiffMemVarDi(int *arr, int elem) { // ; return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] }; // ;} // ;// P8: 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow // ;// P9: 4 x lwz, 2 x mtvsrdd, vmrgow // ;vector int fromRandMemConsi(int *arr) { // ; return (vector int) { arr[4], arr[18], arr[2], arr[88] }; // ;} // ;// P8: sldi 2, 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow // ;// P9: sldi 2, add, 4 x lwz, 2 x mtvsrdd, vmrgow // ;vector int fromRandMemVari(int *arr, int elem) { // ; return (vector int) { arr[elem+4], arr[elem+1], arr[elem+2], arr[elem+8] };// ;} // ;// P8: mtvsrwz, xxspltw // ;// P9: mtvsrws // ;vector int spltRegVali(int val) { // ; return (vector int) val; // ;} // ;// P8: lxsiwax, xxspltw // ;// P9: lxvwsx // ;vector int spltMemVali(int *ptr) { // ; return (vector int)*ptr; // ;} // ;// P8: vspltisw // ;// P9: vspltisw // ;vector int spltCnstConvftoi() { // ; return (vector int) 4.74f; // ;} // -;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvdpsxws // +;// P8: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;vector int fromRegsConvftoi(float a, float b, float c, float 
d) { // ; return (vector int) { a, b, c, d }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector int fromDiffConstsConvftoi() { // ; return (vector int) { 24.46f, 234.f, 988.19f, 422.39f }; // ;} // ;// P8: lxvd2x, xxswapd, xvcvspsxws // ;// P9: lxvx, xvcvspsxws // ;vector int fromDiffMemConsAConvftoi(float *ptr) { // ; return (vector int) { ptr[0], ptr[1], ptr[2], ptr[3] }; // ;} // ;// P8: 2 x lxvd2x, 2 x xxswapd, vperm, xvcvspsxws // ;// P9: 2 x lxvx, vperm, xvcvspsxws // ;vector int fromDiffMemConsDConvftoi(float *ptr) { // ; return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // ;} // -;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // ;// sldi 2, load, xvcvspuxws // ;vector int fromDiffMemVarAConvftoi(float *arr, int elem) { // ; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; // ;} // -;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // ;// sldi 2, 2 x load, vperm, xvcvspuxws // ;vector int fromDiffMemVarDConvftoi(float *arr, int elem) { // ; return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] }; // ;} // ;// P8: xscvdpsxws, xxspltw // ;// P9: xscvdpsxws, xxspltw // ;vector int spltRegValConvftoi(float val) { // ; return (vector int) val; // ;} // ;// P8: lxsspx, xscvdpsxws, xxspltw // ;// P9: lxvwsx, xvcvspsxws // ;vector int spltMemValConvftoi(float *ptr) { // ; return (vector int)*ptr; // ;} // ;// P8: vspltisw // ;// P9: vspltisw // ;vector int spltCnstConvdtoi() { // ; return (vector int) 4.74; // ;} // -;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +;// P8: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;vector int fromRegsConvdtoi(double a, double b, double c, double d) { // ; return (vector int) { a, b, c, d }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector int fromDiffConstsConvdtoi() { // ; return (vector int) { 24.46, 234., 988.19, 422.39 }; // ;} // -;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, // -;// xvcvspsxws // -;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, // -;// xvcvspsxws // +;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspsxws, vmrgew // ;vector int fromDiffMemConsAConvdtoi(double *ptr) { // ; return (vector int) { ptr[0], ptr[1], ptr[2], ptr[3] }; // ;} // -;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;vector int fromDiffMemConsDConvdtoi(double *ptr) { // ; return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // ;} // -;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws 
// +;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;vector int fromDiffMemVarAConvdtoi(double *arr, int elem) { // ; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; // ;} // -;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // -;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // +;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ;vector int fromDiffMemVarDConvdtoi(double *arr, int elem) { // ; return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] }; // ;} // ;// P8: xscvdpsxws, xxspltw // ;// P9: xscvdpsxws, xxspltw // ;vector int spltRegValConvdtoi(double val) { // ; return (vector int) val; // ;} // ;// P8: lxsdx, xscvdpsxws, xxspltw // ;// P9: lxssp, xscvdpsxws, xxspltw // ;vector int spltMemValConvdtoi(double *ptr) { // ; return (vector int)*ptr; // ;} // ;/*=================================== int ===================================*/ ;/*=============================== unsigned int ==============================*/ ;// P8: xxlxor // ;// P9: xxlxor // ;vector unsigned int allZeroui() { // ; return (vector unsigned int)0; // ;} // ;// P8: vspltisb -1 // ;// P9: xxspltisb 255 // ;vector unsigned int allOneui() { // ; return (vector unsigned int)-1; // ;} // ;// P8: vspltisw 1 // ;// P9: vspltisw 1 // ;vector unsigned int spltConst1ui() { // ; return (vector unsigned int)1; // ;} // ;// P8: vspltisw -15; vsrw // ;// P9: vspltisw -15; vsrw // ;vector unsigned int spltConst16kui() { // ; return (vector unsigned int)((1<<15) - 1); // ;} // ;// P8: vspltisw -16; vsrw // ;// P9: vspltisw -16; vsrw // ;vector unsigned int spltConst32kui() { // ; return (vector unsigned int)((1<<16) - 1); // ;} // ;// P8: 4 x mtvsrwz, 2 x xxmrghd, vmrgow // ;// P9: 2 x mtvsrdd, vmrgow // ;vector unsigned int fromRegsui(unsigned int a, unsigned int b, // ; unsigned int c, unsigned int d) { // ; return (vector unsigned int){ a, b, c, d }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (or even lxv) // ;vector unsigned int fromDiffConstsui() { // ; return (vector unsigned int) { 242, -113, 889, 19 }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx // ;vector unsigned int fromDiffMemConsAui(unsigned int *arr) { // ; return (vector unsigned int) { arr[0], arr[1], arr[2], arr[3] }; // ;} // ;// P8: 2 x lxvd2x, 2 x xxswapd, vperm // ;// P9: 2 x lxvx, vperm // ;vector unsigned int fromDiffMemConsDui(unsigned int *arr) { // ; return (vector unsigned int) { arr[3], arr[2], arr[1], arr[0] }; // ;} // ;// P8: sldi 2, lxvd2x, xxswapd // ;// P9: sldi 2, lxvx // ;vector unsigned int fromDiffMemVarAui(unsigned int *arr, int elem) { // ; return (vector unsigned int) { arr[elem], arr[elem+1], // ; arr[elem+2], arr[elem+3] }; // ;} // ;// P8: sldi 2, 2 x lxvd2x, 2 x xxswapd, vperm // ;// P9: sldi 2, 2 x lxvx, vperm // ;vector unsigned int fromDiffMemVarDui(unsigned int *arr, int elem) { // ; return (vector unsigned int) { arr[elem], arr[elem-1], // ; arr[elem-2], arr[elem-3] }; // ;} // ;// P8: 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow // ;// P9: 4 x lwz, 2 x mtvsrdd, vmrgow // ;vector unsigned int fromRandMemConsui(unsigned int *arr) { // ; return (vector unsigned int) { arr[4], arr[18], arr[2], arr[88] }; // ;} // ;// P8: sldi 2, 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow // ;// P9: sldi 2, add, 4 x lwz, 2 x mtvsrdd, vmrgow // ;vector unsigned int fromRandMemVarui(unsigned int *arr, 
int elem) { // ; return (vector unsigned int) { arr[elem+4], arr[elem+1], // ; arr[elem+2], arr[elem+8] }; // ;} // ;// P8: mtvsrwz, xxspltw // ;// P9: mtvsrws // ;vector unsigned int spltRegValui(unsigned int val) { // ; return (vector unsigned int) val; // ;} // ;// P8: lxsiwax, xxspltw // ;// P9: lxvwsx // ;vector unsigned int spltMemValui(unsigned int *ptr) { // ; return (vector unsigned int)*ptr; // ;} // ;// P8: vspltisw // ;// P9: vspltisw // ;vector unsigned int spltCnstConvftoui() { // ; return (vector unsigned int) 4.74f; // ;} // -;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;vector unsigned int fromRegsConvftoui(float a, float b, float c, float d) { // ; return (vector unsigned int) { a, b, c, d }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector unsigned int fromDiffConstsConvftoui() { // ; return (vector unsigned int) { 24.46f, 234.f, 988.19f, 422.39f }; // ;} // ;// P8: lxvd2x, xxswapd, xvcvspuxws // ;// P9: lxvx, xvcvspuxws // ;vector unsigned int fromDiffMemConsAConvftoui(float *ptr) { // ; return (vector unsigned int) { ptr[0], ptr[1], ptr[2], ptr[3] }; // ;} // ;// P8: 2 x lxvd2x, 2 x xxswapd, vperm, xvcvspuxws // ;// P9: 2 x lxvx, vperm, xvcvspuxws // ;vector unsigned int fromDiffMemConsDConvftoui(float *ptr) { // ; return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // ;} // -;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // ;// sldi 2, load, xvcvspuxws // ;vector unsigned int fromDiffMemVarAConvftoui(float *arr, int elem) { // ; return (vector unsigned int) { arr[elem], arr[elem+1], // ; arr[elem+2], arr[elem+3] }; // ;} // -;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // ;// sldi 2, 2 x load, vperm, xvcvspuxws // ;vector unsigned int fromDiffMemVarDConvftoui(float *arr, int elem) { // ; return (vector unsigned int) { arr[elem], arr[elem-1], // ; arr[elem-2], arr[elem-3] }; // ;} // ;// P8: xscvdpuxws, xxspltw // ;// P9: xscvdpuxws, xxspltw // ;vector unsigned int spltRegValConvftoui(float val) { // ; return (vector unsigned int) val; // ;} // ;// P8: lxsspx, xscvdpuxws, xxspltw // ;// P9: lxvwsx, xvcvspuxws // ;vector unsigned int spltMemValConvftoui(float *ptr) { // ; return (vector unsigned int)*ptr; // ;} // ;// P8: vspltisw // ;// P9: vspltisw // ;vector unsigned int spltCnstConvdtoui() { // ; return (vector unsigned int) 4.74; // ;} // -;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;vector unsigned int fromRegsConvdtoui(double a, double b, // ; double c, double d) { // ; return (vector unsigned int) { a, b, c, d }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector unsigned int 
fromDiffConstsConvdtoui() { // ; return (vector unsigned int) { 24.46, 234., 988.19, 422.39 }; // ;} // -;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, // -;// xvcvspuxws // -;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvspuxws, vmrgew // ;vector unsigned int fromDiffMemConsAConvdtoui(double *ptr) { // ; return (vector unsigned int) { ptr[0], ptr[1], ptr[2], ptr[3] }; // ;} // -;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;vector unsigned int fromDiffMemConsDConvdtoui(double *ptr) { // ; return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // ;} // -;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;vector unsigned int fromDiffMemVarAConvdtoui(double *arr, int elem) { // ; return (vector unsigned int) { arr[elem], arr[elem+1], // ; arr[elem+2], arr[elem+3] }; // ;} // -;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // -;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // +;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ;vector unsigned int fromDiffMemVarDConvdtoui(double *arr, int elem) { // ; return (vector unsigned int) { arr[elem], arr[elem-1], // ; arr[elem-2], arr[elem-3] }; // ;} // ;// P8: xscvdpuxws, xxspltw // ;// P9: xscvdpuxws, xxspltw // ;vector unsigned int spltRegValConvdtoui(double val) { // ; return (vector unsigned int) val; // ;} // ;// P8: lxsspx, xscvdpuxws, xxspltw // ;// P9: lfd, xscvdpuxws, xxspltw // ;vector unsigned int spltMemValConvdtoui(double *ptr) { // ; return (vector unsigned int)*ptr; // ;} // ;/*=============================== unsigned int ==============================*/ ;/*=============================== long long =================================*/ ;// P8: xxlxor // ;// P9: xxlxor // ;vector long long allZeroll() { // ; return (vector long long)0; // ;} // ;// P8: vspltisb -1 // ;// P9: xxspltisb 255 // ;vector long long allOnell() { // ; return (vector long long)-1; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;vector long long spltConst1ll() { // ; return (vector long long)1; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;vector long long spltConst16kll() { // ; return (vector long long)((1<<15) - 1); // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;vector long long spltConst32kll() { // ; return (vector long long)((1<<16) - 1); // ;} // ;// P8: 2 x mtvsrd, xxmrghd // ;// P9: mtvsrdd // ;vector long long fromRegsll(long long a, long long b) { // ; return (vector long long){ a, b }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (or even lxv) // ;vector long long fromDiffConstsll() { // ; 
return (vector long long) { 242, -113 }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx // ;vector long long fromDiffMemConsAll(long long *arr) { // ; return (vector long long) { arr[0], arr[1] }; // ;} // ;// P8: lxvd2x // ;// P9: lxvx, xxswapd (maybe just use lxvd2x) // ;vector long long fromDiffMemConsDll(long long *arr) { // ; return (vector long long) { arr[3], arr[2] }; // ;} // ;// P8: sldi 3, lxvd2x, xxswapd // ;// P9: sldi 3, lxvx // ;vector long long fromDiffMemVarAll(long long *arr, int elem) { // ; return (vector long long) { arr[elem], arr[elem+1] }; // ;} // ;// P8: sldi 3, lxvd2x // ;// P9: sldi 3, lxvx, xxswapd (maybe just use lxvd2x) // ;vector long long fromDiffMemVarDll(long long *arr, int elem) { // ; return (vector long long) { arr[elem], arr[elem-1] }; // ;} // ;// P8: 2 x ld, 2 x mtvsrd, xxmrghd // ;// P9: 2 x ld, mtvsrdd // ;vector long long fromRandMemConsll(long long *arr) { // ; return (vector long long) { arr[4], arr[18] }; // ;} // ;// P8: sldi 3, add, 2 x ld, 2 x mtvsrd, xxmrghd // ;// P9: sldi 3, add, 2 x ld, mtvsrdd // ;vector long long fromRandMemVarll(long long *arr, int elem) { // ; return (vector long long) { arr[elem+4], arr[elem+1] }; // ;} // ;// P8: mtvsrd, xxspltd // ;// P9: mtvsrdd // ;vector long long spltRegValll(long long val) { // ; return (vector long long) val; // ;} // ;// P8: lxvdsx // ;// P9: lxvdsx // ;vector long long spltMemValll(long long *ptr) { // ; return (vector long long)*ptr; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;vector long long spltCnstConvftoll() { // ; return (vector long long) 4.74f; // ;} // ;// P8: xxmrghd, xvcvdpsxds // ;// P9: xxmrghd, xvcvdpsxds // ;vector long long fromRegsConvftoll(float a, float b) { // ; return (vector long long) { a, b }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector long long fromDiffConstsConvftoll() { // ; return (vector long long) { 24.46f, 234.f }; // ;} // ;// P8: 2 x lxsspx, xxmrghd, xvcvdpsxds // ;// P9: 2 x lxssp, xxmrghd, xvcvdpsxds // ;vector long long fromDiffMemConsAConvftoll(float *ptr) { // ; return (vector long long) { ptr[0], ptr[1] }; // ;} // ;// P8: 2 x lxsspx, xxmrghd, xvcvdpsxds // ;// P9: 2 x lxssp, xxmrghd, xvcvdpsxds // ;vector long long fromDiffMemConsDConvftoll(float *ptr) { // ; return (vector long long) { ptr[3], ptr[2] }; // ;} // ;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpsxds // ;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpsxds // ;vector long long fromDiffMemVarAConvftoll(float *arr, int elem) { // ; return (vector long long) { arr[elem], arr[elem+1] }; // ;} // ;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpsxds // ;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpsxds // ;vector long long fromDiffMemVarDConvftoll(float *arr, int elem) { // ; return (vector long long) { arr[elem], arr[elem-1] }; // ;} // ;// P8: xscvdpsxds, xxspltd // ;// P9: xscvdpsxds, xxspltd // ;vector long long spltRegValConvftoll(float val) { // ; return (vector long long) val; // ;} // ;// P8: lxsspx, xscvdpsxds, xxspltd // ;// P9: lfs, xscvdpsxds, xxspltd // ;vector long long spltMemValConvftoll(float *ptr) { // ; return (vector long long)*ptr; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;vector long long spltCnstConvdtoll() { // ; return (vector long long) 4.74; // ;} // ;// P8: xxmrghd, xvcvdpsxds // ;// P9: xxmrghd, xvcvdpsxds // ;vector long long 
fromRegsConvdtoll(double a, double b) { // ; return (vector long long) { a, b }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector long long fromDiffConstsConvdtoll() { // ; return (vector long long) { 24.46, 234. }; // ;} // ;// P8: lxvd2x, xxswapd, xvcvdpsxds // ;// P9: lxvx, xvcvdpsxds // ;vector long long fromDiffMemConsAConvdtoll(double *ptr) { // ; return (vector long long) { ptr[0], ptr[1] }; // ;} // ;// P8: lxvd2x, xvcvdpsxds // ;// P9: lxvx, xxswapd, xvcvdpsxds // ;vector long long fromDiffMemConsDConvdtoll(double *ptr) { // ; return (vector long long) { ptr[3], ptr[2] }; // ;} // ;// P8: sldi 3, lxvd2x, xxswapd, xvcvdpsxds // ;// P9: sldi 3, lxvx, xvcvdpsxds // ;vector long long fromDiffMemVarAConvdtoll(double *arr, int elem) { // ; return (vector long long) { arr[elem], arr[elem+1] }; // ;} // ;// P8: sldi 3, lxvd2x, xvcvdpsxds // ;// P9: sldi 3, lxvx, xxswapd, xvcvdpsxds // ;vector long long fromDiffMemVarDConvdtoll(double *arr, int elem) { // ; return (vector long long) { arr[elem], arr[elem-1] }; // ;} // ;// P8: xscvdpsxds, xxspltd // ;// P9: xscvdpsxds, xxspltd // ;vector long long spltRegValConvdtoll(double val) { // ; return (vector long long) val; // ;} // ;// P8: lxvdsx, xvcvdpsxds // ;// P9: lxvdsx, xvcvdpsxds // ;vector long long spltMemValConvdtoll(double *ptr) { // ; return (vector long long)*ptr; // ;} // ;/*=============================== long long =================================*/ ;/*========================== unsigned long long =============================*/ ;// P8: xxlxor // ;// P9: xxlxor // ;vector unsigned long long allZeroull() { // ; return (vector unsigned long long)0; // ;} // ;// P8: vspltisb -1 // ;// P9: xxspltisb 255 // ;vector unsigned long long allOneull() { // ; return (vector unsigned long long)-1; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;vector unsigned long long spltConst1ull() { // ; return (vector unsigned long long)1; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;vector unsigned long long spltConst16kull() { // ; return (vector unsigned long long)((1<<15) - 1); // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw)) // ;vector unsigned long long spltConst32kull() { // ; return (vector unsigned long long)((1<<16) - 1); // ;} // ;// P8: 2 x mtvsrd, xxmrghd // ;// P9: mtvsrdd // ;vector unsigned long long fromRegsull(unsigned long long a, // ; unsigned long long b) { // ; return (vector unsigned long long){ a, b }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (or even lxv) // ;vector unsigned long long fromDiffConstsull() { // ; return (vector unsigned long long) { 242, -113 }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx // ;vector unsigned long long fromDiffMemConsAull(unsigned long long *arr) { // ; return (vector unsigned long long) { arr[0], arr[1] }; // ;} // ;// P8: lxvd2x // ;// P9: lxvx, xxswapd (maybe just use lxvd2x) // ;vector unsigned long long fromDiffMemConsDull(unsigned long long *arr) { // ; return (vector unsigned long long) { arr[3], arr[2] }; // ;} // ;// P8: sldi 3, lxvd2x, xxswapd // ;// P9: sldi 3, lxvx // ;vector unsigned long long fromDiffMemVarAull(unsigned long long *arr, // ; int elem) { // ; return (vector unsigned long long) { arr[elem], arr[elem+1] }; // ;} // 
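;} //
;// As a rough sketch (hand-written here; value names are assumed, not //
;// copied from this file), fromDiffMemVarAull above reaches the backend as //
;// two consecutive i64 loads folded into a <2 x i64> build vector: //
;//   %idxprom = sext i32 %elem to i64 //
;//   %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom //
;//   %0 = load i64, i64* %arrayidx, align 8 //
;//   %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 //
;//   %add = add nsw i32 %elem, 1 //
;//   %idxprom1 = sext i32 %add to i64 //
;//   %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1 //
;//   %1 = load i64, i64* %arrayidx2, align 8 //
;//   %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 //
;//   ret <2 x i64> %vecinit3 //
;// which the sldi 3 (scale the index by 8) plus lxvd2x/lxvx sequences noted //
;// above select as a single vector load. //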
;// P8: sldi 3, lxvd2x // ;// P9: sldi 3, lxvx, xxswapd (maybe just use lxvd2x) // ;vector unsigned long long fromDiffMemVarDull(unsigned long long *arr, // ; int elem) { // ; return (vector unsigned long long) { arr[elem], arr[elem-1] }; // ;} // ;// P8: 2 x ld, 2 x mtvsrd, xxmrghd // ;// P9: 2 x ld, mtvsrdd // ;vector unsigned long long fromRandMemConsull(unsigned long long *arr) { // ; return (vector unsigned long long) { arr[4], arr[18] }; // ;} // ;// P8: sldi 3, add, 2 x ld, 2 x mtvsrd, xxmrghd // ;// P9: sldi 3, add, 2 x ld, mtvsrdd // ;vector unsigned long long fromRandMemVarull(unsigned long long *arr, // ; int elem) { // ; return (vector unsigned long long) { arr[elem+4], arr[elem+1] }; // ;} // ;// P8: mtvsrd, xxspltd // ;// P9: mtvsrdd // ;vector unsigned long long spltRegValull(unsigned long long val) { // ; return (vector unsigned long long) val; // ;} // ;// P8: lxvdsx // ;// P9: lxvdsx // ;vector unsigned long long spltMemValull(unsigned long long *ptr) { // ; return (vector unsigned long long)*ptr; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;vector unsigned long long spltCnstConvftoull() { // ; return (vector unsigned long long) 4.74f; // ;} // ;// P8: xxmrghd, xvcvdpuxds // ;// P9: xxmrghd, xvcvdpuxds // ;vector unsigned long long fromRegsConvftoull(float a, float b) { // ; return (vector unsigned long long) { a, b }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector unsigned long long fromDiffConstsConvftoull() { // ; return (vector unsigned long long) { 24.46f, 234.f }; // ;} // ;// P8: 2 x lxsspx, xxmrghd, xvcvdpuxds // ;// P9: 2 x lxssp, xxmrghd, xvcvdpuxds // ;vector unsigned long long fromDiffMemConsAConvftoull(float *ptr) { // ; return (vector unsigned long long) { ptr[0], ptr[1] }; // ;} // ;// P8: 2 x lxsspx, xxmrghd, xvcvdpuxds // ;// P9: 2 x lxssp, xxmrghd, xvcvdpuxds // ;vector unsigned long long fromDiffMemConsDConvftoull(float *ptr) { // ; return (vector unsigned long long) { ptr[3], ptr[2] }; // ;} // ;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpuxds // ;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpuxds // ;vector unsigned long long fromDiffMemVarAConvftoull(float *arr, int elem) { // ; return (vector unsigned long long) { arr[elem], arr[elem+1] }; // ;} // ;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpuxds // ;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpuxds // ;vector unsigned long long fromDiffMemVarDConvftoull(float *arr, int elem) { // ; return (vector unsigned long long) { arr[elem], arr[elem-1] }; // ;} // ;// P8: xscvdpuxds, xxspltd // ;// P9: xscvdpuxds, xxspltd // ;vector unsigned long long spltRegValConvftoull(float val) { // ; return (vector unsigned long long) val; // ;} // ;// P8: lxsspx, xscvdpuxds, xxspltd // ;// P9: lfs, xscvdpuxds, xxspltd // ;vector unsigned long long spltMemValConvftoull(float *ptr) { // ; return (vector unsigned long long)*ptr; // ;} // ;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw)) // ;vector unsigned long long spltCnstConvdtoull() { // ; return (vector unsigned long long) 4.74; // ;} // ;// P8: xxmrghd, xvcvdpuxds // ;// P9: xxmrghd, xvcvdpuxds // ;vector unsigned long long fromRegsConvdtoull(double a, double b) { // ; return (vector unsigned long long) { a, b }; // ;} // ;// P8: lxvd2x, xxswapd // ;// P9: lxvx (even lxv) // ;vector unsigned long long fromDiffConstsConvdtoull() { // ; return (vector unsigned 
long long) { 24.46, 234. }; //
;} //
;// P8: lxvd2x, xxswapd, xvcvdpuxds //
;// P9: lxvx, xvcvdpuxds //
;vector unsigned long long fromDiffMemConsAConvdtoull(double *ptr) { //
; return (vector unsigned long long) { ptr[0], ptr[1] }; //
;} //
;// P8: lxvd2x, xvcvdpuxds //
;// P9: lxvx, xxswapd, xvcvdpuxds //
;vector unsigned long long fromDiffMemConsDConvdtoull(double *ptr) { //
; return (vector unsigned long long) { ptr[3], ptr[2] }; //
;} //
;// P8: sldi 3, lxvd2x, xxswapd, xvcvdpuxds //
;// P9: sldi 3, lxvx, xvcvdpuxds //
;vector unsigned long long fromDiffMemVarAConvdtoull(double *arr, int elem) { //
; return (vector unsigned long long) { arr[elem], arr[elem+1] }; //
;} //
;// P8: sldi 3, lxvd2x, xvcvdpuxds //
;// P9: sldi 3, lxvx, xxswapd, xvcvdpuxds //
;vector unsigned long long fromDiffMemVarDConvdtoull(double *arr, int elem) { //
; return (vector unsigned long long) { arr[elem], arr[elem-1] }; //
;} //
;// P8: xscvdpuxds, xxspltd //
;// P9: xscvdpuxds, xxspltd //
;vector unsigned long long spltRegValConvdtoull(double val) { //
; return (vector unsigned long long) val; //
;} //
;// P8: lxvdsx, xvcvdpuxds //
;// P9: lxvdsx, xvcvdpuxds //
;vector unsigned long long spltMemValConvdtoull(double *ptr) { //
; return (vector unsigned long long)*ptr; //
;} //
;/*========================== unsigned long long ==============================*/

; Function Attrs: norecurse nounwind readnone
define <4 x i32> @allZeroi() {
entry:
  ret <4 x i32> zeroinitializer
; P9BE-LABEL: allZeroi
; P9LE-LABEL: allZeroi
; P8BE-LABEL: allZeroi
; P8LE-LABEL: allZeroi
; P9BE: xxlxor v2, v2, v2
; P9BE: blr
; P9LE: xxlxor v2, v2, v2
; P9LE: blr
; P8BE: xxlxor v2, v2, v2
; P8BE: blr
; P8LE: xxlxor v2, v2, v2
; P8LE: blr
}

; Function Attrs: norecurse nounwind readnone
define <4 x i32> @allOnei() {
entry:
  ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
; P9BE-LABEL: allOnei
; P9LE-LABEL: allOnei
; P8BE-LABEL: allOnei
; P8LE-LABEL: allOnei
; P9BE: xxspltib v2, 255
; P9BE: blr
; P9LE: xxspltib v2, 255
; P9LE: blr
; P8BE: vspltisb v2, -1
; P8BE: blr
; P8LE: vspltisb v2, -1
; P8LE: blr
}

; Function Attrs: norecurse nounwind readnone
define <4 x i32> @spltConst1i() {
entry:
  ret <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; P9BE-LABEL: spltConst1i
; P9LE-LABEL: spltConst1i
; P8BE-LABEL: spltConst1i
; P8LE-LABEL: spltConst1i
; P9BE: vspltisw v2, 1
; P9BE: blr
; P9LE: vspltisw v2, 1
; P9LE: blr
; P8BE: vspltisw v2, 1
; P8BE: blr
; P8LE: vspltisw v2, 1
; P8LE: blr
}

; Function Attrs: norecurse nounwind readnone
define <4 x i32> @spltConst16ki() {
entry:
  ret <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
; P9BE-LABEL: spltConst16ki
; P9LE-LABEL: spltConst16ki
; P8BE-LABEL: spltConst16ki
; P8LE-LABEL: spltConst16ki
; P9BE: vspltisw v2, -15
; P9BE: vsrw v2, v2, v2
; P9BE: blr
; P9LE: vspltisw v2, -15
; P9LE: vsrw v2, v2, v2
; P9LE: blr
; P8BE: vspltisw v2, -15
; P8BE: vsrw v2, v2, v2
; P8BE: blr
; P8LE: vspltisw v2, -15
; P8LE: vsrw v2, v2, v2
; P8LE: blr
}

; Function Attrs: norecurse nounwind readnone
define <4 x i32> @spltConst32ki() {
entry:
  ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
; P9BE-LABEL: spltConst32ki
; P9LE-LABEL: spltConst32ki
; P8BE-LABEL: spltConst32ki
; P8LE-LABEL: spltConst32ki
; P9BE: vspltisw v2, -16
; P9BE: vsrw v2, v2, v2
; P9BE: blr
; P9LE: vspltisw v2, -16
; P9LE: vsrw v2, v2, v2
; P9LE: blr
; P8BE: vspltisw v2, -16
; P8BE: vsrw v2, v2, v2
; P8BE: blr
; P8LE: vspltisw v2, -16
; P8LE: vsrw v2, v2, v2
; P8LE: blr
}

; Function Attrs: norecurse nounwind readnone
define <4 x i32> @fromRegsi(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d) {
entry:
  %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
  %vecinit1 = insertelement <4
x i32> %vecinit, i32 %b, i32 1 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %c, i32 2 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %d, i32 3 ret <4 x i32> %vecinit3 ; P9BE-LABEL: fromRegsi ; P9LE-LABEL: fromRegsi ; P8BE-LABEL: fromRegsi ; P8LE-LABEL: fromRegsi ; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5 ; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6 ; P9BE: vmrgow v2, [[REG1]], [[REG2]] ; P9BE: blr ; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3 ; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4 ; P9LE: vmrgow v2, [[REG2]], [[REG1]] ; P9LE: blr ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3 ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4 ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5 ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6 ; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]] ; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]] ; P8BE: vmrgow v2, [[REG5]], [[REG6]] ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3 ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4 ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5 ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6 ; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]] ; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]] ; P8LE: vmrgow v2, [[REG6]], [[REG5]] } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromDiffConstsi() { entry: ret <4 x i32> ; P9BE-LABEL: fromDiffConstsi ; P9LE-LABEL: fromDiffConstsi ; P8BE-LABEL: fromDiffConstsi ; P8LE-LABEL: fromDiffConstsi ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lvx ; P8LE-NOT: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsAi(i32* nocapture readonly %arr) { entry: %0 = load i32, i32* %arr, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 1 %1 = load i32, i32* %arrayidx1, align 4 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2 %2 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2 %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 3 %3 = load i32, i32* %arrayidx5, align 4 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromDiffMemConsAi ; P9LE-LABEL: fromDiffMemConsAi ; P8BE-LABEL: fromDiffMemConsAi ; P8LE-LABEL: fromDiffMemConsAi ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsDi(i32* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3 %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2 %1 = load i32, i32* %arrayidx1, align 4 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 1 %2 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2 %3 = load i32, i32* %arr, align 4 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromDiffMemConsDi ; P9LE-LABEL: fromDiffMemConsDi ; P8BE-LABEL: fromDiffMemConsDi ; P8LE-LABEL: fromDiffMemConsDi ; P9BE: lxv ; P9BE: lxv ; P9BE: vperm ; P9BE: blr ; P9LE: lxv ; P9LE: lxv ; P9LE: vperm ; 
P9LE: blr ; P8BE: lxvw4x ; P8BE: lxvw4x ; P8BE: vperm ; P8BE: blr ; P8LE: lxvd2x ; P8LE-DAG: lvx ; P8LE: xxswapd ; P8LE: vperm ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarAi(i32* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1 %1 = load i32, i32* %arrayidx2, align 4 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %add4 = add nsw i32 %elem, 2 %idxprom5 = sext i32 %add4 to i64 %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5 %2 = load i32, i32* %arrayidx6, align 4 %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2 %add8 = add nsw i32 %elem, 3 %idxprom9 = sext i32 %add8 to i64 %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9 %3 = load i32, i32* %arrayidx10, align 4 %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3 ret <4 x i32> %vecinit11 ; P9BE-LABEL: fromDiffMemVarAi ; P9LE-LABEL: fromDiffMemVarAi ; P8BE-LABEL: fromDiffMemVarAi ; P8LE-LABEL: fromDiffMemVarAi ; P9BE: sldi r4, r4, 2 ; P9BE: lxvx v2, r3, r4 ; P9BE: blr ; P9LE: sldi r4, r4, 2 ; P9LE: lxvx v2, r3, r4 ; P9LE: blr ; P8BE: sldi r4, r4, 2 ; P8BE: lxvw4x {{[vs0-9]+}}, r3, r4 ; P8BE: blr ; P8LE: sldi r4, r4, 2 ; P8LE: lxvd2x {{[vs0-9]+}}, r3, r4 ; P8LE: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarDi(i32* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1 %1 = load i32, i32* %arrayidx2, align 4 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %sub4 = add nsw i32 %elem, -2 %idxprom5 = sext i32 %sub4 to i64 %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5 %2 = load i32, i32* %arrayidx6, align 4 %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2 %sub8 = add nsw i32 %elem, -3 %idxprom9 = sext i32 %sub8 to i64 %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9 %3 = load i32, i32* %arrayidx10, align 4 %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3 ret <4 x i32> %vecinit11 ; P9BE-LABEL: fromDiffMemVarDi ; P9LE-LABEL: fromDiffMemVarDi ; P8BE-LABEL: fromDiffMemVarDi ; P8LE-LABEL: fromDiffMemVarDi ; P9BE: sldi {{r[0-9]+}}, r4, 2 ; P9BE-DAG: lxvx {{v[0-9]+}} ; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE: sldi {{r[0-9]+}}, r4, 2 ; P9LE-DAG: lxvx {{v[0-9]+}} ; P9LE-DAG: lxvx ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 ; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3 ; P8BE-DAG: lxvw4x ; P8BE: vperm ; P8BE: blr ; P8LE: sldi {{r[0-9]+}}, r4, 2 ; P8LE-DAG: lxvd2x ; P8LE-DAG: lxvd2x ; P8LE: xxswapd ; P8LE: vperm ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromRandMemConsi(i32* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 4 %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 18 %1 = load i32, i32* 
%arrayidx1, align 4 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2 %2 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2 %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 88 %3 = load i32, i32* %arrayidx5, align 4 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromRandMemConsi ; P9LE-LABEL: fromRandMemConsi ; P8BE-LABEL: fromRandMemConsi ; P8LE-LABEL: fromRandMemConsi ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: mtvsrdd ; P9BE: mtvsrdd ; P9BE: vmrgow ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: mtvsrdd ; P9LE: mtvsrdd ; P9LE: vmrgow ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: xxmrghd ; P8BE: xxmrghd ; P8BE: vmrgow ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: xxmrghd ; P8LE: xxmrghd ; P8LE: vmrgow } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromRandMemVari(i32* nocapture readonly %arr, i32 signext %elem) { entry: %add = add nsw i32 %elem, 4 %idxprom = sext i32 %add to i64 %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %add1 = add nsw i32 %elem, 1 %idxprom2 = sext i32 %add1 to i64 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 %idxprom2 %1 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %add5 = add nsw i32 %elem, 2 %idxprom6 = sext i32 %add5 to i64 %arrayidx7 = getelementptr inbounds i32, i32* %arr, i64 %idxprom6 %2 = load i32, i32* %arrayidx7, align 4 %vecinit8 = insertelement <4 x i32> %vecinit4, i32 %2, i32 2 %add9 = add nsw i32 %elem, 8 %idxprom10 = sext i32 %add9 to i64 %arrayidx11 = getelementptr inbounds i32, i32* %arr, i64 %idxprom10 %3 = load i32, i32* %arrayidx11, align 4 %vecinit12 = insertelement <4 x i32> %vecinit8, i32 %3, i32 3 ret <4 x i32> %vecinit12 ; P9BE-LABEL: fromRandMemVari ; P9LE-LABEL: fromRandMemVari ; P8BE-LABEL: fromRandMemVari ; P8LE-LABEL: fromRandMemVari ; P9BE: sldi r4, r4, 2 ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: mtvsrdd ; P9BE: mtvsrdd ; P9BE: vmrgow ; P9LE: sldi r4, r4, 2 ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: mtvsrdd ; P9LE: mtvsrdd ; P9LE: vmrgow ; P8BE: sldi r4, r4, 2 ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: xxmrghd ; P8BE: xxmrghd ; P8BE: vmrgow ; P8LE: sldi r4, r4, 2 ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: xxmrghd ; P8LE: xxmrghd ; P8LE: vmrgow } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltRegVali(i32 signext %val) { entry: %splat.splatinsert = insertelement <4 x i32> undef, i32 %val, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltRegVali ; P9LE-LABEL: spltRegVali ; P8BE-LABEL: spltRegVali ; P8LE-LABEL: spltRegVali ; P9BE: mtvsrws v2, r3 ; P9BE: blr ; P9LE: mtvsrws v2, r3 ; P9LE: blr ; P8BE: mtvsrwz {{[vsf0-9]+}}, r3 ; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8BE: blr ; P8LE: mtvsrwz {{[vsf0-9]+}}, r3 ; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x 
i32> @spltMemVali(i32* nocapture readonly %ptr) { entry: %0 = load i32, i32* %ptr, align 4 %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltMemVali ; P9LE-LABEL: spltMemVali ; P8BE-LABEL: spltMemVali ; P8LE-LABEL: spltMemVali ; P9BE: lxvwsx v2, 0, r3 ; P9BE: blr ; P9LE: lxvwsx v2, 0, r3 ; P9LE: blr ; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3 ; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8BE: blr ; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3 ; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltCnstConvftoi() { entry: ret <4 x i32> ; P9BE-LABEL: spltCnstConvftoi ; P9LE-LABEL: spltCnstConvftoi ; P8BE-LABEL: spltCnstConvftoi ; P8LE-LABEL: spltCnstConvftoi ; P9BE: vspltisw v2, 4 ; P9BE: blr ; P9LE: vspltisw v2, 4 ; P9LE: blr ; P8BE: vspltisw v2, 4 ; P8BE: blr ; P8LE: vspltisw v2, 4 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) { entry: %conv = fptosi float %a to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %conv1 = fptosi float %b to i32 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1 %conv3 = fptosi float %c to i32 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2 %conv5 = fptosi float %d to i32 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromRegsConvftoi ; P9LE-LABEL: fromRegsConvftoi ; P8BE-LABEL: fromRegsConvftoi ; P8LE-LABEL: fromRegsConvftoi ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P9BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P9BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P9BE: vmrgew v2, [[REG3]], [[REG4]] -; P9BE: xvcvspsxws v2, v2 ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 -; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P9LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P9LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P9LE: vmrgew v2, [[REG4]], [[REG3]] -; P9LE: xvcvspsxws v2, v2 ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P8BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P8BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P8BE: vmrgew v2, [[REG3]], [[REG4]] -; P8BE: xvcvspsxws v2, v2 ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 -; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P8LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P8LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P8LE: vmrgew v2, [[REG4]], [[REG3]] -; P8LE: xvcvspsxws v2, v2 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromDiffConstsConvftoi() { entry: ret <4 x i32> ; P9BE-LABEL: fromDiffConstsConvftoi ; P9LE-LABEL: fromDiffConstsConvftoi ; P8BE-LABEL: fromDiffConstsConvftoi ; P8LE-LABEL: 
fromDiffConstsConvftoi ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lvx ; P8LE-NOT: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsAConvftoi(float* nocapture readonly %ptr) { entry: %0 = bitcast float* %ptr to <4 x float>* %1 = load <4 x float>, <4 x float>* %0, align 4 %2 = fptosi <4 x float> %1 to <4 x i32> ret <4 x i32> %2 ; P9BE-LABEL: fromDiffMemConsAConvftoi ; P9LE-LABEL: fromDiffMemConsAConvftoi ; P8BE-LABEL: fromDiffMemConsAConvftoi ; P8LE-LABEL: fromDiffMemConsAConvftoi ; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE: xvcvspsxws v2, [[REG1]] ; P9BE: blr ; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE: xvcvspsxws v2, [[REG1]] ; P9LE: blr ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3 ; P8BE: xvcvspsxws v2, [[REG1]] ; P8BE: blr ; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 ; P8LE: xxswapd ; P8LE: xvcvspsxws v2, v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsDConvftoi(float* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds float, float* %ptr, i64 3 %0 = load float, float* %arrayidx, align 4 %conv = fptosi float %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2 %1 = load float, float* %arrayidx1, align 4 %conv2 = fptosi float %1 to i32 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 1 %2 = load float, float* %arrayidx4, align 4 %conv5 = fptosi float %2 to i32 %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 %3 = load float, float* %ptr, align 4 %conv8 = fptosi float %3 to i32 %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 ret <4 x i32> %vecinit9 ; P9BE-LABEL: fromDiffMemConsDConvftoi ; P9LE-LABEL: fromDiffMemConsDConvftoi ; P8BE-LABEL: fromDiffMemConsDConvftoi ; P8LE-LABEL: fromDiffMemConsDConvftoi ; P9BE: lxv ; P9BE: lxv ; P9BE: vperm ; P9BE: xvcvspsxws ; P9BE: blr ; P9LE: lxv ; P9LE: lxv ; P9LE: vperm ; P9LE: xvcvspsxws ; P9LE: blr ; P8BE: lxvw4x ; P8BE: lxvw4x ; P8BE: vperm ; P8BE: xvcvspsxws ; P8BE: blr ; P8LE: lxvd2x ; P8LE-DAG: lvx ; P8LE: xxswapd ; P8LE: vperm ; P8LE: xvcvspsxws ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarAConvftoi(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptosi float %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptosi float %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %add5 = add nsw i32 %elem, 2 %idxprom6 = sext i32 %add5 to i64 %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6 %2 = load float, float* %arrayidx7, align 4 %conv8 = fptosi float %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %add10 = add nsw i32 %elem, 3 %idxprom11 = sext i32 %add10 to i64 %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11 %3 = load float, float* %arrayidx12, align 4 %conv13 = fptosi float %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarAConvftoi ; 
P9LE-LABEL: fromDiffMemVarAConvftoi ; P8BE-LABEL: fromDiffMemVarAConvftoi ; P8LE-LABEL: fromDiffMemVarAConvftoi ; FIXME: implement finding consecutive loads with pre-inc ; P9BE: lfsux ; P9LE: lfsux ; P8BE: lfsux ; P8LE: lfsux } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarDConvftoi(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptosi float %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptosi float %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %sub5 = add nsw i32 %elem, -2 %idxprom6 = sext i32 %sub5 to i64 %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6 %2 = load float, float* %arrayidx7, align 4 %conv8 = fptosi float %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %sub10 = add nsw i32 %elem, -3 %idxprom11 = sext i32 %sub10 to i64 %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11 %3 = load float, float* %arrayidx12, align 4 %conv13 = fptosi float %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarDConvftoi ; P9LE-LABEL: fromDiffMemVarDConvftoi ; P8BE-LABEL: fromDiffMemVarDConvftoi ; P8LE-LABEL: fromDiffMemVarDConvftoi ; FIXME: implement finding consecutive loads with pre-inc ; P9BE: lfsux ; P9LE: lfsux ; P8BE: lfsux ; P8LE: lfsux } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltRegValConvftoi(float %val) { entry: %conv = fptosi float %val to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltRegValConvftoi ; P9LE-LABEL: spltRegValConvftoi ; P8BE-LABEL: spltRegValConvftoi ; P8LE-LABEL: spltRegValConvftoi ; P9BE: xscvdpsxws f[[REG1:[0-9]+]], f1 ; P9BE: xxspltw v2, vs[[REG1]], 1 ; P9BE: blr ; P9LE: xscvdpsxws f[[REG1:[0-9]+]], f1 ; P9LE: xxspltw v2, vs[[REG1]], 1 ; P9LE: blr ; P8BE: xscvdpsxws f[[REG1:[0-9]+]], f1 ; P8BE: xxspltw v2, vs[[REG1]], 1 ; P8BE: blr ; P8LE: xscvdpsxws f[[REG1:[0-9]+]], f1 ; P8LE: xxspltw v2, vs[[REG1]], 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @spltMemValConvftoi(float* nocapture readonly %ptr) { entry: %0 = load float, float* %ptr, align 4 %conv = fptosi float %0 to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltMemValConvftoi ; P9LE-LABEL: spltMemValConvftoi ; P8BE-LABEL: spltMemValConvftoi ; P8LE-LABEL: spltMemValConvftoi ; P9BE: lxvwsx [[REG1:[vs0-9]+]], 0, r3 ; P9BE: xvcvspsxws v2, [[REG1]] ; P9LE: [[REG1:[vs0-9]+]], 0, r3 ; P9LE: xvcvspsxws v2, [[REG1]] ; P8BE: lfsx [[REG1:f[0-9]+]], 0, r3 ; P8BE: xscvdpsxws f[[REG2:[0-9]+]], [[REG1]] ; P8BE: xxspltw v2, vs[[REG2]], 1 ; P8LE: lfsx [[REG1:f[0-9]+]], 0, r3 ; P8LE: xscvdpsxws f[[REG2:[vs0-9]+]], [[REG1]] ; P8LE: xxspltw v2, vs[[REG2]], 1 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltCnstConvdtoi() { entry: ret <4 x i32> ; 
P9BE-LABEL: spltCnstConvdtoi ; P9LE-LABEL: spltCnstConvdtoi ; P8BE-LABEL: spltCnstConvdtoi ; P8LE-LABEL: spltCnstConvdtoi ; P9BE: vspltisw v2, 4 ; P9BE: blr ; P9LE: vspltisw v2, 4 ; P9LE: blr ; P8BE: vspltisw v2, 4 ; P8BE: blr ; P8LE: vspltisw v2, 4 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) { entry: %conv = fptosi double %a to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %conv1 = fptosi double %b to i32 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1 %conv3 = fptosi double %c to i32 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2 %conv5 = fptosi double %d to i32 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromRegsConvdtoi ; P9LE-LABEL: fromRegsConvdtoi ; P8BE-LABEL: fromRegsConvdtoi ; P8LE-LABEL: fromRegsConvdtoi ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P9BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P9BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P9BE: vmrgew v2, [[REG3]], [[REG4]] -; P9BE: xvcvspsxws v2, v2 ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 -; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P9LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P9LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P9LE: vmrgew v2, [[REG4]], [[REG3]] -; P9LE: xvcvspsxws v2, v2 ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P8BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P8BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P8BE: vmrgew v2, [[REG3]], [[REG4]] -; P8BE: xvcvspsxws v2, v2 ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 -; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P8LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P8LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P8LE: vmrgew v2, [[REG4]], [[REG3]] -; P8LE: xvcvspsxws v2, v2 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromDiffConstsConvdtoi() { entry: ret <4 x i32> ; P9BE-LABEL: fromDiffConstsConvdtoi ; P9LE-LABEL: fromDiffConstsConvdtoi ; P8BE-LABEL: fromDiffConstsConvdtoi ; P8LE-LABEL: fromDiffConstsConvdtoi ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lvx ; P8LE-NOT: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsAConvdtoi(double* nocapture readonly %ptr) { entry: %0 = bitcast double* %ptr to <2 x double>* %1 = load <2 x double>, <2 x double>* %0, align 8 %2 = fptosi <2 x double> %1 to <2 x i32> %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 2 %3 = bitcast double* %arrayidx4 to <2 x double>* %4 = load <2 x double>, <2 x double>* %3, align 8 %5 = fptosi <2 x double> %4 to <2 x i32> %vecinit9 = shufflevector <2 x i32> %2, <2 x i32> %5, <4 x i32> ret <4 x i32> %vecinit9 ; P9BE-LABEL: fromDiffMemConsAConvdtoi ; 
P9LE-LABEL: fromDiffMemConsAConvdtoi ; P8BE-LABEL: fromDiffMemConsAConvdtoi ; P8LE-LABEL: fromDiffMemConsAConvdtoi ; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] -; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] -; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] +; P9BE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]] +; P9BE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] -; P9BE: xvcvspsxws v2, v2 ; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] -; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] -; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] +; P9LE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]] +; P9LE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]] ; P9LE: vmrgew v2, [[REG6]], [[REG5]] -; P9LE: xvcvspsxws v2, v2 ; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 ; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 ; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] -; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] -; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] +; P8BE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]] +; P8BE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]] ; P8BE: vmrgew v2, [[REG6]], [[REG5]] -; P8BE: xvcvspsxws v2, v2 ; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 ; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 ; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]] ; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]] ; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]] ; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]] -; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]] -; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]] +; P8LE-DAG: xvcvdpsxws [[REG7:[vs0-9]+]], [[REG5]] +; P8LE-DAG: xvcvdpsxws [[REG8:[vs0-9]+]], [[REG6]] ; P8LE: vmrgew v2, [[REG8]], [[REG7]] -; P8LE: xvcvspsxws v2, v2 } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsDConvdtoi(double* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds double, double* %ptr, i64 3 %0 = load double, double* %arrayidx, align 8 %conv = fptosi double %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2 %1 = load double, double* %arrayidx1, align 8 %conv2 = fptosi double %1 to i32 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 1 %2 = load double, double* %arrayidx4, align 8 %conv5 = fptosi double %2 to i32 %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 %3 = load double, double* %ptr, align 8 %conv8 = fptosi double %3 to i32 %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 ret <4 x i32> %vecinit9 ; P9BE-LABEL: fromDiffMemConsDConvdtoi ; P9LE-LABEL: fromDiffMemConsDConvdtoi ; P8BE-LABEL: fromDiffMemConsDConvdtoi ; P8LE-LABEL: fromDiffMemConsDConvdtoi ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: xxmrghd ; P9BE: xxmrghd -; P9BE: xvcvdpsp -; P9BE: xvcvdpsp -; P9BE: vmrgew -; P9BE: xvcvspsxws v2 +; P9BE: xvcvdpsxws +; P9BE: xvcvdpsxws +; P9BE: vmrgew v2 ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: xxmrghd ; P9LE: xxmrghd -; P9LE: xvcvdpsp -; P9LE: xvcvdpsp -; P9LE: vmrgew -; P9LE: xvcvspsxws v2 +; P9LE: xvcvdpsxws +; P9LE: xvcvdpsxws 
+; P9LE: vmrgew v2 ; P8BE: lfdx ; P8BE: lfd ; P8BE: lfd ; P8BE: lfd ; P8BE: xxmrghd ; P8BE: xxmrghd -; P8BE: xvcvdpsp -; P8BE: xvcvdpsp -; P8BE: vmrgew -; P8BE: xvcvspsxws v2 +; P8BE: xvcvdpsxws +; P8BE: xvcvdpsxws +; P8BE: vmrgew v2 ; P8LE: lfdx ; P8LE: lfd ; P8LE: lfd ; P8LE: lfd ; P8LE: xxmrghd ; P8LE: xxmrghd -; P8LE: xvcvdpsp -; P8LE: xvcvdpsp -; P8LE: vmrgew -; P8LE: xvcvspsxws v2 +; P8LE: xvcvdpsxws +; P8LE: xvcvdpsxws +; P8LE: vmrgew v2 } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarAConvdtoi(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptosi double %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptosi double %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %add5 = add nsw i32 %elem, 2 %idxprom6 = sext i32 %add5 to i64 %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6 %2 = load double, double* %arrayidx7, align 8 %conv8 = fptosi double %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %add10 = add nsw i32 %elem, 3 %idxprom11 = sext i32 %add10 to i64 %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11 %3 = load double, double* %arrayidx12, align 8 %conv13 = fptosi double %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarAConvdtoi ; P9LE-LABEL: fromDiffMemVarAConvdtoi ; P8BE-LABEL: fromDiffMemVarAConvdtoi ; P8LE-LABEL: fromDiffMemVarAConvdtoi ; P9BE: lfdux ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: xxmrghd ; P9BE: xxmrghd -; P9BE: xvcvdpsp -; P9BE: xvcvdpsp -; P9BE: vmrgew -; P9BE: xvcvspsxws v2 +; P9BE: xvcvdpsxws +; P9BE: xvcvdpsxws +; P9BE: vmrgew v2 ; P9LE: lfdux ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: xxmrghd ; P9LE: xxmrghd -; P9LE: xvcvdpsp -; P9LE: xvcvdpsp -; P9LE: vmrgew -; P9LE: xvcvspsxws v2 +; P9LE: xvcvdpsxws +; P9LE: xvcvdpsxws +; P9LE: vmrgew v2 ; P8BE: lfdux ; P8BE: lfd ; P8BE: lfd ; P8BE: lfd ; P8BE: xxmrghd ; P8BE: xxmrghd -; P8BE: xvcvdpsp -; P8BE: xvcvdpsp -; P8BE: vmrgew -; P8BE: xvcvspsxws v2 +; P8BE: xvcvdpsxws +; P8BE: xvcvdpsxws +; P8BE: vmrgew v2 ; P8LE: lfdux ; P8LE: lfd ; P8LE: lfd ; P8LE: lfd ; P8LE: xxmrghd ; P8LE: xxmrghd -; P8LE: xvcvdpsp -; P8LE: xvcvdpsp -; P8LE: vmrgew -; P8LE: xvcvspsxws v2 +; P8LE: xvcvdpsxws +; P8LE: xvcvdpsxws +; P8LE: vmrgew v2 } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarDConvdtoi(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptosi double %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptosi double %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %sub5 = add nsw i32 %elem, -2 %idxprom6 = sext i32 %sub5 to i64 %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6 %2 = load double, double* 
%arrayidx7, align 8 %conv8 = fptosi double %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %sub10 = add nsw i32 %elem, -3 %idxprom11 = sext i32 %sub10 to i64 %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11 %3 = load double, double* %arrayidx12, align 8 %conv13 = fptosi double %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarDConvdtoi ; P9LE-LABEL: fromDiffMemVarDConvdtoi ; P8BE-LABEL: fromDiffMemVarDConvdtoi ; P8LE-LABEL: fromDiffMemVarDConvdtoi ; P9BE: lfdux ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: xxmrghd ; P9BE: xxmrghd -; P9BE: xvcvdpsp -; P9BE: xvcvdpsp -; P9BE: vmrgew -; P9BE: xvcvspsxws v2 +; P9BE: xvcvdpsxws +; P9BE: xvcvdpsxws +; P9BE: vmrgew v2 ; P9LE: lfdux ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: xxmrghd ; P9LE: xxmrghd -; P9LE: xvcvdpsp -; P9LE: xvcvdpsp -; P9LE: vmrgew -; P9LE: xvcvspsxws v2 +; P9LE: xvcvdpsxws +; P9LE: xvcvdpsxws +; P9LE: vmrgew v2 ; P8BE: lfdux ; P8BE: lfd ; P8BE: lfd ; P8BE: lfd ; P8BE: xxmrghd ; P8BE: xxmrghd -; P8BE: xvcvdpsp -; P8BE: xvcvdpsp -; P8BE: vmrgew -; P8BE: xvcvspsxws v2 +; P8BE: xvcvdpsxws +; P8BE: xvcvdpsxws +; P8BE: vmrgew v2 ; P8LE: lfdux ; P8LE: lfd ; P8LE: lfd ; P8LE: lfd ; P8LE: xxmrghd ; P8LE: xxmrghd -; P8LE: xvcvdpsp -; P8LE: xvcvdpsp -; P8LE: vmrgew -; P8LE: xvcvspsxws v2 +; P8LE: xvcvdpsxws +; P8LE: xvcvdpsxws +; P8LE: vmrgew v2 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltRegValConvdtoi(double %val) { entry: %conv = fptosi double %val to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltRegValConvdtoi ; P9LE-LABEL: spltRegValConvdtoi ; P8BE-LABEL: spltRegValConvdtoi ; P8LE-LABEL: spltRegValConvdtoi ; P9BE: xscvdpsxws ; P9BE: xxspltw ; P9BE: blr ; P9LE: xscvdpsxws ; P9LE: xxspltw ; P9LE: blr ; P8BE: xscvdpsxws ; P8BE: xxspltw ; P8BE: blr ; P8LE: xscvdpsxws ; P8LE: xxspltw ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @spltMemValConvdtoi(double* nocapture readonly %ptr) { entry: %0 = load double, double* %ptr, align 8 %conv = fptosi double %0 to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltMemValConvdtoi ; P9LE-LABEL: spltMemValConvdtoi ; P8BE-LABEL: spltMemValConvdtoi ; P8LE-LABEL: spltMemValConvdtoi ; P9BE: lfd ; P9BE: xscvdpsxws ; P9BE: xxspltw ; P9BE: blr ; P9LE: lfd ; P9LE: xscvdpsxws ; P9LE: xxspltw ; P9LE: blr ; P8BE: lfdx ; P8BE: xscvdpsxws ; P8BE: xxspltw ; P8BE: blr ; P8LE: lfdx ; P8LE: xscvdpsxws ; P8LE: xxspltw ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @allZeroui() { entry: ret <4 x i32> zeroinitializer ; P9BE-LABEL: allZeroui ; P9LE-LABEL: allZeroui ; P8BE-LABEL: allZeroui ; P8LE-LABEL: allZeroui ; P9BE: xxlxor v2, v2, v2 ; P9BE: blr ; P9LE: xxlxor v2, v2, v2 ; P9LE: blr ; P8BE: xxlxor v2, v2, v2 ; P8BE: blr ; P8LE: xxlxor v2, v2, v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @allOneui() { entry: ret <4 x i32> ; P9BE-LABEL: allOneui ; P9LE-LABEL: allOneui ; P8BE-LABEL: allOneui ; P8LE-LABEL: allOneui ; P9BE: xxspltib v2, 255 ; P9BE: blr ; P9LE: xxspltib v2, 255 ; P9LE: blr ; P8BE: vspltisb v2, -1 ; P8BE: blr ; P8LE: 
vspltisb v2, -1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltConst1ui() { entry: ret <4 x i32> ; P9BE-LABEL: spltConst1ui ; P9LE-LABEL: spltConst1ui ; P8BE-LABEL: spltConst1ui ; P8LE-LABEL: spltConst1ui ; P9BE: vspltisw v2, 1 ; P9BE: blr ; P9LE: vspltisw v2, 1 ; P9LE: blr ; P8BE: vspltisw v2, 1 ; P8BE: blr ; P8LE: vspltisw v2, 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltConst16kui() { entry: ret <4 x i32> ; P9BE-LABEL: spltConst16kui ; P9LE-LABEL: spltConst16kui ; P8BE-LABEL: spltConst16kui ; P8LE-LABEL: spltConst16kui ; P9BE: vspltisw v2, -15 ; P9BE: vsrw v2, v2, v2 ; P9BE: blr ; P9LE: vspltisw v2, -15 ; P9LE: vsrw v2, v2, v2 ; P9LE: blr ; P8BE: vspltisw v2, -15 ; P8BE: vsrw v2, v2, v2 ; P8BE: blr ; P8LE: vspltisw v2, -15 ; P8LE: vsrw v2, v2, v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltConst32kui() { entry: ret <4 x i32> ; P9BE-LABEL: spltConst32kui ; P9LE-LABEL: spltConst32kui ; P8BE-LABEL: spltConst32kui ; P8LE-LABEL: spltConst32kui ; P9BE: vspltisw v2, -16 ; P9BE: vsrw v2, v2, v2 ; P9BE: blr ; P9LE: vspltisw v2, -16 ; P9LE: vsrw v2, v2, v2 ; P9LE: blr ; P8BE: vspltisw v2, -16 ; P8BE: vsrw v2, v2, v2 ; P8BE: blr ; P8LE: vspltisw v2, -16 ; P8LE: vsrw v2, v2, v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromRegsui(i32 zeroext %a, i32 zeroext %b, i32 zeroext %c, i32 zeroext %d) { entry: %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %c, i32 2 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %d, i32 3 ret <4 x i32> %vecinit3 ; P9BE-LABEL: fromRegsui ; P9LE-LABEL: fromRegsui ; P8BE-LABEL: fromRegsui ; P8LE-LABEL: fromRegsui ; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5 ; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6 ; P9BE: vmrgow v2, [[REG1]], [[REG2]] ; P9BE: blr ; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3 ; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4 ; P9LE: vmrgow v2, [[REG2]], [[REG1]] ; P9LE: blr ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3 ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4 ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5 ; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6 ; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]] ; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]] ; P8BE: vmrgow v2, [[REG5]], [[REG6]] ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3 ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4 ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5 ; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6 ; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]] ; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]] ; P8LE: vmrgow v2, [[REG6]], [[REG5]] } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromDiffConstsui() { entry: ret <4 x i32> ; P9BE-LABEL: fromDiffConstsui ; P9LE-LABEL: fromDiffConstsui ; P8BE-LABEL: fromDiffConstsui ; P8LE-LABEL: fromDiffConstsui ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lvx ; P8LE-NOT: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsAui(i32* nocapture readonly %arr) { entry: %0 = load i32, i32* %arr, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 1 %1 = load i32, i32* %arrayidx1, align 4 %vecinit2 = 
insertelement <4 x i32> %vecinit, i32 %1, i32 1 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2 %2 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2 %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 3 %3 = load i32, i32* %arrayidx5, align 4 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromDiffMemConsAui ; P9LE-LABEL: fromDiffMemConsAui ; P8BE-LABEL: fromDiffMemConsAui ; P8LE-LABEL: fromDiffMemConsAui ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsDui(i32* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3 %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2 %1 = load i32, i32* %arrayidx1, align 4 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 1 %2 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2 %3 = load i32, i32* %arr, align 4 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromDiffMemConsDui ; P9LE-LABEL: fromDiffMemConsDui ; P8BE-LABEL: fromDiffMemConsDui ; P8LE-LABEL: fromDiffMemConsDui ; P9BE: lxv ; P9BE: lxv ; P9BE: vperm ; P9BE: blr ; P9LE: lxv ; P9LE: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: lxvw4x ; P8BE: lxvw4x ; P8BE: vperm ; P8BE: blr ; P8LE: lxvd2x ; P8LE-DAG: lvx ; P8LE-NOT: xxswapd ; P8LE: xxswapd ; P8LE: vperm ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarAui(i32* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1 %1 = load i32, i32* %arrayidx2, align 4 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %add4 = add nsw i32 %elem, 2 %idxprom5 = sext i32 %add4 to i64 %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5 %2 = load i32, i32* %arrayidx6, align 4 %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2 %add8 = add nsw i32 %elem, 3 %idxprom9 = sext i32 %add8 to i64 %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9 %3 = load i32, i32* %arrayidx10, align 4 %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3 ret <4 x i32> %vecinit11 ; P9BE-LABEL: fromDiffMemVarAui ; P9LE-LABEL: fromDiffMemVarAui ; P8BE-LABEL: fromDiffMemVarAui ; P8LE-LABEL: fromDiffMemVarAui ; P9BE: sldi r4, r4, 2 ; P9BE: lxvx v2, r3, r4 ; P9BE: blr ; P9LE: sldi r4, r4, 2 ; P9LE: lxvx v2, r3, r4 ; P9LE: blr ; P8BE: sldi r4, r4, 2 ; P8BE: lxvw4x {{[vs0-9]+}}, r3, r4 ; P8BE: blr ; P8LE: sldi r4, r4, 2 ; P8LE: lxvd2x {{[vs0-9]+}}, r3, r4 ; P8LE: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarDui(i32* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %sub = add nsw i32 %elem, -1 
%idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1 %1 = load i32, i32* %arrayidx2, align 4 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %sub4 = add nsw i32 %elem, -2 %idxprom5 = sext i32 %sub4 to i64 %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5 %2 = load i32, i32* %arrayidx6, align 4 %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2 %sub8 = add nsw i32 %elem, -3 %idxprom9 = sext i32 %sub8 to i64 %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9 %3 = load i32, i32* %arrayidx10, align 4 %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3 ret <4 x i32> %vecinit11 ; P9BE-LABEL: fromDiffMemVarDui ; P9LE-LABEL: fromDiffMemVarDui ; P8BE-LABEL: fromDiffMemVarDui ; P8LE-LABEL: fromDiffMemVarDui ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2 ; P9BE-DAG: addi r3, r3, -12 ; P9BE-DAG: lxvx {{v[0-9]+}}, 0, r3 ; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2 ; P9LE-DAG: addi r3, r3, -12 ; P9LE-DAG: lxvx {{v[0-9]+}}, 0, r3 ; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr ; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2 ; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3 ; P8BE-DAG: lxvw4x ; P8BE: vperm ; P8BE: blr ; P8LE-DAG: sldi {{r[0-9]+}}, r4, 2 ; P8LE-DAG: lvx ; P8LE-DAG: lvx ; P8LE: vperm ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromRandMemConsui(i32* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 4 %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 18 %1 = load i32, i32* %arrayidx1, align 4 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2 %2 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2 %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 88 %3 = load i32, i32* %arrayidx5, align 4 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromRandMemConsui ; P9LE-LABEL: fromRandMemConsui ; P8BE-LABEL: fromRandMemConsui ; P8LE-LABEL: fromRandMemConsui ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: mtvsrdd ; P9BE: mtvsrdd ; P9BE: vmrgow ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: mtvsrdd ; P9LE: mtvsrdd ; P9LE: vmrgow ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: xxmrghd ; P8BE: xxmrghd ; P8BE: vmrgow ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: xxmrghd ; P8LE: xxmrghd ; P8LE: vmrgow } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromRandMemVarui(i32* nocapture readonly %arr, i32 signext %elem) { entry: %add = add nsw i32 %elem, 4 %idxprom = sext i32 %add to i64 %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom %0 = load i32, i32* %arrayidx, align 4 %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0 %add1 = add nsw i32 %elem, 1 %idxprom2 = sext i32 %add1 to i64 %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 %idxprom2 %1 = load i32, i32* %arrayidx3, align 4 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %1, i32 1 %add5 = add nsw i32 %elem, 2 %idxprom6 = sext i32 %add5 to i64 %arrayidx7 = getelementptr inbounds i32, i32* %arr, i64 %idxprom6 %2 = load i32, i32* %arrayidx7, align 4 %vecinit8 = insertelement <4 x i32> %vecinit4, i32 %2, i32 2 %add9 
= add nsw i32 %elem, 8 %idxprom10 = sext i32 %add9 to i64 %arrayidx11 = getelementptr inbounds i32, i32* %arr, i64 %idxprom10 %3 = load i32, i32* %arrayidx11, align 4 %vecinit12 = insertelement <4 x i32> %vecinit8, i32 %3, i32 3 ret <4 x i32> %vecinit12 ; P9BE-LABEL: fromRandMemVarui ; P9LE-LABEL: fromRandMemVarui ; P8BE-LABEL: fromRandMemVarui ; P8LE-LABEL: fromRandMemVarui ; P9BE: sldi r4, r4, 2 ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: lwz ; P9BE: mtvsrdd ; P9BE: mtvsrdd ; P9BE: vmrgow ; P9LE: sldi r4, r4, 2 ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: lwz ; P9LE: mtvsrdd ; P9LE: mtvsrdd ; P9LE: vmrgow ; P8BE: sldi r4, r4, 2 ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: lwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: mtvsrwz ; P8BE: xxmrghd ; P8BE: xxmrghd ; P8BE: vmrgow ; P8LE: sldi r4, r4, 2 ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: lwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: mtvsrwz ; P8LE: xxmrghd ; P8LE: xxmrghd ; P8LE: vmrgow } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltRegValui(i32 zeroext %val) { entry: %splat.splatinsert = insertelement <4 x i32> undef, i32 %val, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltRegValui ; P9LE-LABEL: spltRegValui ; P8BE-LABEL: spltRegValui ; P8LE-LABEL: spltRegValui ; P9BE: mtvsrws v2, r3 ; P9BE: blr ; P9LE: mtvsrws v2, r3 ; P9LE: blr ; P8BE: mtvsrwz {{[vsf0-9]+}}, r3 ; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8BE: blr ; P8LE: mtvsrwz {{[vsf0-9]+}}, r3 ; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @spltMemValui(i32* nocapture readonly %ptr) { entry: %0 = load i32, i32* %ptr, align 4 %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltMemValui ; P9LE-LABEL: spltMemValui ; P8BE-LABEL: spltMemValui ; P8LE-LABEL: spltMemValui ; P9BE: lxvwsx v2, 0, r3 ; P9BE: blr ; P9LE: lxvwsx v2, 0, r3 ; P9LE: blr ; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3 ; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8BE: blr ; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3 ; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltCnstConvftoui() { entry: ret <4 x i32> ; P9BE-LABEL: spltCnstConvftoui ; P9LE-LABEL: spltCnstConvftoui ; P8BE-LABEL: spltCnstConvftoui ; P8LE-LABEL: spltCnstConvftoui ; P9BE: vspltisw v2, 4 ; P9BE: blr ; P9LE: vspltisw v2, 4 ; P9LE: blr ; P8BE: vspltisw v2, 4 ; P8BE: blr ; P8LE: vspltisw v2, 4 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) { entry: %conv = fptoui float %a to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %conv1 = fptoui float %b to i32 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1 %conv3 = fptoui float %c to i32 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2 %conv5 = fptoui float %d to i32 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromRegsConvftoui ; P9LE-LABEL: fromRegsConvftoui ; P8BE-LABEL: fromRegsConvftoui ; P8LE-LABEL: fromRegsConvftoui ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9BE-DAG: 
xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9BE: vmrgew v2, [[REG3]], [[REG4]]
-; P9BE: xvcvspuxws v2, v2
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9LE: vmrgew v2, [[REG4]], [[REG3]]
-; P9LE: xvcvspuxws v2, v2
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8BE: vmrgew v2, [[REG3]], [[REG4]]
-; P8BE: xvcvspuxws v2, v2
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8LE: vmrgew v2, [[REG4]], [[REG3]]
-; P8LE: xvcvspuxws v2, v2
}
; Function Attrs: norecurse nounwind readnone
define <4 x i32> @fromDiffConstsConvftoui() {
entry:
  ret <4 x i32>
; P9BE-LABEL: fromDiffConstsConvftoui
; P9LE-LABEL: fromDiffConstsConvftoui
; P8BE-LABEL: fromDiffConstsConvftoui
; P8LE-LABEL: fromDiffConstsConvftoui
; P9BE: lxv
; P9BE: blr
; P9LE: lxv
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
; P8LE: lvx
; P8LE-NOT: xxswapd
; P8LE: blr
}
; Function Attrs: norecurse nounwind readonly
define <4 x i32> @fromDiffMemConsAConvftoui(float* nocapture readonly %ptr) {
entry:
  %0 = bitcast float* %ptr to <4 x float>*
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = fptoui <4 x float> %1 to <4 x i32>
  ret <4 x i32> %2
; P9BE-LABEL: fromDiffMemConsAConvftoui
; P9LE-LABEL: fromDiffMemConsAConvftoui
; P8BE-LABEL: fromDiffMemConsAConvftoui
; P8LE-LABEL: fromDiffMemConsAConvftoui
; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3)
; P9BE: xvcvspuxws v2, [[REG1]]
; P9BE: blr
; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3)
; P9LE: xvcvspuxws v2, [[REG1]]
; P9LE: blr
; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3
; P8BE: xvcvspuxws v2, [[REG1]]
; P8BE: blr
; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
; P8LE: xxswapd v2, [[REG1]]
; P8LE: xvcvspuxws v2, v2
; P8LE: blr
}
; Function Attrs: norecurse nounwind readonly
define <4 x i32> @fromDiffMemConsDConvftoui(float* nocapture readonly %ptr) {
entry:
  %arrayidx = getelementptr inbounds float, float* %ptr, i64 3
  %0 = load float, float* %arrayidx, align 4
  %conv = fptoui float %0 to i32
  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2
  %1 = load float, float* %arrayidx1, align 4
  %conv2 = fptoui float %1 to i32
  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 1
  %2 = load float, float* %arrayidx4, align 4
  %conv5 = fptoui float %2 to i32
  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
  %3 = load float, float* %ptr, align 4
  %conv8 = fptoui float %3 to i32
  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
  ret <4 x i32> %vecinit9
; P9BE-LABEL:
fromDiffMemConsDConvftoui ; P9LE-LABEL: fromDiffMemConsDConvftoui ; P8BE-LABEL: fromDiffMemConsDConvftoui ; P8LE-LABEL: fromDiffMemConsDConvftoui ; P9BE: lxv ; P9BE: lxv ; P9BE: vperm ; P9BE: xvcvspuxws ; P9BE: blr ; P9LE: lxv ; P9LE: lxv ; P9LE: vperm ; P9LE: xvcvspuxws ; P9LE: blr ; P8BE: lxvw4x ; P8BE: lxvw4x ; P8BE: vperm ; P8BE: xvcvspuxws ; P8BE: blr ; P8LE-DAG: lxvd2x ; P8LE-DAG: lvx ; P8LE: xxswapd ; P8LE: vperm ; P8LE: xvcvspuxws ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarAConvftoui(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptoui float %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptoui float %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %add5 = add nsw i32 %elem, 2 %idxprom6 = sext i32 %add5 to i64 %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6 %2 = load float, float* %arrayidx7, align 4 %conv8 = fptoui float %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %add10 = add nsw i32 %elem, 3 %idxprom11 = sext i32 %add10 to i64 %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11 %3 = load float, float* %arrayidx12, align 4 %conv13 = fptoui float %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarAConvftoui ; P9LE-LABEL: fromDiffMemVarAConvftoui ; P8BE-LABEL: fromDiffMemVarAConvftoui ; P8LE-LABEL: fromDiffMemVarAConvftoui ; FIXME: implement finding consecutive loads with pre-inc ; P9BE: lfsux ; P9LE: lfsux ; P8BE: lfsux ; P8LE: lfsux } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarDConvftoui(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptoui float %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptoui float %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %sub5 = add nsw i32 %elem, -2 %idxprom6 = sext i32 %sub5 to i64 %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6 %2 = load float, float* %arrayidx7, align 4 %conv8 = fptoui float %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %sub10 = add nsw i32 %elem, -3 %idxprom11 = sext i32 %sub10 to i64 %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11 %3 = load float, float* %arrayidx12, align 4 %conv13 = fptoui float %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarDConvftoui ; P9LE-LABEL: fromDiffMemVarDConvftoui ; P8BE-LABEL: fromDiffMemVarDConvftoui ; P8LE-LABEL: fromDiffMemVarDConvftoui ; FIXME: implement finding consecutive loads with pre-inc ; P9BE: lfsux ; P9LE: lfsux ; P8BE: lfsux ; P8LE: lfsux } ; Function Attrs: norecurse nounwind readnone define <4 x i32> 
@spltRegValConvftoui(float %val) { entry: %conv = fptoui float %val to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltRegValConvftoui ; P9LE-LABEL: spltRegValConvftoui ; P8BE-LABEL: spltRegValConvftoui ; P8LE-LABEL: spltRegValConvftoui ; P9BE: xscvdpuxws f[[REG1:[0-9]+]], f1 ; P9BE: xxspltw v2, vs[[REG1]], 1 ; P9BE: blr ; P9LE: xscvdpuxws f[[REG1:[0-9]+]], f1 ; P9LE: xxspltw v2, vs[[REG1]], 1 ; P9LE: blr ; P8BE: xscvdpuxws f[[REG1:[0-9]+]], f1 ; P8BE: xxspltw v2, vs[[REG1]], 1 ; P8BE: blr ; P8LE: xscvdpuxws f[[REG1:[0-9]+]], f1 ; P8LE: xxspltw v2, vs[[REG1]], 1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @spltMemValConvftoui(float* nocapture readonly %ptr) { entry: %0 = load float, float* %ptr, align 4 %conv = fptoui float %0 to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltMemValConvftoui ; P9LE-LABEL: spltMemValConvftoui ; P8BE-LABEL: spltMemValConvftoui ; P8LE-LABEL: spltMemValConvftoui ; P9BE: lxvwsx [[REG1:[vs0-9]+]], 0, r3 ; P9BE: xvcvspuxws v2, [[REG1]] ; P9LE: [[REG1:[vs0-9]+]], 0, r3 ; P9LE: xvcvspuxws v2, [[REG1]] ; P8BE: lfsx [[REG1:f[0-9]+]], 0, r3 ; P8BE: xscvdpuxws f[[REG2:[0-9]+]], [[REG1]] ; P8BE: xxspltw v2, vs[[REG2]], 1 ; P8LE: lfsx [[REG1:f[0-9]+]], 0, r3 ; P8LE: xscvdpuxws f[[REG2:[vs0-9]+]], [[REG1]] ; P8LE: xxspltw v2, vs[[REG2]], 1 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltCnstConvdtoui() { entry: ret <4 x i32> ; P9BE-LABEL: spltCnstConvdtoui ; P9LE-LABEL: spltCnstConvdtoui ; P8BE-LABEL: spltCnstConvdtoui ; P8LE-LABEL: spltCnstConvdtoui ; P9BE: vspltisw v2, 4 ; P9BE: blr ; P9LE: vspltisw v2, 4 ; P9LE: blr ; P8BE: vspltisw v2, 4 ; P8BE: blr ; P8LE: vspltisw v2, 4 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) { entry: %conv = fptoui double %a to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %conv1 = fptoui double %b to i32 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1 %conv3 = fptoui double %c to i32 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2 %conv5 = fptoui double %d to i32 %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3 ret <4 x i32> %vecinit6 ; P9BE-LABEL: fromRegsConvdtoui ; P9LE-LABEL: fromRegsConvdtoui ; P8BE-LABEL: fromRegsConvdtoui ; P8LE-LABEL: fromRegsConvdtoui ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P9BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P9BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P9BE: vmrgew v2, [[REG3]], [[REG4]] -; P9BE: xvcvspuxws v2, v2 ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 -; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P9LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P9LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P9LE: vmrgew v2, [[REG4]], [[REG3]] -; P9LE: xvcvspuxws v2, v2 ; P8BE-DAG: xxmrghd 
{{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 -; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P8BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P8BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P8BE: vmrgew v2, [[REG3]], [[REG4]] -; P8BE: xvcvspuxws v2, v2 ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 -; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] -; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] +; P8LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +; P8LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ; P8LE: vmrgew v2, [[REG4]], [[REG3]] -; P8LE: xvcvspuxws v2, v2 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @fromDiffConstsConvdtoui() { entry: ret <4 x i32> ; P9BE-LABEL: fromDiffConstsConvdtoui ; P9LE-LABEL: fromDiffConstsConvdtoui ; P8BE-LABEL: fromDiffConstsConvdtoui ; P8LE-LABEL: fromDiffConstsConvdtoui ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr ; P8LE: lvx ; P8LE-NOT: xxswapd ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsAConvdtoui(double* nocapture readonly %ptr) { entry: %0 = bitcast double* %ptr to <2 x double>* %1 = load <2 x double>, <2 x double>* %0, align 8 %2 = fptoui <2 x double> %1 to <2 x i32> %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 2 %3 = bitcast double* %arrayidx4 to <2 x double>* %4 = load <2 x double>, <2 x double>* %3, align 8 %5 = fptoui <2 x double> %4 to <2 x i32> %vecinit9 = shufflevector <2 x i32> %2, <2 x i32> %5, <4 x i32> ret <4 x i32> %vecinit9 ; P9BE-LABEL: fromDiffMemConsAConvdtoui ; P9LE-LABEL: fromDiffMemConsAConvdtoui ; P8BE-LABEL: fromDiffMemConsAConvdtoui ; P8LE-LABEL: fromDiffMemConsAConvdtoui ; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] -; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] -; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] +; P9BE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]] +; P9BE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] -; P9BE: xvcvspuxws v2, v2 ; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) -; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] -; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] -; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] +; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] +; P9LE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]] +; P9LE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]] ; P9LE: vmrgew v2, [[REG6]], [[REG5]] -; P9LE: xvcvspuxws v2, v2 ; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 ; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 ; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] -; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] -; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] +; P8BE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]] +; P8BE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]] ; P8BE: vmrgew v2, [[REG6]], [[REG5]] -; P8BE: xvcvspuxws v2, v2 ; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 ; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 ; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]] ; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]] ; 
P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]] ; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]] -; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]] -; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]] +; P8LE-DAG: xvcvdpuxws [[REG7:[vs0-9]+]], [[REG5]] +; P8LE-DAG: xvcvdpuxws [[REG8:[vs0-9]+]], [[REG6]] ; P8LE: vmrgew v2, [[REG8]], [[REG7]] -; P8LE: xvcvspuxws v2, v2 } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemConsDConvdtoui(double* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds double, double* %ptr, i64 3 %0 = load double, double* %arrayidx, align 8 %conv = fptoui double %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2 %1 = load double, double* %arrayidx1, align 8 %conv2 = fptoui double %1 to i32 %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 1 %2 = load double, double* %arrayidx4, align 8 %conv5 = fptoui double %2 to i32 %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 %3 = load double, double* %ptr, align 8 %conv8 = fptoui double %3 to i32 %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 ret <4 x i32> %vecinit9 ; P9BE-LABEL: fromDiffMemConsDConvdtoui ; P9LE-LABEL: fromDiffMemConsDConvdtoui ; P8BE-LABEL: fromDiffMemConsDConvdtoui ; P8LE-LABEL: fromDiffMemConsDConvdtoui ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: xxmrghd ; P9BE: xxmrghd -; P9BE: xvcvdpsp -; P9BE: xvcvdpsp -; P9BE: vmrgew -; P9BE: xvcvspuxws v2 +; P9BE: xvcvdpuxws +; P9BE: xvcvdpuxws +; P9BE: vmrgew v2 ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: xxmrghd ; P9LE: xxmrghd -; P9LE: xvcvdpsp -; P9LE: xvcvdpsp -; P9LE: vmrgew -; P9LE: xvcvspuxws v2 +; P9LE: xvcvdpuxws +; P9LE: xvcvdpuxws +; P9LE: vmrgew v2 ; P8BE: lfdx ; P8BE: lfd ; P8BE: lfd ; P8BE: lfd ; P8BE: xxmrghd ; P8BE: xxmrghd -; P8BE: xvcvdpsp -; P8BE: xvcvdpsp -; P8BE: vmrgew -; P8BE: xvcvspuxws v2 +; P8BE: xvcvdpuxws +; P8BE: xvcvdpuxws +; P8BE: vmrgew v2 ; P8LE: lfdx ; P8LE: lfd ; P8LE: lfd ; P8LE: lfd ; P8LE: xxmrghd ; P8LE: xxmrghd -; P8LE: xvcvdpsp -; P8LE: xvcvdpsp -; P8LE: vmrgew -; P8LE: xvcvspuxws v2 +; P8LE: xvcvdpuxws +; P8LE: xvcvdpuxws +; P8LE: vmrgew v2 } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarAConvdtoui(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptoui double %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptoui double %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %add5 = add nsw i32 %elem, 2 %idxprom6 = sext i32 %add5 to i64 %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6 %2 = load double, double* %arrayidx7, align 8 %conv8 = fptoui double %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %add10 = add nsw i32 %elem, 3 %idxprom11 = sext i32 %add10 to i64 %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11 %3 = load double, double* %arrayidx12, align 8 %conv13 = fptoui double %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x 
i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarAConvdtoui ; P9LE-LABEL: fromDiffMemVarAConvdtoui ; P8BE-LABEL: fromDiffMemVarAConvdtoui ; P8LE-LABEL: fromDiffMemVarAConvdtoui ; P9BE: lfdux ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: xxmrghd ; P9BE: xxmrghd -; P9BE: xvcvdpsp -; P9BE: xvcvdpsp -; P9BE: vmrgew -; P9BE: xvcvspuxws v2 +; P9BE: xvcvdpuxws +; P9BE: xvcvdpuxws +; P9BE: vmrgew v2 ; P9LE: lfdux ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: xxmrghd ; P9LE: xxmrghd -; P9LE: xvcvdpsp -; P9LE: xvcvdpsp -; P9LE: vmrgew -; P9LE: xvcvspuxws v2 +; P9LE: xvcvdpuxws +; P9LE: xvcvdpuxws +; P9LE: vmrgew v2 ; P8BE: lfdux ; P8BE: lfd ; P8BE: lfd ; P8BE: lfd ; P8BE: xxmrghd ; P8BE: xxmrghd -; P8BE: xvcvdpsp -; P8BE: xvcvdpsp -; P8BE: vmrgew -; P8BE: xvcvspuxws v2 +; P8BE: xvcvdpuxws +; P8BE: xvcvdpuxws +; P8BE: vmrgew v2 ; P8LE: lfdux ; P8LE: lfd ; P8LE: lfd ; P8LE: lfd ; P8LE: xxmrghd ; P8LE: xxmrghd -; P8LE: xvcvdpsp -; P8LE: xvcvdpsp -; P8LE: vmrgew -; P8LE: xvcvspuxws v2 +; P8LE: xvcvdpuxws +; P8LE: xvcvdpuxws +; P8LE: vmrgew v2 } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @fromDiffMemVarDConvdtoui(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptoui double %0 to i32 %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptoui double %1 to i32 %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1 %sub5 = add nsw i32 %elem, -2 %idxprom6 = sext i32 %sub5 to i64 %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6 %2 = load double, double* %arrayidx7, align 8 %conv8 = fptoui double %2 to i32 %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2 %sub10 = add nsw i32 %elem, -3 %idxprom11 = sext i32 %sub10 to i64 %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11 %3 = load double, double* %arrayidx12, align 8 %conv13 = fptoui double %3 to i32 %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3 ret <4 x i32> %vecinit14 ; P9BE-LABEL: fromDiffMemVarDConvdtoui ; P9LE-LABEL: fromDiffMemVarDConvdtoui ; P8BE-LABEL: fromDiffMemVarDConvdtoui ; P8LE-LABEL: fromDiffMemVarDConvdtoui ; P9BE: lfdux ; P9BE: lfd ; P9BE: lfd ; P9BE: lfd ; P9BE: xxmrghd ; P9BE: xxmrghd -; P9BE: xvcvdpsp -; P9BE: xvcvdpsp -; P9BE: vmrgew -; P9BE: xvcvspuxws v2 +; P9BE: xvcvdpuxws +; P9BE: xvcvdpuxws +; P9BE: vmrgew v2 ; P9LE: lfdux ; P9LE: lfd ; P9LE: lfd ; P9LE: lfd ; P9LE: xxmrghd ; P9LE: xxmrghd -; P9LE: xvcvdpsp -; P9LE: xvcvdpsp -; P9LE: vmrgew -; P9LE: xvcvspuxws v2 +; P9LE: xvcvdpuxws +; P9LE: xvcvdpuxws +; P9LE: vmrgew v2 ; P8BE: lfdux ; P8BE: lfd ; P8BE: lfd ; P8BE: lfd ; P8BE: xxmrghd ; P8BE: xxmrghd -; P8BE: xvcvdpsp -; P8BE: xvcvdpsp -; P8BE: vmrgew -; P8BE: xvcvspuxws v2 +; P8BE: xvcvdpuxws +; P8BE: xvcvdpuxws +; P8BE: vmrgew v2 ; P8LE: lfdux ; P8LE: lfd ; P8LE: lfd ; P8LE: lfd ; P8LE: xxmrghd ; P8LE: xxmrghd -; P8LE: xvcvdpsp -; P8LE: xvcvdpsp -; P8LE: vmrgew -; P8LE: xvcvspuxws v2 +; P8LE: xvcvdpuxws +; P8LE: xvcvdpuxws +; P8LE: vmrgew v2 } ; Function Attrs: norecurse nounwind readnone define <4 x i32> @spltRegValConvdtoui(double %val) { entry: %conv = fptoui double %val to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = 
shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltRegValConvdtoui ; P9LE-LABEL: spltRegValConvdtoui ; P8BE-LABEL: spltRegValConvdtoui ; P8LE-LABEL: spltRegValConvdtoui ; P9BE: xscvdpuxws ; P9BE: xxspltw ; P9BE: blr ; P9LE: xscvdpuxws ; P9LE: xxspltw ; P9LE: blr ; P8BE: xscvdpuxws ; P8BE: xxspltw ; P8BE: blr ; P8LE: xscvdpuxws ; P8LE: xxspltw ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <4 x i32> @spltMemValConvdtoui(double* nocapture readonly %ptr) { entry: %0 = load double, double* %ptr, align 8 %conv = fptoui double %0 to i32 %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat ; P9BE-LABEL: spltMemValConvdtoui ; P9LE-LABEL: spltMemValConvdtoui ; P8BE-LABEL: spltMemValConvdtoui ; P8LE-LABEL: spltMemValConvdtoui ; P9BE: lfd ; P9BE: xscvdpuxws ; P9BE: xxspltw ; P9BE: blr ; P9LE: lfd ; P9LE: xscvdpuxws ; P9LE: xxspltw ; P9LE: blr ; P8BE: lfdx ; P8BE: xscvdpuxws ; P8BE: xxspltw ; P8BE: blr ; P8LE: lfdx ; P8LE: xscvdpuxws ; P8LE: xxspltw ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @allZeroll() { entry: ret <2 x i64> zeroinitializer ; P9BE-LABEL: allZeroll ; P9LE-LABEL: allZeroll ; P8BE-LABEL: allZeroll ; P8LE-LABEL: allZeroll ; P9BE: xxlxor v2, v2, v2 ; P9BE: blr ; P9LE: xxlxor v2, v2, v2 ; P9LE: blr ; P8BE: xxlxor v2, v2, v2 ; P8BE: blr ; P8LE: xxlxor v2, v2, v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @allOnell() { entry: ret <2 x i64> ; P9BE-LABEL: allOnell ; P9LE-LABEL: allOnell ; P8BE-LABEL: allOnell ; P8LE-LABEL: allOnell ; P9BE: xxspltib v2, 255 ; P9BE: blr ; P9LE: xxspltib v2, 255 ; P9LE: blr ; P8BE: vspltisb v2, -1 ; P8BE: blr ; P8LE: vspltisb v2, -1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltConst1ll() { entry: ret <2 x i64> ; P9BE-LABEL: spltConst1ll ; P9LE-LABEL: spltConst1ll ; P8BE-LABEL: spltConst1ll ; P8LE-LABEL: spltConst1ll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltConst16kll() { entry: ret <2 x i64> ; P9BE-LABEL: spltConst16kll ; P9LE-LABEL: spltConst16kll ; P8BE-LABEL: spltConst16kll ; P8LE-LABEL: spltConst16kll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltConst32kll() { entry: ret <2 x i64> ; P9BE-LABEL: spltConst32kll ; P9LE-LABEL: spltConst32kll ; P8BE-LABEL: spltConst32kll ; P8LE-LABEL: spltConst32kll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromRegsll(i64 %a, i64 %b) { entry: %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 ret <2 x i64> %vecinit1 ; P9BE-LABEL: fromRegsll ; P9LE-LABEL: fromRegsll ; P8BE-LABEL: fromRegsll ; P8LE-LABEL: fromRegsll ; P9BE: mtvsrdd v2, r3, r4 ; P9BE: blr ; P9LE: mtvsrdd v2, r4, r3 ; P9LE: blr ; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r3 ; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r4 ; P8BE: xxmrghd v2 ; P8BE: blr ; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r3 ; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r4 ; P8LE: xxmrghd v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone 
define <2 x i64> @fromDiffConstsll() { entry: ret <2 x i64> ; P9BE-LABEL: fromDiffConstsll ; P9LE-LABEL: fromDiffConstsll ; P8BE-LABEL: fromDiffConstsll ; P8LE-LABEL: fromDiffConstsll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsAll(i64* nocapture readonly %arr) { entry: %0 = load i64, i64* %arr, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 1 %1 = load i64, i64* %arrayidx1, align 8 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromDiffMemConsAll ; P9LE-LABEL: fromDiffMemConsAll ; P8BE-LABEL: fromDiffMemConsAll ; P8LE-LABEL: fromDiffMemConsAll ; P9BE: lxv v2 ; P9BE: blr ; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr ; P8LE: lxvd2x ; P8LE: xxswapd v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsDll(i64* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i64, i64* %arr, i64 3 %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 2 %1 = load i64, i64* %arrayidx1, align 8 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromDiffMemConsDll ; P9LE-LABEL: fromDiffMemConsDll ; P8BE-LABEL: fromDiffMemConsDll ; P8LE-LABEL: fromDiffMemConsDll ; P9BE: lxv v2 ; P9BE: blr ; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE: blr ; P8BE: lxvd2x ; P8BE: xxswapd v2 ; P8BE-NEXT: blr ; P8LE: lxvd2x v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarAll(i64* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1 %1 = load i64, i64* %arrayidx2, align 8 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemVarAll ; P9LE-LABEL: fromDiffMemVarAll ; P8BE-LABEL: fromDiffMemVarAll ; P8LE-LABEL: fromDiffMemVarAll ; P9BE: sldi ; P9BE: lxvx v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxvx v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x ; P8LE: xxswapd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarDll(i64* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1 %1 = load i64, i64* %arrayidx2, align 8 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemVarDll ; P9LE-LABEL: fromDiffMemVarDll ; P8BE-LABEL: fromDiffMemVarDll ; P8LE-LABEL: fromDiffMemVarDll ; P9BE: sldi ; P9BE: lxv ; P9BE: xxswapd v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x ; P8BE: xxswapd v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x v2 ; P8LE-NEXT: blr 
} ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromRandMemConsll(i64* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i64, i64* %arr, i64 4 %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 18 %1 = load i64, i64* %arrayidx1, align 8 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromRandMemConsll ; P9LE-LABEL: fromRandMemConsll ; P8BE-LABEL: fromRandMemConsll ; P8LE-LABEL: fromRandMemConsll ; P9BE: ld ; P9BE: ld ; P9BE: mtvsrdd v2 ; P9BE-NEXT: blr ; P9LE: ld ; P9LE: ld ; P9LE: mtvsrdd v2 ; P9LE-NEXT: blr ; P8BE: ld ; P8BE: ld ; P8BE-DAG: mtvsrd ; P8BE-DAG: mtvsrd ; P8BE: xxmrghd v2 ; P8BE-NEXT: blr ; P8LE: ld ; P8LE: ld ; P8LE-DAG: mtvsrd ; P8LE-DAG: mtvsrd ; P8LE: xxmrghd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromRandMemVarll(i64* nocapture readonly %arr, i32 signext %elem) { entry: %add = add nsw i32 %elem, 4 %idxprom = sext i32 %add to i64 %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %add1 = add nsw i32 %elem, 1 %idxprom2 = sext i32 %add1 to i64 %arrayidx3 = getelementptr inbounds i64, i64* %arr, i64 %idxprom2 %1 = load i64, i64* %arrayidx3, align 8 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromRandMemVarll ; P9LE-LABEL: fromRandMemVarll ; P8BE-LABEL: fromRandMemVarll ; P8LE-LABEL: fromRandMemVarll ; P9BE: sldi ; P9BE: ld ; P9BE: ld ; P9BE: mtvsrdd v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: ld ; P9LE: ld ; P9LE: mtvsrdd v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: ld ; P8BE: ld ; P8BE: mtvsrd ; P8BE: mtvsrd ; P8BE: xxmrghd v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: ld ; P8LE: ld ; P8LE: mtvsrd ; P8LE: mtvsrd ; P8LE: xxmrghd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltRegValll(i64 %val) { entry: %splat.splatinsert = insertelement <2 x i64> undef, i64 %val, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltRegValll ; P9LE-LABEL: spltRegValll ; P8BE-LABEL: spltRegValll ; P8LE-LABEL: spltRegValll ; P9BE: mtvsrdd v2, r3, r3 ; P9BE-NEXT: blr ; P9LE: mtvsrdd v2, r3, r3 ; P9LE-NEXT: blr ; P8BE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3 ; P8BE: xxspltd v2, {{[vsf]+}}[[REG1]], 0 ; P8BE-NEXT: blr ; P8LE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3 ; P8LE: xxspltd v2, {{[vsf]+}}[[REG1]], 0 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @spltMemValll(i64* nocapture readonly %ptr) { entry: %0 = load i64, i64* %ptr, align 8 %splat.splatinsert = insertelement <2 x i64> undef, i64 %0, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltMemValll ; P9LE-LABEL: spltMemValll ; P8BE-LABEL: spltMemValll ; P8LE-LABEL: spltMemValll ; P9BE: lxvdsx v2 ; P9BE-NEXT: blr ; P9LE: lxvdsx v2 ; P9LE-NEXT: blr ; P8BE: lxvdsx v2 ; P8BE-NEXT: blr ; P8LE: lxvdsx v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltCnstConvftoll() { entry: ret <2 x i64> ; P9BE-LABEL: spltCnstConvftoll ; P9LE-LABEL: spltCnstConvftoll ; P8BE-LABEL: spltCnstConvftoll ; P8LE-LABEL: spltCnstConvftoll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr 
; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromRegsConvftoll(float %a, float %b) { entry: %conv = fptosi float %a to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %conv1 = fptosi float %b to i64 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromRegsConvftoll ; P9LE-LABEL: fromRegsConvftoll ; P8BE-LABEL: fromRegsConvftoll ; P8LE-LABEL: fromRegsConvftoll ; P9BE: xxmrghd ; P9BE: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: xxmrghd ; P9LE: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: xxmrghd ; P8BE: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: xxmrghd ; P8LE: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromDiffConstsConvftoll() { entry: ret <2 x i64> ; P9BE-LABEL: fromDiffConstsConvftoll ; P9LE-LABEL: fromDiffConstsConvftoll ; P8BE-LABEL: fromDiffConstsConvftoll ; P8LE-LABEL: fromDiffConstsConvftoll ; P9BE: lxvx v2 ; P9BE: blr ; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr ; P8LE: lxvd2x ; P8LE: xxswapd v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsAConvftoll(float* nocapture readonly %ptr) { entry: %0 = load float, float* %ptr, align 4 %conv = fptosi float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1 %1 = load float, float* %arrayidx1, align 4 %conv2 = fptosi float %1 to i64 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemConsAConvftoll ; P9LE-LABEL: fromDiffMemConsAConvftoll ; P8BE-LABEL: fromDiffMemConsAConvftoll ; P8LE-LABEL: fromDiffMemConsAConvftoll ; P9BE: lfs ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: lfs ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: lfs ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: lfs ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsDConvftoll(float* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds float, float* %ptr, i64 3 %0 = load float, float* %arrayidx, align 4 %conv = fptosi float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2 %1 = load float, float* %arrayidx1, align 4 %conv2 = fptosi float %1 to i64 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemConsDConvftoll ; P9LE-LABEL: fromDiffMemConsDConvftoll ; P8BE-LABEL: fromDiffMemConsDConvftoll ; P8LE-LABEL: fromDiffMemConsDConvftoll ; P9BE: lfs ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: lfs ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: lfs ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: lfs ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarAConvftoll(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptosi float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %add = add 
nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptosi float %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarAConvftoll ; P9LE-LABEL: fromDiffMemVarAConvftoll ; P8BE-LABEL: fromDiffMemVarAConvftoll ; P8LE-LABEL: fromDiffMemVarAConvftoll ; P9BE: sldi ; P9BE: lfsux ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lfsux ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lfsux ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lfsux ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarDConvftoll(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptosi float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptosi float %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarDConvftoll ; P9LE-LABEL: fromDiffMemVarDConvftoll ; P8BE-LABEL: fromDiffMemVarDConvftoll ; P8LE-LABEL: fromDiffMemVarDConvftoll ; P9BE: sldi ; P9BE: lfsux ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lfsux ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lfsux ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lfsux ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltRegValConvftoll(float %val) { entry: %conv = fptosi float %val to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltRegValConvftoll ; P9LE-LABEL: spltRegValConvftoll ; P8BE-LABEL: spltRegValConvftoll ; P8LE-LABEL: spltRegValConvftoll ; P9BE: xscvdpsxds ; P9BE-NEXT: xxspltd v2 ; P9BE-NEXT: blr ; P9LE: xscvdpsxds ; P9LE-NEXT: xxspltd v2 ; P9LE-NEXT: blr ; P8BE: xscvdpsxds ; P8BE-NEXT: xxspltd v2 ; P8BE-NEXT: blr ; P8LE: xscvdpsxds ; P8LE-NEXT: xxspltd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @spltMemValConvftoll(float* nocapture readonly %ptr) { entry: %0 = load float, float* %ptr, align 4 %conv = fptosi float %0 to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltMemValConvftoll ; P9LE-LABEL: spltMemValConvftoll ; P8BE-LABEL: spltMemValConvftoll ; P8LE-LABEL: spltMemValConvftoll ; P9BE: lfs ; P9BE-NEXT: xscvdpsxds ; P9BE-NEXT: xxspltd v2 ; P9BE-NEXT: blr ; P9LE: lfs ; P9LE-NEXT: xscvdpsxds ; P9LE-NEXT: xxspltd v2 ; P9LE-NEXT: blr ; P8BE: lfs ; P8BE-NEXT: xscvdpsxds ; P8BE-NEXT: xxspltd v2 ; P8BE-NEXT: blr ; P8LE: 
lfs ; P8LE-NEXT: xscvdpsxds ; P8LE-NEXT: xxspltd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltCnstConvdtoll() { entry: ret <2 x i64> ; P9BE-LABEL: spltCnstConvdtoll ; P9LE-LABEL: spltCnstConvdtoll ; P8BE-LABEL: spltCnstConvdtoll ; P8LE-LABEL: spltCnstConvdtoll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromRegsConvdtoll(double %a, double %b) { entry: %conv = fptosi double %a to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %conv1 = fptosi double %b to i64 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromRegsConvdtoll ; P9LE-LABEL: fromRegsConvdtoll ; P8BE-LABEL: fromRegsConvdtoll ; P8LE-LABEL: fromRegsConvdtoll ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpsxds ; P9BE-NEXT: blr ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpsxds ; P9LE-NEXT: blr ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpsxds ; P8BE-NEXT: blr ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpsxds ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromDiffConstsConvdtoll() { entry: ret <2 x i64> ; P9BE-LABEL: fromDiffConstsConvdtoll ; P9LE-LABEL: fromDiffConstsConvdtoll ; P8BE-LABEL: fromDiffConstsConvdtoll ; P8LE-LABEL: fromDiffConstsConvdtoll ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsAConvdtoll(double* nocapture readonly %ptr) { entry: %0 = bitcast double* %ptr to <2 x double>* %1 = load <2 x double>, <2 x double>* %0, align 8 %2 = fptosi <2 x double> %1 to <2 x i64> ret <2 x i64> %2 ; P9BE-LABEL: fromDiffMemConsAConvdtoll ; P9LE-LABEL: fromDiffMemConsAConvdtoll ; P8BE-LABEL: fromDiffMemConsAConvdtoll ; P8LE-LABEL: fromDiffMemConsAConvdtoll ; P9BE: lxv ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: lxv ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: lxvd2x ; P8LE: xxswapd ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsDConvdtoll(double* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds double, double* %ptr, i64 3 %0 = load double, double* %arrayidx, align 8 %conv = fptosi double %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2 %1 = load double, double* %arrayidx1, align 8 %conv2 = fptosi double %1 to i64 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemConsDConvdtoll ; P9LE-LABEL: fromDiffMemConsDConvdtoll ; P8BE-LABEL: fromDiffMemConsDConvdtoll ; P8LE-LABEL: fromDiffMemConsDConvdtoll ; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x ; P8BE-NEXT: xxswapd ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: lxvd2x ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarAConvdtoll(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptosi double %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, 
i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptosi double %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarAConvdtoll ; P9LE-LABEL: fromDiffMemVarAConvdtoll ; P8BE-LABEL: fromDiffMemVarAConvdtoll ; P8LE-LABEL: fromDiffMemVarAConvdtoll ; P9BE: sldi ; P9BE: lxvx ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxvx ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x ; P8LE-NEXT: xxswapd ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarDConvdtoll(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptosi double %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptosi double %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarDConvdtoll ; P9LE-LABEL: fromDiffMemVarDConvdtoll ; P8BE-LABEL: fromDiffMemVarDConvdtoll ; P8LE-LABEL: fromDiffMemVarDConvdtoll ; P9BE: sldi ; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x ; P8BE-NEXT: xxswapd ; P8BE-NEXT: xvcvdpsxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x ; P8LE-NEXT: xvcvdpsxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltRegValConvdtoll(double %val) { entry: %conv = fptosi double %val to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltRegValConvdtoll ; P9LE-LABEL: spltRegValConvdtoll ; P8BE-LABEL: spltRegValConvdtoll ; P8LE-LABEL: spltRegValConvdtoll ; P9BE: xscvdpsxds ; P9BE-NEXT: xxspltd v2 ; P9BE-NEXT: blr ; P9LE: xscvdpsxds ; P9LE-NEXT: xxspltd v2 ; P9LE-NEXT: blr ; P8BE: xscvdpsxds ; P8BE-NEXT: xxspltd v2 ; P8BE-NEXT: blr ; P8LE: xscvdpsxds ; P8LE-NEXT: xxspltd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @spltMemValConvdtoll(double* nocapture readonly %ptr) { entry: %0 = load double, double* %ptr, align 8 %conv = fptosi double %0 to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltMemValConvdtoll ; P9LE-LABEL: spltMemValConvdtoll ; P8BE-LABEL: spltMemValConvdtoll ; P8LE-LABEL: spltMemValConvdtoll ; P9BE: lxvdsx ; P9BE-NEXT: xvcvdpsxds ; P9BE-NEXT: blr ; P9LE: lxvdsx ; P9LE-NEXT: xvcvdpsxds ; P9LE-NEXT: blr ; P8BE: lxvdsx ; P8BE-NEXT: xvcvdpsxds ; P8BE-NEXT: blr ; P8LE: lxvdsx ; P8LE-NEXT: xvcvdpsxds ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @allZeroull() { entry: ret <2 x i64> zeroinitializer ; 
P9BE-LABEL: allZeroull ; P9LE-LABEL: allZeroull ; P8BE-LABEL: allZeroull ; P8LE-LABEL: allZeroull ; P9BE: xxlxor v2, v2, v2 ; P9BE: blr ; P9LE: xxlxor v2, v2, v2 ; P9LE: blr ; P8BE: xxlxor v2, v2, v2 ; P8BE: blr ; P8LE: xxlxor v2, v2, v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @allOneull() { entry: ret <2 x i64> ; P9BE-LABEL: allOneull ; P9LE-LABEL: allOneull ; P8BE-LABEL: allOneull ; P8LE-LABEL: allOneull ; P9BE: xxspltib v2, 255 ; P9BE: blr ; P9LE: xxspltib v2, 255 ; P9LE: blr ; P8BE: vspltisb v2, -1 ; P8BE: blr ; P8LE: vspltisb v2, -1 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltConst1ull() { entry: ret <2 x i64> ; P9BE-LABEL: spltConst1ull ; P9LE-LABEL: spltConst1ull ; P8BE-LABEL: spltConst1ull ; P8LE-LABEL: spltConst1ull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltConst16kull() { entry: ret <2 x i64> ; P9BE-LABEL: spltConst16kull ; P9LE-LABEL: spltConst16kull ; P8BE-LABEL: spltConst16kull ; P8LE-LABEL: spltConst16kull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltConst32kull() { entry: ret <2 x i64> ; P9BE-LABEL: spltConst32kull ; P9LE-LABEL: spltConst32kull ; P8BE-LABEL: spltConst32kull ; P8LE-LABEL: spltConst32kull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromRegsull(i64 %a, i64 %b) { entry: %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 ret <2 x i64> %vecinit1 ; P9BE-LABEL: fromRegsull ; P9LE-LABEL: fromRegsull ; P8BE-LABEL: fromRegsull ; P8LE-LABEL: fromRegsull ; P9BE: mtvsrdd v2, r3, r4 ; P9BE: blr ; P9LE: mtvsrdd v2, r4, r3 ; P9LE: blr ; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r3 ; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r4 ; P8BE: xxmrghd v2 ; P8BE: blr ; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r3 ; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r4 ; P8LE: xxmrghd v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromDiffConstsull() { entry: ret <2 x i64> ; P9BE-LABEL: fromDiffConstsull ; P9LE-LABEL: fromDiffConstsull ; P8BE-LABEL: fromDiffConstsull ; P8LE-LABEL: fromDiffConstsull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsAull(i64* nocapture readonly %arr) { entry: %0 = load i64, i64* %arr, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 1 %1 = load i64, i64* %arrayidx1, align 8 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromDiffMemConsAull ; P9LE-LABEL: fromDiffMemConsAull ; P8BE-LABEL: fromDiffMemConsAull ; P8LE-LABEL: fromDiffMemConsAull ; P9BE: lxv v2 ; P9BE: blr ; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr ; P8LE: lxvd2x ; P8LE: xxswapd v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsDull(i64* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i64, i64* %arr, i64 3 %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 2 %1 = load 
i64, i64* %arrayidx1, align 8 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromDiffMemConsDull ; P9LE-LABEL: fromDiffMemConsDull ; P8BE-LABEL: fromDiffMemConsDull ; P8LE-LABEL: fromDiffMemConsDull ; P9BE: lxv v2 ; P9BE: blr ; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE: blr ; P8BE: lxvd2x ; P8BE: xxswapd v2 ; P8BE-NEXT: blr ; P8LE: lxvd2x v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarAull(i64* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1 %1 = load i64, i64* %arrayidx2, align 8 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemVarAull ; P9LE-LABEL: fromDiffMemVarAull ; P8BE-LABEL: fromDiffMemVarAull ; P8LE-LABEL: fromDiffMemVarAull ; P9BE: sldi ; P9BE: lxvx v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxvx v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x ; P8LE: xxswapd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarDull(i64* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1 %1 = load i64, i64* %arrayidx2, align 8 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemVarDull ; P9LE-LABEL: fromDiffMemVarDull ; P8BE-LABEL: fromDiffMemVarDull ; P8LE-LABEL: fromDiffMemVarDull ; P9BE: sldi ; P9BE: lxv ; P9BE: xxswapd v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x ; P8BE: xxswapd v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromRandMemConsull(i64* nocapture readonly %arr) { entry: %arrayidx = getelementptr inbounds i64, i64* %arr, i64 4 %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 18 %1 = load i64, i64* %arrayidx1, align 8 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromRandMemConsull ; P9LE-LABEL: fromRandMemConsull ; P8BE-LABEL: fromRandMemConsull ; P8LE-LABEL: fromRandMemConsull ; P9BE: ld ; P9BE: ld ; P9BE: mtvsrdd v2 ; P9BE-NEXT: blr ; P9LE: ld ; P9LE: ld ; P9LE: mtvsrdd v2 ; P9LE-NEXT: blr ; P8BE: ld ; P8BE: ld ; P8BE-DAG: mtvsrd ; P8BE-DAG: mtvsrd ; P8BE: xxmrghd v2 ; P8BE-NEXT: blr ; P8LE: ld ; P8LE: ld ; P8LE-DAG: mtvsrd ; P8LE-DAG: mtvsrd ; P8LE: xxmrghd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromRandMemVarull(i64* nocapture readonly %arr, i32 signext %elem) { entry: %add = add nsw i32 %elem, 4 %idxprom = sext i32 %add to i64 %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom %0 = load i64, i64* %arrayidx, align 8 %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0 %add1 = add 
nsw i32 %elem, 1 %idxprom2 = sext i32 %add1 to i64 %arrayidx3 = getelementptr inbounds i64, i64* %arr, i64 %idxprom2 %1 = load i64, i64* %arrayidx3, align 8 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %1, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromRandMemVarull ; P9LE-LABEL: fromRandMemVarull ; P8BE-LABEL: fromRandMemVarull ; P8LE-LABEL: fromRandMemVarull ; P9BE: sldi ; P9BE: ld ; P9BE: ld ; P9BE: mtvsrdd v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: ld ; P9LE: ld ; P9LE: mtvsrdd v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: ld ; P8BE: ld ; P8BE: mtvsrd ; P8BE: mtvsrd ; P8BE: xxmrghd v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: ld ; P8LE: ld ; P8LE: mtvsrd ; P8LE: mtvsrd ; P8LE: xxmrghd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltRegValull(i64 %val) { entry: %splat.splatinsert = insertelement <2 x i64> undef, i64 %val, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltRegValull ; P9LE-LABEL: spltRegValull ; P8BE-LABEL: spltRegValull ; P8LE-LABEL: spltRegValull ; P9BE: mtvsrdd v2, r3, r3 ; P9BE-NEXT: blr ; P9LE: mtvsrdd v2, r3, r3 ; P9LE-NEXT: blr ; P8BE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3 ; P8BE: xxspltd v2, {{[vsf]+}}[[REG1]], 0 ; P8BE-NEXT: blr ; P8LE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3 ; P8LE: xxspltd v2, {{[vsf]+}}[[REG1]], 0 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @spltMemValull(i64* nocapture readonly %ptr) { entry: %0 = load i64, i64* %ptr, align 8 %splat.splatinsert = insertelement <2 x i64> undef, i64 %0, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltMemValull ; P9LE-LABEL: spltMemValull ; P8BE-LABEL: spltMemValull ; P8LE-LABEL: spltMemValull ; P9BE: lxvdsx v2 ; P9BE-NEXT: blr ; P9LE: lxvdsx v2 ; P9LE-NEXT: blr ; P8BE: lxvdsx v2 ; P8BE-NEXT: blr ; P8LE: lxvdsx v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltCnstConvftoull() { entry: ret <2 x i64> ; P9BE-LABEL: spltCnstConvftoull ; P9LE-LABEL: spltCnstConvftoull ; P8BE-LABEL: spltCnstConvftoull ; P8LE-LABEL: spltCnstConvftoull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromRegsConvftoull(float %a, float %b) { entry: %conv = fptoui float %a to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %conv1 = fptoui float %b to i64 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromRegsConvftoull ; P9LE-LABEL: fromRegsConvftoull ; P8BE-LABEL: fromRegsConvftoull ; P8LE-LABEL: fromRegsConvftoull ; P9BE: xxmrghd ; P9BE: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: xxmrghd ; P9LE: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: xxmrghd ; P8BE: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: xxmrghd ; P8LE: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromDiffConstsConvftoull() { entry: ret <2 x i64> ; P9BE-LABEL: fromDiffConstsConvftoull ; P9LE-LABEL: fromDiffConstsConvftoull ; P8BE-LABEL: fromDiffConstsConvftoull ; P8LE-LABEL: fromDiffConstsConvftoull ; P9BE: lxvx v2 ; P9BE: blr ; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr ; P8LE: lxvd2x ; P8LE: xxswapd v2 ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsAConvftoull(float* nocapture 
readonly %ptr) { entry: %0 = load float, float* %ptr, align 4 %conv = fptoui float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1 %1 = load float, float* %arrayidx1, align 4 %conv2 = fptoui float %1 to i64 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemConsAConvftoull ; P9LE-LABEL: fromDiffMemConsAConvftoull ; P8BE-LABEL: fromDiffMemConsAConvftoull ; P8LE-LABEL: fromDiffMemConsAConvftoull ; P9BE: lfs ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: lfs ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: lfs ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: lfs ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsDConvftoull(float* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds float, float* %ptr, i64 3 %0 = load float, float* %arrayidx, align 4 %conv = fptoui float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2 %1 = load float, float* %arrayidx1, align 4 %conv2 = fptoui float %1 to i64 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemConsDConvftoull ; P9LE-LABEL: fromDiffMemConsDConvftoull ; P8BE-LABEL: fromDiffMemConsDConvftoull ; P8LE-LABEL: fromDiffMemConsDConvftoull ; P9BE: lfs ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: lfs ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: lfs ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: lfs ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarAConvftoull(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptoui float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptoui float %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarAConvftoull ; P9LE-LABEL: fromDiffMemVarAConvftoull ; P8BE-LABEL: fromDiffMemVarAConvftoull ; P8LE-LABEL: fromDiffMemVarAConvftoull ; P9BE: sldi ; P9BE: lfsux ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lfsux ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lfsux ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lfsux ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarDConvftoull(float* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom %0 = load float, float* %arrayidx, align 4 %conv = fptoui float %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 
%sub = add nsw i32 %elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1 %1 = load float, float* %arrayidx2, align 4 %conv3 = fptoui float %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarDConvftoull ; P9LE-LABEL: fromDiffMemVarDConvftoull ; P8BE-LABEL: fromDiffMemVarDConvftoull ; P8LE-LABEL: fromDiffMemVarDConvftoull ; P9BE: sldi ; P9BE: lfsux ; P9BE: lfs ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lfsux ; P9LE: lfs ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lfsux ; P8BE: lfs ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lfsux ; P8LE: lfs ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltRegValConvftoull(float %val) { entry: %conv = fptoui float %val to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltRegValConvftoull ; P9LE-LABEL: spltRegValConvftoull ; P8BE-LABEL: spltRegValConvftoull ; P8LE-LABEL: spltRegValConvftoull ; P9BE: xscvdpuxds ; P9BE-NEXT: xxspltd v2 ; P9BE-NEXT: blr ; P9LE: xscvdpuxds ; P9LE-NEXT: xxspltd v2 ; P9LE-NEXT: blr ; P8BE: xscvdpuxds ; P8BE-NEXT: xxspltd v2 ; P8BE-NEXT: blr ; P8LE: xscvdpuxds ; P8LE-NEXT: xxspltd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @spltMemValConvftoull(float* nocapture readonly %ptr) { entry: %0 = load float, float* %ptr, align 4 %conv = fptoui float %0 to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltMemValConvftoull ; P9LE-LABEL: spltMemValConvftoull ; P8BE-LABEL: spltMemValConvftoull ; P8LE-LABEL: spltMemValConvftoull ; P9BE: lfs ; P9BE-NEXT: xscvdpuxds ; P9BE-NEXT: xxspltd v2 ; P9BE-NEXT: blr ; P9LE: lfs ; P9LE-NEXT: xscvdpuxds ; P9LE-NEXT: xxspltd v2 ; P9LE-NEXT: blr ; P8BE: lfs ; P8BE-NEXT: xscvdpuxds ; P8BE-NEXT: xxspltd v2 ; P8BE-NEXT: blr ; P8LE: lfs ; P8LE-NEXT: xscvdpuxds ; P8LE-NEXT: xxspltd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltCnstConvdtoull() { entry: ret <2 x i64> ; P9BE-LABEL: spltCnstConvdtoull ; P9LE-LABEL: spltCnstConvdtoull ; P8BE-LABEL: spltCnstConvdtoull ; P8LE-LABEL: spltCnstConvdtoull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromRegsConvdtoull(double %a, double %b) { entry: %conv = fptoui double %a to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %conv1 = fptoui double %b to i64 %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1 ret <2 x i64> %vecinit2 ; P9BE-LABEL: fromRegsConvdtoull ; P9LE-LABEL: fromRegsConvdtoull ; P8BE-LABEL: fromRegsConvdtoull ; P8LE-LABEL: fromRegsConvdtoull ; P9BE: xxmrghd ; P9BE-NEXT: xvcvdpuxds ; P9BE-NEXT: blr ; P9LE: xxmrghd ; P9LE-NEXT: xvcvdpuxds ; P9LE-NEXT: blr ; P8BE: xxmrghd ; P8BE-NEXT: xvcvdpuxds ; P8BE-NEXT: blr ; P8LE: xxmrghd ; P8LE-NEXT: xvcvdpuxds ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @fromDiffConstsConvdtoull() { entry: ret <2 
x i64> ; P9BE-LABEL: fromDiffConstsConvdtoull ; P9LE-LABEL: fromDiffConstsConvdtoull ; P8BE-LABEL: fromDiffConstsConvdtoull ; P8LE-LABEL: fromDiffConstsConvdtoull ; P9BE: lxv ; P9BE: blr ; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr ; P8LE: lxvd2x ; P8LE: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsAConvdtoull(double* nocapture readonly %ptr) { entry: %0 = bitcast double* %ptr to <2 x double>* %1 = load <2 x double>, <2 x double>* %0, align 8 %2 = fptoui <2 x double> %1 to <2 x i64> ret <2 x i64> %2 ; P9BE-LABEL: fromDiffMemConsAConvdtoull ; P9LE-LABEL: fromDiffMemConsAConvdtoull ; P8BE-LABEL: fromDiffMemConsAConvdtoull ; P8LE-LABEL: fromDiffMemConsAConvdtoull ; P9BE: lxv ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: lxv ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: lxvd2x ; P8LE: xxswapd ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemConsDConvdtoull(double* nocapture readonly %ptr) { entry: %arrayidx = getelementptr inbounds double, double* %ptr, i64 3 %0 = load double, double* %arrayidx, align 8 %conv = fptoui double %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2 %1 = load double, double* %arrayidx1, align 8 %conv2 = fptoui double %1 to i64 %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 ; P9BE-LABEL: fromDiffMemConsDConvdtoull ; P9LE-LABEL: fromDiffMemConsDConvdtoull ; P8BE-LABEL: fromDiffMemConsDConvdtoull ; P8LE-LABEL: fromDiffMemConsDConvdtoull ; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x ; P8BE-NEXT: xxswapd ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: lxvd2x ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarAConvdtoull(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptoui double %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %add = add nsw i32 %elem, 1 %idxprom1 = sext i32 %add to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptoui double %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarAConvdtoull ; P9LE-LABEL: fromDiffMemVarAConvdtoull ; P8BE-LABEL: fromDiffMemVarAConvdtoull ; P8LE-LABEL: fromDiffMemVarAConvdtoull ; P9BE: sldi ; P9BE: lxvx ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxvx ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x ; P8LE-NEXT: xxswapd ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @fromDiffMemVarDConvdtoull(double* nocapture readonly %arr, i32 signext %elem) { entry: %idxprom = sext i32 %elem to i64 %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom %0 = load double, double* %arrayidx, align 8 %conv = fptoui double %0 to i64 %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 %sub = add nsw i32 
%elem, -1 %idxprom1 = sext i32 %sub to i64 %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1 %1 = load double, double* %arrayidx2, align 8 %conv3 = fptoui double %1 to i64 %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1 ret <2 x i64> %vecinit4 ; P9BE-LABEL: fromDiffMemVarDConvdtoull ; P9LE-LABEL: fromDiffMemVarDConvdtoull ; P8BE-LABEL: fromDiffMemVarDConvdtoull ; P8LE-LABEL: fromDiffMemVarDConvdtoull ; P9BE: sldi ; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: sldi ; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: sldi ; P8BE: lxvd2x ; P8BE-NEXT: xxswapd ; P8BE-NEXT: xvcvdpuxds v2 ; P8BE-NEXT: blr ; P8LE: sldi ; P8LE: lxvd2x ; P8LE-NEXT: xvcvdpuxds v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readnone define <2 x i64> @spltRegValConvdtoull(double %val) { entry: %conv = fptoui double %val to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltRegValConvdtoull ; P9LE-LABEL: spltRegValConvdtoull ; P8BE-LABEL: spltRegValConvdtoull ; P8LE-LABEL: spltRegValConvdtoull ; P9BE: xscvdpuxds ; P9BE-NEXT: xxspltd v2 ; P9BE-NEXT: blr ; P9LE: xscvdpuxds ; P9LE-NEXT: xxspltd v2 ; P9LE-NEXT: blr ; P8BE: xscvdpuxds ; P8BE-NEXT: xxspltd v2 ; P8BE-NEXT: blr ; P8LE: xscvdpuxds ; P8LE-NEXT: xxspltd v2 ; P8LE-NEXT: blr } ; Function Attrs: norecurse nounwind readonly define <2 x i64> @spltMemValConvdtoull(double* nocapture readonly %ptr) { entry: %0 = load double, double* %ptr, align 8 %conv = fptoui double %0 to i64 %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat ; P9BE-LABEL: spltMemValConvdtoull ; P9LE-LABEL: spltMemValConvdtoull ; P8BE-LABEL: spltMemValConvdtoull ; P8LE-LABEL: spltMemValConvdtoull ; P9BE: lxvdsx ; P9BE-NEXT: xvcvdpuxds ; P9BE-NEXT: blr ; P9LE: lxvdsx ; P9LE-NEXT: xvcvdpuxds ; P9LE-NEXT: blr ; P8BE: lxvdsx ; P8BE-NEXT: xvcvdpuxds ; P8BE-NEXT: blr ; P8LE: lxvdsx ; P8LE-NEXT: xvcvdpuxds ; P8LE-NEXT: blr } Index: vendor/llvm/dist-release_70/test/CodeGen/X86/absolute-bit-mask-fastisel.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/X86/absolute-bit-mask-fastisel.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/CodeGen/X86/absolute-bit-mask-fastisel.ll (revision 337299) @@ -0,0 +1,28 @@ +; RUN: llc < %s | FileCheck %s +; RUN: llc -relocation-model=pic < %s | FileCheck %s + +; Regression test for PR38200 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@bit_mask8 = external hidden global i8, !absolute_symbol !0 + +declare void @f() + +define void @foo8(i8* %ptr) noinline optnone { + %load = load i8, i8* %ptr + ; CHECK: movl $bit_mask8, %ecx + %and = and i8 %load, ptrtoint (i8* @bit_mask8 to i8) + %icmp = icmp eq i8 %and, 0 + br i1 %icmp, label %t, label %f + +t: + call void @f() + ret void + +f: + ret void +} + +!0 = !{i64 0, i64 256} Index: vendor/llvm/dist-release_70/test/tools/llvm-ar/invalid-command-line.test =================================================================== --- vendor/llvm/dist-release_70/test/tools/llvm-ar/invalid-command-line.test (revision 337298) +++ 
vendor/llvm/dist-release_70/test/tools/llvm-ar/invalid-command-line.test (revision 337299) @@ -1,5 +1,5 @@ Test that llvm-ar exits with 1 when there is an error. RUN: not llvm-ar e 2>&1 | FileCheck %s CHECK: unknown option e. -CHECK: OVERVIEW: LLVM Archiver (llvm-ar) +CHECK: OVERVIEW: LLVM Archiver Index: vendor/llvm/dist-release_70/tools/llvm-ar/llvm-ar.cpp =================================================================== --- vendor/llvm/dist-release_70/tools/llvm-ar/llvm-ar.cpp (revision 337298) +++ vendor/llvm/dist-release_70/tools/llvm-ar/llvm-ar.cpp (revision 337299) @@ -1,971 +1,969 @@ //===-- llvm-ar.cpp - LLVM archive librarian utility ----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Builds up (relatively) standard unix archive files (.a) containing LLVM // bitcode or other files. // //===----------------------------------------------------------------------===// #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/StringSaver.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h" #include "llvm/ToolDrivers/llvm-lib/LibDriver.h" #if !defined(_MSC_VER) && !defined(__MINGW32__) #include #else #include #endif using namespace llvm; // The name this program was invoked as. static StringRef ToolName; // The basename of this program. static StringRef Stem; const char RanlibHelp[] = R"( OVERVIEW: LLVM Ranlib (llvm-ranlib) This program generates an index to speed access to archives USAGE: llvm-ranlib OPTIONS: -help - Display available options -version - Display the version of this program )"; const char ArHelp[] = R"( -OVERVIEW: LLVM Archiver (llvm-ar) +OVERVIEW: LLVM Archiver - This program archives bitcode files into single libraries +USAGE: llvm-ar [options] [-][modifiers] [relpos] [files] + llvm-ar -M [ [members]... 
- OPTIONS: - -M - - -format - Archive format to create - =default - default - =gnu - gnu - =darwin - darwin - =bsd - bsd - -plugin= - plugin (ignored for compatibility - -help - Display available options - -version - Display the version of this program + --format - Archive format to create + =default - default + =gnu - gnu + =darwin - darwin + =bsd - bsd + --plugin= - Ignored for compatibility + --help - Display available options + --version - Display the version of this program OPERATIONS: - d[NsS] - delete file(s) from the archive - m[abiSs] - move file(s) in the archive - p[kN] - print file(s) found in the archive - q[ufsS] - quick append file(s) to the archive - r[abfiuRsS] - replace or insert file(s) into the archive - t - display contents of archive - x[No] - extract file(s) from the archive + d - delete [files] from the archive + m - move [files] in the archive + p - print [files] found in the archive + q - quick append [files] to the archive + r - replace or insert [files] into the archive + s - act as ranlib + t - display contents of archive + x - extract [files] from the archive -MODIFIERS (operation specific): - [a] - put file(s) after [relpos] - [b] - put file(s) before [relpos] (same as [i]) +MODIFIERS: + [a] - put [files] after [relpos] + [b] - put [files] before [relpos] (same as [i]) + [c] - do not warn if archive had to be created [D] - use zero for timestamps and uids/gids (default) - [i] - put file(s) before [relpos] (same as [b]) + [i] - put [files] before [relpos] (same as [b]) + [l] - ignored for compatibility [o] - preserve original dates [s] - create an archive index (cf. ranlib) [S] - do not build a symbol table [T] - create a thin archive - [u] - update only files newer than archive contents + [u] - update only [files] newer than archive contents [U] - use actual timestamps and uids/gids - -MODIFIERS (generic): - [c] - do not warn if the library had to be created [v] - be verbose about actions taken )"; void printHelpMessage() { if (Stem.contains_lower("ranlib")) outs() << RanlibHelp; else if (Stem.contains_lower("ar")) outs() << ArHelp; } // Show the error message and exit. LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) { errs() << ToolName << ": " << Error << ".\n"; printHelpMessage(); exit(1); } static void failIfError(std::error_code EC, Twine Context = "") { if (!EC) return; std::string ContextStr = Context.str(); if (ContextStr == "") fail(EC.message()); fail(Context + ": " + EC.message()); } static void failIfError(Error E, Twine Context = "") { if (!E) return; handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) { std::string ContextStr = Context.str(); if (ContextStr == "") fail(EIB.message()); fail(Context + ": " + EIB.message()); }); } static SmallVector PositionalArgs; static bool MRI; namespace { enum Format { Default, GNU, BSD, DARWIN, Unknown }; } static Format FormatType = Default; static std::string Options; // This enumeration delineates the kinds of operations on an archive // that are permitted. 
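As a quick, non-authoritative illustration of the command forms the rewritten ArHelp text above describes (the archive and object file names here are hypothetical, chosen only for the sketch, and are not part of this change):

# sketch only: representative llvm-ar invocations matching the new help text
llvm-ar rcs libfoo.a a.o b.o   # r: replace/insert members, c: create without warning, s: write an index
llvm-ar t libfoo.a             # t: display the archive's contents
llvm-ar x libfoo.a a.o         # x: extract a member back to the file system
llvm-ar -M < script.mri        # -M: read an MRI script from standard input
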
enum ArchiveOperation { Print, ///< Print the contents of the archive Delete, ///< Delete the specified members Move, ///< Move members to end or as given by {a,b,i} modifiers QuickAppend, ///< Quickly append to end of archive ReplaceOrInsert, ///< Replace or Insert members DisplayTable, ///< Display the table of contents Extract, ///< Extract files back to file system CreateSymTab ///< Create a symbol table in an existing archive }; // Modifiers to follow operation to vary behavior static bool AddAfter = false; ///< 'a' modifier static bool AddBefore = false; ///< 'b' modifier static bool Create = false; ///< 'c' modifier static bool OriginalDates = false; ///< 'o' modifier static bool OnlyUpdate = false; ///< 'u' modifier static bool Verbose = false; ///< 'v' modifier static bool Symtab = true; ///< 's' modifier static bool Deterministic = true; ///< 'D' and 'U' modifiers static bool Thin = false; ///< 'T' modifier // Relative Positional Argument (for insert/move). This variable holds // the name of the archive member to which the 'a', 'b' or 'i' modifier // refers. Only one of 'a', 'b' or 'i' can be specified so we only need // one variable. static std::string RelPos; // This variable holds the name of the archive file as given on the // command line. static std::string ArchiveName; // This variable holds the list of member files to proecess, as given // on the command line. static std::vector Members; // Extract the member filename from the command line for the [relpos] argument // associated with a, b, and i modifiers static void getRelPos() { if (PositionalArgs.size() == 0) fail("Expected [relpos] for a, b, or i modifier"); RelPos = PositionalArgs[0]; PositionalArgs.erase(PositionalArgs.begin()); } // Get the archive file name from the command line static void getArchive() { if (PositionalArgs.size() == 0) fail("An archive name must be specified"); ArchiveName = PositionalArgs[0]; PositionalArgs.erase(PositionalArgs.begin()); } // Copy over remaining items in PositionalArgs to our Members vector static void getMembers() { for (auto &Arg : PositionalArgs) Members.push_back(Arg); } static void runMRIScript(); // Parse the command line options as presented and return the operation // specified. Process all modifiers and check to make sure that constraints on // modifier/operation pairs have not been violated. static ArchiveOperation parseCommandLine() { if (MRI) { if (!PositionalArgs.empty() || !Options.empty()) fail("Cannot mix -M and other options"); runMRIScript(); } // Keep track of number of operations. We can only specify one // per execution. unsigned NumOperations = 0; // Keep track of the number of positional modifiers (a,b,i). Only // one can be specified. unsigned NumPositional = 0; // Keep track of which operation was requested ArchiveOperation Operation; bool MaybeJustCreateSymTab = false; for(unsigned i=0; i 1) fail("Only one operation may be specified"); if (NumPositional > 1) fail("You may only specify one of a, b, and i modifiers"); if (AddAfter || AddBefore) { if (Operation != Move && Operation != ReplaceOrInsert) fail("The 'a', 'b' and 'i' modifiers can only be specified with " "the 'm' or 'r' operations"); } if (OriginalDates && Operation != Extract) fail("The 'o' modifier is only applicable to the 'x' operation"); if (OnlyUpdate && Operation != ReplaceOrInsert) fail("The 'u' modifier is only applicable to the 'r' operation"); // Return the parsed operation to the caller return Operation; } // Implements the 'p' operation. 
This function traverses the archive // looking for members that match the path list. static void doPrint(StringRef Name, const object::Archive::Child &C) { if (Verbose) outs() << "Printing " << Name << "\n"; Expected DataOrErr = C.getBuffer(); failIfError(DataOrErr.takeError()); StringRef Data = *DataOrErr; outs().write(Data.data(), Data.size()); } // Utility function for printing out the file mode when the 't' operation is in // verbose mode. static void printMode(unsigned mode) { outs() << ((mode & 004) ? "r" : "-"); outs() << ((mode & 002) ? "w" : "-"); outs() << ((mode & 001) ? "x" : "-"); } // Implement the 't' operation. This function prints out just // the file names of each of the members. However, if verbose mode is requested // ('v' modifier) then the file type, permission mode, user, group, size, and // modification time are also printed. static void doDisplayTable(StringRef Name, const object::Archive::Child &C) { if (Verbose) { Expected ModeOrErr = C.getAccessMode(); failIfError(ModeOrErr.takeError()); sys::fs::perms Mode = ModeOrErr.get(); printMode((Mode >> 6) & 007); printMode((Mode >> 3) & 007); printMode(Mode & 007); Expected UIDOrErr = C.getUID(); failIfError(UIDOrErr.takeError()); outs() << ' ' << UIDOrErr.get(); Expected GIDOrErr = C.getGID(); failIfError(GIDOrErr.takeError()); outs() << '/' << GIDOrErr.get(); Expected Size = C.getSize(); failIfError(Size.takeError()); outs() << ' ' << format("%6llu", Size.get()); auto ModTimeOrErr = C.getLastModified(); failIfError(ModTimeOrErr.takeError()); outs() << ' ' << ModTimeOrErr.get(); outs() << ' '; } if (C.getParent()->isThin()) { outs() << sys::path::parent_path(ArchiveName); outs() << '/'; } outs() << Name << "\n"; } // Implement the 'x' operation. This function extracts files back to the file // system. static void doExtract(StringRef Name, const object::Archive::Child &C) { // Retain the original mode. Expected ModeOrErr = C.getAccessMode(); failIfError(ModeOrErr.takeError()); sys::fs::perms Mode = ModeOrErr.get(); int FD; failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD, sys::fs::CD_CreateAlways, sys::fs::F_None, Mode), Name); { raw_fd_ostream file(FD, false); // Get the data and its length Expected BufOrErr = C.getBuffer(); failIfError(BufOrErr.takeError()); StringRef Data = BufOrErr.get(); // Write the data. file.write(Data.data(), Data.size()); } // If we're supposed to retain the original modification times, etc. do so // now. 
if (OriginalDates) { auto ModTimeOrErr = C.getLastModified(); failIfError(ModTimeOrErr.takeError()); failIfError( sys::fs::setLastModificationAndAccessTime(FD, ModTimeOrErr.get())); } if (close(FD)) fail("Could not close the file"); } static bool shouldCreateArchive(ArchiveOperation Op) { switch (Op) { case Print: case Delete: case Move: case DisplayTable: case Extract: case CreateSymTab: return false; case QuickAppend: case ReplaceOrInsert: return true; } llvm_unreachable("Missing entry in covered switch."); } static void performReadOperation(ArchiveOperation Operation, object::Archive *OldArchive) { if (Operation == Extract && OldArchive->isThin()) fail("extracting from a thin archive is not supported"); bool Filter = !Members.empty(); { Error Err = Error::success(); for (auto &C : OldArchive->children(Err)) { Expected NameOrErr = C.getName(); failIfError(NameOrErr.takeError()); StringRef Name = NameOrErr.get(); if (Filter) { auto I = find(Members, Name); if (I == Members.end()) continue; Members.erase(I); } switch (Operation) { default: llvm_unreachable("Not a read operation"); case Print: doPrint(Name, C); break; case DisplayTable: doDisplayTable(Name, C); break; case Extract: doExtract(Name, C); break; } } failIfError(std::move(Err)); } if (Members.empty()) return; for (StringRef Name : Members) errs() << Name << " was not found\n"; exit(1); } static void addMember(std::vector &Members, StringRef FileName, int Pos = -1) { Expected NMOrErr = NewArchiveMember::getFile(FileName, Deterministic); failIfError(NMOrErr.takeError(), FileName); // Use the basename of the object path for the member name. NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName); if (Pos == -1) Members.push_back(std::move(*NMOrErr)); else Members[Pos] = std::move(*NMOrErr); } static void addMember(std::vector &Members, const object::Archive::Child &M, int Pos = -1) { if (Thin && !M.getParent()->isThin()) fail("Cannot convert a regular archive to a thin one"); Expected NMOrErr = NewArchiveMember::getOldMember(M, Deterministic); failIfError(NMOrErr.takeError()); if (Pos == -1) Members.push_back(std::move(*NMOrErr)); else Members[Pos] = std::move(*NMOrErr); } enum InsertAction { IA_AddOldMember, IA_AddNewMember, IA_Delete, IA_MoveOldMember, IA_MoveNewMember }; static InsertAction computeInsertAction(ArchiveOperation Operation, const object::Archive::Child &Member, StringRef Name, std::vector::iterator &Pos) { if (Operation == QuickAppend || Members.empty()) return IA_AddOldMember; auto MI = find_if(Members, [Name](StringRef Path) { return Name == sys::path::filename(Path); }); if (MI == Members.end()) return IA_AddOldMember; Pos = MI; if (Operation == Delete) return IA_Delete; if (Operation == Move) return IA_MoveOldMember; if (Operation == ReplaceOrInsert) { StringRef PosName = sys::path::filename(RelPos); if (!OnlyUpdate) { if (PosName.empty()) return IA_AddNewMember; return IA_MoveNewMember; } // We could try to optimize this to a fstat, but it is not a common // operation. sys::fs::file_status Status; failIfError(sys::fs::status(*MI, Status), *MI); auto ModTimeOrErr = Member.getLastModified(); failIfError(ModTimeOrErr.takeError()); if (Status.getLastModificationTime() < ModTimeOrErr.get()) { if (PosName.empty()) return IA_AddOldMember; return IA_MoveOldMember; } if (PosName.empty()) return IA_AddNewMember; return IA_MoveNewMember; } llvm_unreachable("No such operation"); } // We have to walk this twice and computing it is not trivial, so creating an // explicit std::vector is actually fairly efficient. 
static std::vector computeNewArchiveMembers(ArchiveOperation Operation, object::Archive *OldArchive) { std::vector Ret; std::vector Moved; int InsertPos = -1; StringRef PosName = sys::path::filename(RelPos); if (OldArchive) { Error Err = Error::success(); for (auto &Child : OldArchive->children(Err)) { int Pos = Ret.size(); Expected NameOrErr = Child.getName(); failIfError(NameOrErr.takeError()); StringRef Name = NameOrErr.get(); if (Name == PosName) { assert(AddAfter || AddBefore); if (AddBefore) InsertPos = Pos; else InsertPos = Pos + 1; } std::vector::iterator MemberI = Members.end(); InsertAction Action = computeInsertAction(Operation, Child, Name, MemberI); switch (Action) { case IA_AddOldMember: addMember(Ret, Child); break; case IA_AddNewMember: addMember(Ret, *MemberI); break; case IA_Delete: break; case IA_MoveOldMember: addMember(Moved, Child); break; case IA_MoveNewMember: addMember(Moved, *MemberI); break; } if (MemberI != Members.end()) Members.erase(MemberI); } failIfError(std::move(Err)); } if (Operation == Delete) return Ret; if (!RelPos.empty() && InsertPos == -1) fail("Insertion point not found"); if (RelPos.empty()) InsertPos = Ret.size(); assert(unsigned(InsertPos) <= Ret.size()); int Pos = InsertPos; for (auto &M : Moved) { Ret.insert(Ret.begin() + Pos, std::move(M)); ++Pos; } for (unsigned I = 0; I != Members.size(); ++I) Ret.insert(Ret.begin() + InsertPos, NewArchiveMember()); Pos = InsertPos; for (auto &Member : Members) { addMember(Ret, Member, Pos); ++Pos; } return Ret; } static object::Archive::Kind getDefaultForHost() { return Triple(sys::getProcessTriple()).isOSDarwin() ? object::Archive::K_DARWIN : object::Archive::K_GNU; } static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) { Expected> OptionalObject = object::ObjectFile::createObjectFile(Member.Buf->getMemBufferRef()); if (OptionalObject) return isa(**OptionalObject) ? object::Archive::K_DARWIN : object::Archive::K_GNU; // squelch the error in case we had a non-object file consumeError(OptionalObject.takeError()); return getDefaultForHost(); } static void performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive, std::unique_ptr OldArchiveBuf, std::vector *NewMembersP) { std::vector NewMembers; if (!NewMembersP) NewMembers = computeNewArchiveMembers(Operation, OldArchive); object::Archive::Kind Kind; switch (FormatType) { case Default: if (Thin) Kind = object::Archive::K_GNU; else if (OldArchive) Kind = OldArchive->kind(); else if (NewMembersP) Kind = NewMembersP->size() ? getKindFromMember(NewMembersP->front()) : getDefaultForHost(); else Kind = NewMembers.size() ? getKindFromMember(NewMembers.front()) : getDefaultForHost(); break; case GNU: Kind = object::Archive::K_GNU; break; case BSD: if (Thin) fail("Only the gnu format has a thin mode"); Kind = object::Archive::K_BSD; break; case DARWIN: if (Thin) fail("Only the gnu format has a thin mode"); Kind = object::Archive::K_DARWIN; break; case Unknown: llvm_unreachable(""); } Error E = writeArchive(ArchiveName, NewMembersP ? *NewMembersP : NewMembers, Symtab, Kind, Deterministic, Thin, std::move(OldArchiveBuf)); failIfError(std::move(E), ArchiveName); } static void createSymbolTable(object::Archive *OldArchive) { // When an archive is created or modified, if the s option is given, the // resulting archive will have a current symbol table. If the S option // is given, it will have no symbol table. // In summary, we only need to update the symbol table if we have none. 
// This is actually very common because of broken build systems that think // they have to run ranlib. if (OldArchive->hasSymbolTable()) return; performWriteOperation(CreateSymTab, OldArchive, nullptr, nullptr); } static void performOperation(ArchiveOperation Operation, object::Archive *OldArchive, std::unique_ptr OldArchiveBuf, std::vector *NewMembers) { switch (Operation) { case Print: case DisplayTable: case Extract: performReadOperation(Operation, OldArchive); return; case Delete: case Move: case QuickAppend: case ReplaceOrInsert: performWriteOperation(Operation, OldArchive, std::move(OldArchiveBuf), NewMembers); return; case CreateSymTab: createSymbolTable(OldArchive); return; } llvm_unreachable("Unknown operation."); } static int performOperation(ArchiveOperation Operation, std::vector *NewMembers) { // Create or open the archive object. ErrorOr> Buf = MemoryBuffer::getFile(ArchiveName, -1, false); std::error_code EC = Buf.getError(); if (EC && EC != errc::no_such_file_or_directory) fail("error opening '" + ArchiveName + "': " + EC.message() + "!"); if (!EC) { Error Err = Error::success(); object::Archive Archive(Buf.get()->getMemBufferRef(), Err); EC = errorToErrorCode(std::move(Err)); failIfError(EC, "error loading '" + ArchiveName + "': " + EC.message() + "!"); performOperation(Operation, &Archive, std::move(Buf.get()), NewMembers); return 0; } assert(EC == errc::no_such_file_or_directory); if (!shouldCreateArchive(Operation)) { failIfError(EC, Twine("error loading '") + ArchiveName + "'"); } else { if (!Create) { // Produce a warning if we should and we're creating the archive errs() << ToolName << ": creating " << ArchiveName << "\n"; } } performOperation(Operation, nullptr, nullptr, NewMembers); return 0; } static void runMRIScript() { enum class MRICommand { AddLib, AddMod, Create, Delete, Save, End, Invalid }; ErrorOr> Buf = MemoryBuffer::getSTDIN(); failIfError(Buf.getError()); const MemoryBuffer &Ref = *Buf.get(); bool Saved = false; std::vector NewMembers; std::vector> ArchiveBuffers; std::vector> Archives; for (line_iterator I(Ref, /*SkipBlanks*/ true, ';'), E; I != E; ++I) { StringRef Line = *I; StringRef CommandStr, Rest; std::tie(CommandStr, Rest) = Line.split(' '); Rest = Rest.trim(); if (!Rest.empty() && Rest.front() == '"' && Rest.back() == '"') Rest = Rest.drop_front().drop_back(); auto Command = StringSwitch(CommandStr.lower()) .Case("addlib", MRICommand::AddLib) .Case("addmod", MRICommand::AddMod) .Case("create", MRICommand::Create) .Case("delete", MRICommand::Delete) .Case("save", MRICommand::Save) .Case("end", MRICommand::End) .Default(MRICommand::Invalid); switch (Command) { case MRICommand::AddLib: { auto BufOrErr = MemoryBuffer::getFile(Rest, -1, false); failIfError(BufOrErr.getError(), "Could not open library"); ArchiveBuffers.push_back(std::move(*BufOrErr)); auto LibOrErr = object::Archive::create(ArchiveBuffers.back()->getMemBufferRef()); failIfError(errorToErrorCode(LibOrErr.takeError()), "Could not parse library"); Archives.push_back(std::move(*LibOrErr)); object::Archive &Lib = *Archives.back(); { Error Err = Error::success(); for (auto &Member : Lib.children(Err)) addMember(NewMembers, Member); failIfError(std::move(Err)); } break; } case MRICommand::AddMod: addMember(NewMembers, Rest); break; case MRICommand::Create: Create = true; if (!ArchiveName.empty()) fail("Editing multiple archives not supported"); if (Saved) fail("File already saved"); ArchiveName = Rest; break; case MRICommand::Delete: { StringRef Name = sys::path::filename(Rest); 
llvm::erase_if(NewMembers, [=](NewArchiveMember &M) { return M.MemberName == Name; }); break; } case MRICommand::Save: Saved = true; break; case MRICommand::End: break; case MRICommand::Invalid: fail("Unknown command: " + CommandStr); } } // Nothing to do if not saved. if (Saved) performOperation(ReplaceOrInsert, &NewMembers); exit(0); } static bool handleGenericOption(StringRef arg) { if (arg == "-help" || arg == "--help") { printHelpMessage(); return true; } if (arg == "-version" || arg == "--version") { cl::PrintVersionMessage(); return true; } return false; } static int ar_main(int argc, char **argv) { SmallVector Argv(argv, argv + argc); BumpPtrAllocator Alloc; StringSaver Saver(Alloc); cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv); for(size_t i = 1; i < Argv.size(); ++i) { StringRef Arg = Argv[i]; const char *match; auto MatchFlagWithArg = [&](const char *expected) { size_t len = strlen(expected); if (Arg == expected) { if (++i >= Argv.size()) fail(std::string(expected) + " requires an argument"); match = Argv[i]; return true; } if (Arg.startswith(expected) && Arg.size() > len && Arg[len] == '=') { match = Arg.data() + len + 1; return true; } return false; }; if (handleGenericOption(Argv[i])) return 0; if (Arg == "--") { for(; i < Argv.size(); ++i) PositionalArgs.push_back(Argv[i]); break; } if (Arg[0] == '-') { if (Arg.startswith("--")) Arg = Argv[i] + 2; else Arg = Argv[i] + 1; if (Arg == "M") { MRI = true; } else if (MatchFlagWithArg("format")) { FormatType = StringSwitch(match) .Case("default", Default) .Case("gnu", GNU) .Case("darwin", DARWIN) .Case("bsd", BSD) .Default(Unknown); if (FormatType == Unknown) fail(std::string("Invalid format ") + match); } else if (MatchFlagWithArg("plugin")) { // Ignored. } else { Options += Argv[i] + 1; } } else if (Options.empty()) { Options += Argv[i]; } else { PositionalArgs.push_back(Argv[i]); } } ArchiveOperation Operation = parseCommandLine(); return performOperation(Operation, nullptr); } static int ranlib_main(int argc, char **argv) { bool ArchiveSpecified = false; for(int i = 1; i < argc; ++i) { if (handleGenericOption(argv[i])) { return 0; } else { if (ArchiveSpecified) fail("Exactly one archive should be specified"); ArchiveSpecified = true; ArchiveName = argv[i]; } } return performOperation(CreateSymTab, nullptr); } int main(int argc, char **argv) { InitLLVM X(argc, argv); ToolName = argv[0]; llvm::InitializeAllTargetInfos(); llvm::InitializeAllTargetMCs(); llvm::InitializeAllAsmParsers(); Stem = sys::path::stem(ToolName); if (Stem.contains_lower("dlltool")) return dlltoolDriverMain(makeArrayRef(argv, argc)); if (Stem.contains_lower("ranlib")) return ranlib_main(argc, argv); if (Stem.contains_lower("lib")) return libDriverMain(makeArrayRef(argv, argc)); if (Stem.contains_lower("ar")) return ar_main(argc, argv); fail("Not ranlib, ar, lib or dlltool!"); } Index: vendor/llvm/dist-release_70/tools/llvm-shlib/CMakeLists.txt =================================================================== --- vendor/llvm/dist-release_70/tools/llvm-shlib/CMakeLists.txt (revision 337298) +++ vendor/llvm/dist-release_70/tools/llvm-shlib/CMakeLists.txt (revision 337299) @@ -1,110 +1,106 @@ # This tool creates a shared library from the LLVM libraries. Generating this # library is enabled by setting LLVM_BUILD_LLVM_DYLIB=yes on the CMake # commandline. By default the shared library only exports the LLVM C API. 
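Since the comment above only names the switch, here is a hedged sketch of a configure step that would enable this shared library; the generator, build type, component list and source path are placeholder choices for illustration, not dictated by this file:

# sketch only: enabling libLLVM when configuring an LLVM build tree
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release \
      -DLLVM_BUILD_LLVM_DYLIB=ON \
      -DLLVM_DYLIB_COMPONENTS=all \
      ../llvm
ninja LLVM
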
set(SOURCES libllvm.cpp ) llvm_map_components_to_libnames(LIB_NAMES ${LLVM_DYLIB_COMPONENTS}) if(LLVM_LINK_LLVM_DYLIB AND LLVM_DYLIB_EXPORTED_SYMBOL_FILE) message(WARNING "Using LLVM_LINK_LLVM_DYLIB with LLVM_DYLIB_EXPORTED_SYMBOL_FILE may not work. Use at your own risk.") endif() # libLLVM.so should not have any dependencies on any other LLVM # shared libraries. When using the "all" pseudo-component, # LLVM_AVAILABLE_LIBS is added to the dependencies, which may # contain shared libraries (e.g. libLTO). # # Also exclude libLLVMTableGen for the following reasons: # - it is only used by internal *-tblgen utilities; # - it pollutes the global options space. foreach(lib ${LIB_NAMES}) get_target_property(t ${lib} TYPE) if("${lib}" STREQUAL "LLVMTableGen") elseif("x${t}" STREQUAL "xSTATIC_LIBRARY") list(APPEND FILTERED_LIB_NAMES ${lib}) endif() endforeach() set(LIB_NAMES ${FILTERED_LIB_NAMES}) if(LLVM_DYLIB_EXPORTED_SYMBOL_FILE) set(LLVM_EXPORTED_SYMBOL_FILE ${LLVM_DYLIB_EXPORTED_SYMBOL_FILE}) add_custom_target(libLLVMExports DEPENDS ${LLVM_EXPORTED_SYMBOL_FILE}) endif() add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${SOURCES}) list(REMOVE_DUPLICATES LIB_NAMES) if(("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux") OR (MINGW) OR (HAIKU) OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "FreeBSD") OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "OpenBSD") OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "Fuchsia") OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "DragonFly") OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "SunOS")) # FIXME: It should be "GNU ld for elf" configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/simple_version_script.map.in ${LLVM_LIBRARY_DIR}/tools/llvm-shlib/simple_version_script.map) # GNU ld doesn't resolve symbols in the version script. set(LIB_NAMES -Wl,--whole-archive ${LIB_NAMES} -Wl,--no-whole-archive) if (NOT LLVM_LINKER_IS_SOLARISLD) # Solaris ld does not accept global: *; so there is no way to version *all* global symbols set(LIB_NAMES -Wl,--version-script,${LLVM_LIBRARY_DIR}/tools/llvm-shlib/simple_version_script.map ${LIB_NAMES}) endif() elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") set(LIB_NAMES -Wl,-all_load ${LIB_NAMES}) endif() target_link_libraries(LLVM PRIVATE ${LIB_NAMES}) -if (LLVM_DYLIB_SYMBOL_VERSIONING) - set_property(TARGET LLVM APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--default-symver") -endif() - if (APPLE) set_property(TARGET LLVM APPEND_STRING PROPERTY LINK_FLAGS " -compatibility_version 1 -current_version ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}") endif() if(TARGET libLLVMExports) add_dependencies(LLVM libLLVMExports) endif() if(LLVM_BUILD_LLVM_C_DYLIB) # To get the export list for a single llvm library: # nm ${LIB_PATH} | awk "/T _LLVM/ { print $3 }" | sort -u | sed -e "s/^_//g" > ${LIB_PATH}.exports if(NOT APPLE) message(FATAL_ERROR "Generating libLLVM-c is only supported on Darwin") endif() set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_BINARY_DIR}/libllvm-c.exports) set(LIB_DIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}) set(LIB_NAME ${LIB_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}LLVM) set(LIB_PATH ${LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) set(LIB_EXPORTS_PATH ${LIB_NAME}.exports) list(APPEND LLVM_DYLIB_REQUIRED_EXPORTS ${LIB_EXPORTS_PATH}) add_custom_command(OUTPUT ${LLVM_EXPORTED_SYMBOL_FILE} COMMAND nm ${LIB_PATH} | awk "/T _LLVM/ || /T LLVM/ { print $3 }" | sort -u | sed -e "s/^_//g" > ${LLVM_EXPORTED_SYMBOL_FILE} WORKING_DIRECTORY ${LIB_DIR} DEPENDS LLVM COMMENT "Generating Export list for LLVM..." 
VERBATIM ) add_custom_target(libLLVMCExports DEPENDS ${LLVM_EXPORTED_SYMBOL_FILE}) add_llvm_library(LLVM-C SHARED ${SOURCES}) target_link_libraries(LLVM-C PUBLIC LLVM) add_dependencies(LLVM-C libLLVMCExports) set_property(TARGET LLVM-C APPEND_STRING PROPERTY LINK_FLAGS " -compatibility_version 1 -current_version ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH} -Wl,-reexport_library ${LIB_PATH}") endif() Index: vendor/llvm/dist-release_70/utils/release/tag.sh =================================================================== --- vendor/llvm/dist-release_70/utils/release/tag.sh (revision 337298) +++ vendor/llvm/dist-release_70/utils/release/tag.sh (revision 337299) @@ -1,130 +1,130 @@ #!/bin/sh #===-- tag.sh - Tag the LLVM release candidates ----------------------------===# # # The LLVM Compiler Infrastructure # # This file is distributed under the University of Illinois Open Source # License. # #===------------------------------------------------------------------------===# # # Create branches and release candidates for the LLVM release. # #===------------------------------------------------------------------------===# set -e release="" rc="" rebranch="no" -projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind" +projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind debuginfo-tests" dryrun="" revision="HEAD" base_url="https://llvm.org/svn/llvm-project" usage() { echo "usage: `basename $0` -release [-rebranch] [-revision ] [-dry-run]" echo "usage: `basename $0` -release -rc [-dry-run]" echo " " echo " -release The version number of the release" echo " -rc The release candidate number" echo " -rebranch Remove existing branch, if present, before branching" echo " -final Tag final release candidate" echo " -revision Revision to branch off (default: HEAD)" echo " -dry-run Make no changes to the repository, just print the commands" } tag_version() { set -x for proj in $projects; do if svn ls $base_url/$proj/branches/release_$branch_release > /dev/null 2>&1 ; then if [ $rebranch = "no" ]; then continue fi ${dryrun} svn remove -m "Removing old release_$branch_release branch for rebranching." \ $base_url/$proj/branches/release_$branch_release fi ${dryrun} svn copy -m "Creating release_$branch_release branch off revision ${revision}" \ -r ${revision} \ $base_url/$proj/trunk \ $base_url/$proj/branches/release_$branch_release done set +x } tag_release_candidate() { set -x for proj in $projects ; do if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release > /dev/null 2>&1 ; then ${dryrun} svn mkdir -m "Creating release directory for release_$tag_release." $base_url/$proj/tags/RELEASE_$tag_release fi if ! 
svn ls $base_url/$proj/tags/RELEASE_$tag_release/$rc > /dev/null 2>&1 ; then
            ${dryrun} svn copy -m "Creating release candidate $rc from release_$tag_release branch" \
                $base_url/$proj/branches/release_$branch_release \
                $base_url/$proj/tags/RELEASE_$tag_release/$rc
        fi
    done
    set +x
}

while [ $# -gt 0 ]; do
    case $1 in
        -release | --release )
            shift
            release=$1
            ;;
        -rc | --rc )
            shift
            rc="rc$1"
            ;;
        -rebranch | --rebranch )
            rebranch="yes"
            ;;
        -final | --final )
            rc="final"
            ;;
        -revision | --revision )
            shift
            revision="$1"
            ;;
        -dry-run | --dry-run )
            dryrun="echo"
            ;;
        -h | --help | -help )
            usage
            exit 0
            ;;
        * )
            echo "unknown option: $1"
            usage
            exit 1
            ;;
    esac
    shift
done

if [ "x$release" = "x" ]; then
    echo "error: need to specify a release version"
    echo
    usage
    exit 1
fi

branch_release=`echo $release | sed -e 's,\([0-9]*\.[0-9]*\).*,\1,' | sed -e 's,\.,,g'`
tag_release=`echo $release | sed -e 's,\.,,g'`

if [ "x$rc" = "x" ]; then
    tag_version
else
    if [ "$revision" != "HEAD" ]; then
        echo "error: cannot use -revision with -rc"
        echo
        usage
        exit 1
    fi
    tag_release_candidate
fi

exit 0
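For orientation, a hedged sketch of how a release manager might drive tag.sh, following the usage() text above (7.0.0 is an example version number, not taken from this change):

# sketch only: typical tag.sh invocations
./tag.sh -release 7.0.0 -dry-run   # print the svn commands that would create the release_70 branch
./tag.sh -release 7.0.0 -rc 1      # tag release candidate 1 from that branch
./tag.sh -release 7.0.0 -final     # tag the final release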