Index: vendor/llvm/dist-release_70/CMakeLists.txt
===================================================================
--- vendor/llvm/dist-release_70/CMakeLists.txt	(revision 337630)
+++ vendor/llvm/dist-release_70/CMakeLists.txt	(revision 337631)
@@ -1,1042 +1,1042 @@
# See docs/CMake.html for instructions about how to build LLVM with CMake.
cmake_minimum_required(VERSION 3.4.3)
cmake_policy(SET CMP0022 NEW)
cmake_policy(SET CMP0048 NEW)
# CMake 3.1 and higher include generator expressions of the form
# $ in the SOURCES property.  These need to be
# stripped everywhere that access the SOURCES property, so we just
# defer to the OLD behavior of not including generator expressions
# in the output for now.
cmake_policy(SET CMP0051 OLD)
cmake_policy(SET CMP0056 NEW)
cmake_policy(SET CMP0057 NEW)
if(POLICY CMP0068)
  cmake_policy(SET CMP0068 NEW)
  set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
endif()
if(NOT DEFINED LLVM_VERSION_MAJOR)
  set(LLVM_VERSION_MAJOR 7)
endif()
if(NOT DEFINED LLVM_VERSION_MINOR)
  set(LLVM_VERSION_MINOR 0)
endif()
if(NOT DEFINED LLVM_VERSION_PATCH)
  set(LLVM_VERSION_PATCH 0)
endif()
if(NOT DEFINED LLVM_VERSION_SUFFIX)
  set(LLVM_VERSION_SUFFIX "")
endif()
if (NOT PACKAGE_VERSION)
  set(PACKAGE_VERSION
    "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}")
endif()
if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQUAL ""))
  message(WARNING "Visual Studio generators use the x86 host compiler by "
                  "default, even for 64-bit targets. This can result in linker "
                  "instability and out of memory errors. To use the 64-bit "
                  "host compiler, pass -Thost=x64 on the CMake command line.")
endif()
project(LLVM
  VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}
  LANGUAGES C CXX ASM)
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "No build type selected, default to Debug")
  set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type (default Debug)" FORCE)
endif()
# This should only apply if you are both on an Apple host, and targeting Apple.
if(CMAKE_HOST_APPLE AND APPLE) # if CMAKE_LIBTOOL is not set, try and find it with xcrun or find_program if(NOT CMAKE_LIBTOOL) if(NOT CMAKE_XCRUN) find_program(CMAKE_XCRUN NAMES xcrun) endif() if(CMAKE_XCRUN) execute_process(COMMAND ${CMAKE_XCRUN} -find libtool OUTPUT_VARIABLE CMAKE_LIBTOOL OUTPUT_STRIP_TRAILING_WHITESPACE) endif() if(NOT CMAKE_LIBTOOL OR NOT EXISTS CMAKE_LIBTOOL) find_program(CMAKE_LIBTOOL NAMES libtool) endif() endif() get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) if(CMAKE_LIBTOOL) set(CMAKE_LIBTOOL ${CMAKE_LIBTOOL} CACHE PATH "libtool executable") message(STATUS "Found libtool - ${CMAKE_LIBTOOL}") execute_process(COMMAND ${CMAKE_LIBTOOL} -V OUTPUT_VARIABLE LIBTOOL_V_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE) if("${LIBTOOL_V_OUTPUT}" MATCHES ".*cctools-([0-9.]+).*") string(REGEX REPLACE ".*cctools-([0-9.]+).*" "\\1" LIBTOOL_VERSION ${LIBTOOL_V_OUTPUT}) if(NOT LIBTOOL_VERSION VERSION_LESS "862") set(LIBTOOL_NO_WARNING_FLAG "-no_warning_for_no_symbols") endif() endif() foreach(lang ${languages}) set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "\"${CMAKE_LIBTOOL}\" -static ${LIBTOOL_NO_WARNING_FLAG} -o \ ") endforeach() endif() # If DYLD_LIBRARY_PATH is set we need to set it on archiver commands if(DYLD_LIBRARY_PATH) set(dyld_envar "DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}") foreach(lang ${languages}) foreach(cmd ${CMAKE_${lang}_CREATE_STATIC_LIBRARY}) list(APPEND CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW "${dyld_envar} ${cmd}") endforeach() set(CMAKE_${lang}_CREATE_STATIC_LIBRARY ${CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW}) endforeach() endif() endif() # Side-by-side subprojects layout: automatically set the # LLVM_EXTERNAL_${project}_SOURCE_DIR using LLVM_ALL_PROJECTS # This allows an easy way of setting up a build directory for llvm and another # one for llvm+clang+... using the same sources. set(LLVM_ALL_PROJECTS "clang;libcxx;libcxxabi;lldb;compiler-rt;lld;polly;debuginfo-tests") set(LLVM_ENABLE_PROJECTS "" CACHE STRING "Semicolon-separated list of projects to build (${LLVM_ALL_PROJECTS}), or \"all\".") if( LLVM_ENABLE_PROJECTS STREQUAL "all" ) set( LLVM_ENABLE_PROJECTS ${LLVM_ALL_PROJECTS}) endif() foreach(proj ${LLVM_ENABLE_PROJECTS}) set(PROJ_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${proj}") if(NOT EXISTS "${PROJ_DIR}" OR NOT IS_DIRECTORY "${PROJ_DIR}") message(FATAL_ERROR "LLVM_ENABLE_PROJECTS requests ${proj} but directory not found: ${PROJ_DIR}") endif() string(TOUPPER "${proj}" upper_proj) STRING(REGEX REPLACE "-" "_" upper_proj ${upper_proj}) set(LLVM_EXTERNAL_${upper_proj}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${proj}") # There is a widely spread opinion that clang-tools-extra should be merged # into clang. The following simulates it by always enabling clang-tools-extra # when enabling clang. 
if (proj STREQUAL "clang") set(LLVM_EXTERNAL_CLANG_TOOLS_EXTRA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../clang-tools-extra") endif() endforeach() # Build llvm with ccache if the package is present set(LLVM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if(LLVM_CCACHE_BUILD) find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) set(LLVM_CCACHE_MAXSIZE "" CACHE STRING "Size of ccache") set(LLVM_CCACHE_DIR "" CACHE STRING "Directory to keep ccached data") set(LLVM_CCACHE_PARAMS "CCACHE_CPP2=yes CCACHE_HASHDIR=yes" CACHE STRING "Parameters to pass through to ccache") set(CCACHE_PROGRAM "${LLVM_CCACHE_PARAMS} ${CCACHE_PROGRAM}") if (LLVM_CCACHE_MAXSIZE) set(CCACHE_PROGRAM "CCACHE_MAXSIZE=${LLVM_CCACHE_MAXSIZE} ${CCACHE_PROGRAM}") endif() if (LLVM_CCACHE_DIR) set(CCACHE_PROGRAM "CCACHE_DIR=${LLVM_CCACHE_DIR} ${CCACHE_PROGRAM}") endif() set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM}) else() message(FATAL_ERROR "Unable to find the program ccache. Set LLVM_CCACHE_BUILD to OFF") endif() endif() option(LLVM_DEPENDENCY_DEBUGGING "Dependency debugging mode to verify correctly expressed library dependencies (Darwin only)" OFF) # Some features of the LLVM build may be disallowed when dependency debugging is # enabled. In particular you cannot use ccache because we want to force compile # operations to always happen. if(LLVM_DEPENDENCY_DEBUGGING) if(NOT CMAKE_HOST_APPLE) message(FATAL_ERROR "Dependency debugging is only currently supported on Darwin hosts.") endif() if(LLVM_CCACHE_BUILD) message(FATAL_ERROR "Cannot enable dependency debugging while using ccache.") endif() endif() option(LLVM_ENABLE_DAGISEL_COV "Debug: Prints tablegen patterns that were used for selecting" OFF) option(LLVM_ENABLE_GISEL_COV "Enable collection of GlobalISel rule coverage" OFF) if(LLVM_ENABLE_GISEL_COV) set(LLVM_GISEL_COV_PREFIX "${CMAKE_BINARY_DIR}/gisel-coverage-" CACHE STRING "Provide a filename prefix to collect the GlobalISel rule coverage") endif() # Add path for custom modules set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" ) # Generate a CompilationDatabase (compile_commands.json file) for our build, # for use by clang_complete, YouCompleteMe, etc. set(CMAKE_EXPORT_COMPILE_COMMANDS 1) option(LLVM_INSTALL_BINUTILS_SYMLINKS "Install symlinks from the binutils tool names to the corresponding LLVM tools." OFF) option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OFF) option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF) option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON) if ( LLVM_USE_FOLDERS ) set_property(GLOBAL PROPERTY USE_FOLDERS ON) endif() include(VersionFromVCS) option(LLVM_APPEND_VC_REV "Embed the version control system revision id in LLVM" ON) set(PACKAGE_NAME LLVM) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "https://bugs.llvm.org/") set(BUG_REPORT_URL "${PACKAGE_BUGREPORT}" CACHE STRING "Default URL where bug reports are to be submitted.") # Configure CPack. 
set(CPACK_PACKAGE_INSTALL_DIRECTORY "LLVM") set(CPACK_PACKAGE_VENDOR "LLVM") set(CPACK_PACKAGE_VERSION_MAJOR ${LLVM_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${LLVM_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${LLVM_VERSION_PATCH}) set(CPACK_PACKAGE_VERSION ${PACKAGE_VERSION}) set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.TXT") set(CPACK_NSIS_COMPRESSOR "/SOLID lzma \r\n SetCompressorDictSize 32") if(WIN32 AND NOT UNIX) set(CPACK_PACKAGE_INSTALL_REGISTRY_KEY "LLVM") set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_logo.bmp") set(CPACK_NSIS_MUI_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_icon.ico") set(CPACK_NSIS_MUI_UNIICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_icon.ico") set(CPACK_NSIS_MODIFY_PATH "ON") set(CPACK_NSIS_ENABLE_UNINSTALL_BEFORE_INSTALL "ON") if( CMAKE_CL_64 ) set(CPACK_NSIS_INSTALL_ROOT "$PROGRAMFILES64") endif() endif() include(CPack) # Sanity check our source directory to make sure that we are not trying to # generate an in-source build (unless on MSVC_IDE, where it is ok), and to make # sure that we don't have any stray generated files lying around in the tree # (which would end up getting picked up by header search, instead of the correct # versions). if( CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR AND NOT MSVC_IDE ) message(FATAL_ERROR "In-source builds are not allowed. CMake would overwrite the makefiles distributed with LLVM. Please create a directory and run cmake from there, passing the path to this source directory as the last argument. This process created the file `CMakeCache.txt' and the directory `CMakeFiles'. Please delete them.") endif() if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR ) file(GLOB_RECURSE tablegenned_files_on_include_dir "${CMAKE_CURRENT_SOURCE_DIR}/include/llvm/*.gen") file(GLOB_RECURSE tablegenned_files_on_lib_dir "${CMAKE_CURRENT_SOURCE_DIR}/lib/Target/*.inc") if( tablegenned_files_on_include_dir OR tablegenned_files_on_lib_dir) message(FATAL_ERROR "Apparently there is a previous in-source build, probably as the result of running `configure' and `make' on ${CMAKE_CURRENT_SOURCE_DIR}. This may cause problems. The suspicious files are: ${tablegenned_files_on_lib_dir} ${tablegenned_files_on_include_dir} Please clean the source directory.") endif() endif() string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) if (CMAKE_BUILD_TYPE AND NOT uppercase_CMAKE_BUILD_TYPE MATCHES "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$") message(FATAL_ERROR "Invalid value for CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") endif() set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" ) set(LLVM_TOOLS_INSTALL_DIR "bin" CACHE STRING "Path for binary subdirectory (defaults to 'bin')") mark_as_advanced(LLVM_TOOLS_INSTALL_DIR) set(LLVM_UTILS_INSTALL_DIR "${LLVM_TOOLS_INSTALL_DIR}" CACHE STRING "Path to install LLVM utilities (enabled by LLVM_INSTALL_UTILS=ON) (defaults to LLVM_TOOLS_INSTALL_DIR)") mark_as_advanced(LLVM_UTILS_INSTALL_DIR) # They are used as destination of target generators. set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}) if(WIN32 OR CYGWIN) # DLL platform -- put DLLs into bin. set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) else() set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) endif() # Each of them corresponds to llvm-config's. 
set(LLVM_TOOLS_BINARY_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) # --bindir set(LLVM_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) # --libdir set(LLVM_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) # --src-root set(LLVM_MAIN_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/include ) # --includedir set(LLVM_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} ) # --prefix # Note: LLVM_CMAKE_PATH does not include generated files set(LLVM_CMAKE_PATH ${LLVM_MAIN_SRC_DIR}/cmake/modules) set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples) set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) # List of all targets to be built by default: set(LLVM_ALL_TARGETS AArch64 AMDGPU ARM BPF Hexagon Lanai Mips MSP430 NVPTX PowerPC Sparc SystemZ X86 XCore ) # List of targets with JIT support: set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ) set(LLVM_TARGETS_TO_BUILD "all" CACHE STRING "Semicolon-separated list of targets to build, or \"all\".") set(LLVM_EXPERIMENTAL_TARGETS_TO_BUILD "" CACHE STRING "Semicolon-separated list of experimental targets to build.") option(BUILD_SHARED_LIBS "Build all libraries as shared libraries instead of static" OFF) option(LLVM_ENABLE_BACKTRACES "Enable embedding backtraces on crash." ON) if(LLVM_ENABLE_BACKTRACES) set(ENABLE_BACKTRACES 1) endif() option(LLVM_ENABLE_CRASH_OVERRIDES "Enable crash overrides." ON) if(LLVM_ENABLE_CRASH_OVERRIDES) set(ENABLE_CRASH_OVERRIDES 1) endif() option(LLVM_ENABLE_FFI "Use libffi to call external functions from the interpreter" OFF) set(FFI_LIBRARY_DIR "" CACHE PATH "Additional directory, where CMake should search for libffi.so") set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should search for ffi.h or ffi/ffi.h") set(LLVM_TARGET_ARCH "host" CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.") option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON) set(LLVM_ENABLE_LIBXML2 "ON" CACHE STRING "Use libxml2 if available. Can be ON, OFF, or FORCE_ON") option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON) option(LLVM_ENABLE_LIBPFM "Use libpfm for performance counters if available." ON) option(LLVM_ENABLE_THREADS "Use threads if available." ON) option(LLVM_ENABLE_ZLIB "Use zlib for compression/decompression if available." ON) if( LLVM_TARGETS_TO_BUILD STREQUAL "all" ) set( LLVM_TARGETS_TO_BUILD ${LLVM_ALL_TARGETS} ) endif() set(LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD} ${LLVM_EXPERIMENTAL_TARGETS_TO_BUILD}) list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD) option(LLVM_ENABLE_PIC "Build Position-Independent Code" ON) option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON) option(LLVM_ENABLE_MODULES "Compile with C++ modules enabled." OFF) if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") option(LLVM_ENABLE_MODULE_DEBUGGING "Compile with -gmodules." ON) option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." OFF) else() option(LLVM_ENABLE_MODULE_DEBUGGING "Compile with -gmodules." OFF) option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." ON) endif() option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF) option(LLVM_ENABLE_CXX1Z "Compile with C++1z enabled." OFF) option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF) option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF) option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON) option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." 
OFF) option(LLVM_ENABLE_DUMP "Enable dump functions even when assertions are disabled" OFF) if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) option(LLVM_ENABLE_ASSERTIONS "Enable assertions" OFF) else() option(LLVM_ENABLE_ASSERTIONS "Enable assertions" ON) endif() option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF) set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING "Enable abi-breaking checks. Can be WITH_ASSERTS, FORCE_ON or FORCE_OFF.") option(LLVM_FORCE_USE_OLD_HOST_TOOLCHAIN "Set to ON to force using an old, unsupported host toolchain." OFF) option(LLVM_USE_INTEL_JITEVENTS "Use Intel JIT API to inform Intel(R) VTune(TM) Amplifier XE 2011 about JIT code" OFF) if( LLVM_USE_INTEL_JITEVENTS ) # Verify we are on a supported platform if( NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) message(FATAL_ERROR "Intel JIT API support is available on Linux and Windows only.") endif() endif( LLVM_USE_INTEL_JITEVENTS ) option(LLVM_USE_OPROFILE "Use opagent JIT interface to inform OProfile about JIT code" OFF) option(LLVM_EXTERNALIZE_DEBUGINFO "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) option(LLVM_CODESIGNING_IDENTITY "Sign executables and dylibs with the given identity (Darwin Only)" OFF) # If enabled, verify we are on a platform that supports oprofile. if( LLVM_USE_OPROFILE ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) message(FATAL_ERROR "OProfile support is available on Linux only.") endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) endif( LLVM_USE_OPROFILE ) option(LLVM_USE_PERF "Use perf JIT interface to inform perf about JIT code" OFF) # If enabled, verify we are on a platform that supports perf. if( LLVM_USE_PERF ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) message(FATAL_ERROR "perf support is available on Linux only.") endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) endif( LLVM_USE_PERF ) set(LLVM_USE_SANITIZER "" CACHE STRING "Define the sanitizer used to build binaries and tests.") option(LLVM_OPTIMIZE_SANITIZED_BUILDS "Pass -O1 on debug sanitizer builds" ON) set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH "Path to fuzzing library for linking with fuzz targets") option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm." OFF) option(LLVM_POLLY_LINK_INTO_TOOLS "Statically link Polly into tools (if available)" ON) option(LLVM_POLLY_BUILD "Build LLVM with Polly" ON) if (EXISTS ${LLVM_MAIN_SRC_DIR}/tools/polly/CMakeLists.txt) set(POLLY_IN_TREE TRUE) elseif(LLVM_EXTERNAL_POLLY_SOURCE_DIR) set(POLLY_IN_TREE TRUE) else() set(POLLY_IN_TREE FALSE) endif() if (LLVM_POLLY_BUILD AND POLLY_IN_TREE) set(WITH_POLLY ON) else() set(WITH_POLLY OFF) endif() if (LLVM_POLLY_LINK_INTO_TOOLS AND WITH_POLLY) set(LINK_POLLY_INTO_TOOLS ON) else() set(LINK_POLLY_INTO_TOOLS OFF) endif() # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) # TODO: support other platforms and toolchains. option(LLVM_BUILD_32_BITS "Build 32 bits executables and libraries." OFF) endif() # Define the default arguments to use with 'lit', and an option for the user to # override. set(LIT_ARGS_DEFAULT "-sv") if (MSVC_IDE OR XCODE) set(LIT_ARGS_DEFAULT "${LIT_ARGS_DEFAULT} --no-progress-bar") endif() set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit") # On Win32 hosts, provide an option to specify the path to the GnuWin32 tools. 
if( WIN32 AND NOT CYGWIN ) set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools") endif() # Define options to control the inclusion and default build behavior for # components which may not strictly be necessary (tools, examples, and tests). # # This is primarily to support building smaller or faster project files. option(LLVM_INCLUDE_TOOLS "Generate build targets for the LLVM tools." ON) option(LLVM_BUILD_TOOLS "Build the LLVM tools. If OFF, just generate build targets." ON) option(LLVM_INCLUDE_UTILS "Generate build targets for the LLVM utils." ON) option(LLVM_BUILD_UTILS "Build LLVM utility binaries. If OFF, just generate build targets." ON) option(LLVM_INCLUDE_RUNTIMES "Generate build targets for the LLVM runtimes." ON) option(LLVM_BUILD_RUNTIMES "Build the LLVM runtimes. If OFF, just generate build targets." ON) option(LLVM_BUILD_RUNTIME "Build the LLVM runtime libraries." ON) option(LLVM_BUILD_EXAMPLES "Build the LLVM example programs. If OFF, just generate build targets." OFF) option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON) option(LLVM_BUILD_TESTS "Build LLVM unit tests. If OFF, just generate build targets." OFF) option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON) option(LLVM_INCLUDE_GO_TESTS "Include the Go bindings tests in test build targets." ON) option (LLVM_BUILD_DOCS "Build the llvm documentation." OFF) option (LLVM_INCLUDE_DOCS "Generate build targets for llvm documentation." ON) option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OFF) option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF) option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON) option (LLVM_ENABLE_BINDINGS "Build bindings." ON) set(LLVM_INSTALL_DOXYGEN_HTML_DIR "share/doc/llvm/doxygen-html" CACHE STRING "Doxygen-generated HTML documentation install directory") set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "share/doc/llvm/ocaml-html" CACHE STRING "OCamldoc-generated HTML documentation install directory") option (LLVM_BUILD_EXTERNAL_COMPILER_RT "Build compiler-rt as an external project." OFF) option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO "Show target and host info when tools are invoked with --version." ON) # You can configure which libraries from LLVM you want to include in the # shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited # list of LLVM components. All component names handled by llvm-config are valid. 
if(NOT DEFINED LLVM_DYLIB_COMPONENTS) set(LLVM_DYLIB_COMPONENTS "all" CACHE STRING "Semicolon-separated list of components to include in libLLVM, or \"all\".") endif() option(LLVM_LINK_LLVM_DYLIB "Link tools against the libllvm dynamic library" OFF) option(LLVM_BUILD_LLVM_C_DYLIB "Build libllvm-c re-export library (Darwin Only)" OFF) set(LLVM_BUILD_LLVM_DYLIB_default OFF) if(LLVM_LINK_LLVM_DYLIB OR LLVM_BUILD_LLVM_C_DYLIB) set(LLVM_BUILD_LLVM_DYLIB_default ON) endif() option(LLVM_BUILD_LLVM_DYLIB "Build libllvm dynamic library" ${LLVM_BUILD_LLVM_DYLIB_default}) option(LLVM_OPTIMIZED_TABLEGEN "Force TableGen to be built with optimization" OFF) if(CMAKE_CROSSCOMPILING OR (LLVM_OPTIMIZED_TABLEGEN AND (LLVM_ENABLE_ASSERTIONS OR CMAKE_CONFIGURATION_TYPES))) set(LLVM_USE_HOST_TOOLS ON) endif() if (MSVC_IDE AND NOT (MSVC_VERSION LESS 1900)) option(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION "Configure project to use Visual Studio native visualizers" TRUE) else() set(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION FALSE CACHE INTERNAL "For Visual Studio 2013, manually copy natvis files to Documents\\Visual Studio 2013\\Visualizers" FORCE) endif() if (LLVM_BUILD_INSTRUMENTED OR LLVM_BUILD_INSTRUMENTED_COVERAGE OR LLVM_ENABLE_IR_PGO) if(NOT LLVM_PROFILE_MERGE_POOL_SIZE) # A pool size of 1-2 is probably sufficient on a SSD. 3-4 should be fine # for spining disks. Anything higher may only help on slower mediums. set(LLVM_PROFILE_MERGE_POOL_SIZE "4") endif() if(NOT LLVM_PROFILE_FILE_PATTERN) if(NOT LLVM_PROFILE_DATA_DIR) file(TO_NATIVE_PATH "${LLVM_BINARY_DIR}/profiles" LLVM_PROFILE_DATA_DIR) endif() file(TO_NATIVE_PATH "${LLVM_PROFILE_DATA_DIR}/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN) endif() endif() if (LLVM_BUILD_STATIC) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static") endif() # Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV. set(LLVM_TARGET_TRIPLE_ENV CACHE STRING "The name of environment variable to override default target. Disabled by blank.") mark_as_advanced(LLVM_TARGET_TRIPLE_ENV) set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR OFF CACHE BOOL "Enable per-target runtimes directory") # All options referred to from HandleLLVMOptions have to be specified # BEFORE this include, otherwise options will not be correctly set on # first cmake run include(config-ix) string(REPLACE "Native" ${LLVM_NATIVE_ARCH} LLVM_TARGETS_TO_BUILD "${LLVM_TARGETS_TO_BUILD}") list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD) # By default, we target the host, but this can be overridden at CMake # invocation time. set(LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_HOST_TRIPLE}" CACHE STRING "Default target for which LLVM will generate code." ) set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}") message(STATUS "LLVM host triple: ${LLVM_HOST_TRIPLE}") message(STATUS "LLVM default target triple: ${LLVM_DEFAULT_TARGET_TRIPLE}") include(HandleLLVMOptions) # Verify that we can find a Python 2 interpreter. Python 3 is unsupported. # FIXME: We should support systems with only Python 3, but that requires work # on LLDB. set(Python_ADDITIONAL_VERSIONS 2.7) include(FindPythonInterp) if( NOT PYTHONINTERP_FOUND ) message(FATAL_ERROR "Unable to find Python interpreter, required for builds and testing. 
Please install Python or specify the PYTHON_EXECUTABLE CMake variable.") endif() if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 ) message(FATAL_ERROR "Python 2.7 or newer is required") endif() ###### # LLVMBuild Integration # # We use llvm-build to generate all the data required by the CMake based # build system in one swoop: # # - We generate a file (a CMake fragment) in the object root which contains # all the definitions that are required by CMake. # # - We generate the library table used by llvm-config. # # - We generate the dependencies for the CMake fragment, so that we will # automatically reconfigure outselves. set(LLVMBUILDTOOL "${LLVM_MAIN_SRC_DIR}/utils/llvm-build/llvm-build") set(LLVMCONFIGLIBRARYDEPENDENCIESINC "${LLVM_BINARY_DIR}/tools/llvm-config/LibraryDependencies.inc") set(LLVMBUILDCMAKEFRAG "${LLVM_BINARY_DIR}/LLVMBuild.cmake") # Create the list of optional components that are enabled if (LLVM_USE_INTEL_JITEVENTS) set(LLVMOPTIONALCOMPONENTS IntelJITEvents) endif (LLVM_USE_INTEL_JITEVENTS) if (LLVM_USE_OPROFILE) set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} OProfileJIT) endif (LLVM_USE_OPROFILE) if (LLVM_USE_PERF) set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} PerfJITEvents) endif (LLVM_USE_PERF) message(STATUS "Constructing LLVMBuild project information") execute_process( COMMAND ${PYTHON_EXECUTABLE} -B ${LLVMBUILDTOOL} --native-target "${LLVM_NATIVE_ARCH}" --enable-targets "${LLVM_TARGETS_TO_BUILD}" --enable-optional-components "${LLVMOPTIONALCOMPONENTS}" --write-library-table ${LLVMCONFIGLIBRARYDEPENDENCIESINC} --write-cmake-fragment ${LLVMBUILDCMAKEFRAG} OUTPUT_VARIABLE LLVMBUILDOUTPUT ERROR_VARIABLE LLVMBUILDERRORS OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE LLVMBUILDRESULT) # On Win32, CMake doesn't properly handle piping the default output/error # streams into the GUI console. So, we explicitly catch and report them. if( NOT "${LLVMBUILDOUTPUT}" STREQUAL "") message(STATUS "llvm-build output: ${LLVMBUILDOUTPUT}") endif() if( NOT "${LLVMBUILDRESULT}" STREQUAL "0" ) message(FATAL_ERROR "Unexpected failure executing llvm-build: ${LLVMBUILDERRORS}") endif() # Include the generated CMake fragment. This will define properties from the # LLVMBuild files in a format which is easy to consume from CMake, and will add # the dependencies so that CMake will reconfigure properly when the LLVMBuild # files change. include(${LLVMBUILDCMAKEFRAG}) ###### # Configure all of the various header file fragments LLVM uses which depend on # configuration variables. set(LLVM_ENUM_TARGETS "") set(LLVM_ENUM_ASM_PRINTERS "") set(LLVM_ENUM_ASM_PARSERS "") set(LLVM_ENUM_DISASSEMBLERS "") foreach(t ${LLVM_TARGETS_TO_BUILD}) set( td ${LLVM_MAIN_SRC_DIR}/lib/Target/${t} ) list(FIND LLVM_ALL_TARGETS ${t} idx) list(FIND LLVM_EXPERIMENTAL_TARGETS_TO_BUILD ${t} idy) # At this point, LLVMBUILDTOOL already checked all the targets passed in # LLVM_TARGETS_TO_BUILD and LLVM_EXPERIMENTAL_TARGETS_TO_BUILD, so # this test just makes sure that any experimental targets were passed via # LLVM_EXPERIMENTAL_TARGETS_TO_BUILD, not LLVM_TARGETS_TO_BUILD. 
if( idx LESS 0 AND idy LESS 0 ) message(FATAL_ERROR "The target `${t}' is experimental and must be passed " "via LLVM_EXPERIMENTAL_TARGETS_TO_BUILD.") else() set(LLVM_ENUM_TARGETS "${LLVM_ENUM_TARGETS}LLVM_TARGET(${t})\n") endif() file(GLOB asmp_file "${td}/*AsmPrinter.cpp") if( asmp_file ) set(LLVM_ENUM_ASM_PRINTERS "${LLVM_ENUM_ASM_PRINTERS}LLVM_ASM_PRINTER(${t})\n") endif() if( EXISTS ${td}/AsmParser/CMakeLists.txt ) set(LLVM_ENUM_ASM_PARSERS "${LLVM_ENUM_ASM_PARSERS}LLVM_ASM_PARSER(${t})\n") endif() if( EXISTS ${td}/Disassembler/CMakeLists.txt ) set(LLVM_ENUM_DISASSEMBLERS "${LLVM_ENUM_DISASSEMBLERS}LLVM_DISASSEMBLER(${t})\n") endif() endforeach(t) # Produce the target definition files, which provide a way for clients to easily # include various classes of targets. configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/AsmPrinters.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/AsmPrinters.def ) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/AsmParsers.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/AsmParsers.def ) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/Disassemblers.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/Disassemblers.def ) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/Targets.def.in ${LLVM_INCLUDE_DIR}/llvm/Config/Targets.def ) # Configure the three LLVM configuration header files. configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/config.h.cmake ${LLVM_INCLUDE_DIR}/llvm/Config/config.h) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/llvm-config.h.cmake ${LLVM_INCLUDE_DIR}/llvm/Config/llvm-config.h) configure_file( ${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/abi-breaking.h.cmake ${LLVM_INCLUDE_DIR}/llvm/Config/abi-breaking.h) # Add target for generating source rpm package. set(LLVM_SRPM_USER_BINARY_SPECFILE ${CMAKE_CURRENT_SOURCE_DIR}/llvm.spec.in CACHE FILEPATH ".spec file to use for srpm generation") set(LLVM_SRPM_BINARY_SPECFILE ${CMAKE_CURRENT_BINARY_DIR}/llvm.spec) set(LLVM_SRPM_DIR "${CMAKE_CURRENT_BINARY_DIR}/srpm") # SVN_REVISION and GIT_COMMIT get set by the call to add_version_info_from_vcs. # DUMMY_VAR contains a version string which we don't care about. add_version_info_from_vcs(DUMMY_VAR) if ( SVN_REVISION ) set(LLVM_RPM_SPEC_REVISION "r${SVN_REVISION}") elseif ( GIT_COMMIT ) set (LLVM_RPM_SPEC_REVISION "g${GIT_COMMIT}") endif() configure_file( ${LLVM_SRPM_USER_BINARY_SPECFILE} ${LLVM_SRPM_BINARY_SPECFILE} @ONLY) add_custom_target(srpm COMMAND cpack -G TGZ --config CPackSourceConfig.cmake -B ${LLVM_SRPM_DIR}/SOURCES COMMAND rpmbuild -bs --define '_topdir ${LLVM_SRPM_DIR}' ${LLVM_SRPM_BINARY_SPECFILE}) set_target_properties(srpm PROPERTIES FOLDER "Misc") # They are not referenced. See set_output_directory(). set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin ) set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) if(APPLE AND DARWIN_LTO_LIBRARY) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") endif() # Work around a broken bfd ld behavior. When linking a binary with a # foo.so library, it will try to find any library that foo.so uses and # check its symbols. 
This is wasteful (the check was done when foo.so # was created) and can fail since it is not the dynamic linker and # doesn't know how to handle search paths correctly. if (UNIX AND NOT APPLE AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "SunOS|AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-allow-shlib-undefined") endif() set(CMAKE_INCLUDE_CURRENT_DIR ON) include_directories( ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) # when crosscompiling import the executable targets from a file if(LLVM_USE_HOST_TOOLS) include(CrossCompile) endif(LLVM_USE_HOST_TOOLS) if(LLVM_TARGET_IS_CROSSCOMPILE_HOST) # Dummy use to avoid CMake Warning: Manually-specified variables were not used # (this is a variable that CrossCompile sets on recursive invocations) endif() if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") # On FreeBSD, /usr/local/* is not used by default. In order to build LLVM # with libxml2, iconv.h, etc., we must add /usr/local paths. include_directories(SYSTEM "/usr/local/include") link_directories("/usr/local/lib") endif(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS ) # special hack for Solaris to handle crazy system sys/regset.h include_directories("${LLVM_MAIN_INCLUDE_DIR}/llvm/Support/Solaris") endif( ${CMAKE_SYSTEM_NAME} MATCHES SunOS ) # Make sure we don't get -rdynamic in every binary. For those that need it, # use export_executable_symbols(target). set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "") set(LLVM_PROFDATA_FILE "" CACHE FILEPATH "Profiling data file to use when compiling in order to improve runtime performance.") if(LLVM_PROFDATA_FILE AND EXISTS ${LLVM_PROFDATA_FILE}) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" ) add_definitions("-fprofile-instr-use=${LLVM_PROFDATA_FILE}") else() message(FATAL_ERROR "LLVM_PROFDATA_FILE can only be specified when compiling with clang") endif() endif() include(AddLLVM) include(TableGen) if( MINGW AND NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" ) # People report that -O3 is unreliable on MinGW. The traditional # build also uses -O2 for that reason: llvm_replace_compiler_option(CMAKE_CXX_FLAGS_RELEASE "-O3" "-O2") endif() # Put this before tblgen. Else we have a circular dependence. add_subdirectory(lib/Demangle) add_subdirectory(lib/Support) add_subdirectory(lib/TableGen) add_subdirectory(utils/TableGen) add_subdirectory(include/llvm) add_subdirectory(lib) if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/FileCheck) add_subdirectory(utils/PerfectShuffle) add_subdirectory(utils/count) add_subdirectory(utils/not) add_subdirectory(utils/yaml-bench) else() if ( LLVM_INCLUDE_TESTS ) message(FATAL_ERROR "Including tests when not building utils will not work. 
Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLDE_TESTS to Off.") endif() endif() # Use LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION instead of LLVM_INCLUDE_UTILS because it is not really a util if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION) add_subdirectory(utils/LLVMVisualizers) endif() foreach( binding ${LLVM_BINDINGS_LIST} ) if( EXISTS "${LLVM_MAIN_SRC_DIR}/bindings/${binding}/CMakeLists.txt" ) add_subdirectory(bindings/${binding}) endif() endforeach() add_subdirectory(projects) if( LLVM_INCLUDE_TOOLS ) add_subdirectory(tools) endif() if( LLVM_INCLUDE_RUNTIMES ) add_subdirectory(runtimes) endif() if( LLVM_INCLUDE_EXAMPLES ) add_subdirectory(examples) endif() if( LLVM_INCLUDE_TESTS ) if(EXISTS ${LLVM_MAIN_SRC_DIR}/projects/test-suite AND TARGET clang) include(LLVMExternalProjectUtils) llvm_ExternalProject_Add(test-suite ${LLVM_MAIN_SRC_DIR}/projects/test-suite USE_TOOLCHAIN EXCLUDE_FROM_ALL NO_INSTALL ALWAYS_CLEAN) endif() add_subdirectory(utils/lit) add_subdirectory(test) add_subdirectory(unittests) if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/unittest) endif() if (WIN32) # This utility is used to prevent crashing tests from calling Dr. Watson on # Windows. add_subdirectory(utils/KillTheDoctor) endif() # Add a global check rule now that all subdirectories have been traversed # and we know the total set of lit testsuites. get_property(LLVM_LIT_TESTSUITES GLOBAL PROPERTY LLVM_LIT_TESTSUITES) get_property(LLVM_LIT_PARAMS GLOBAL PROPERTY LLVM_LIT_PARAMS) get_property(LLVM_LIT_DEPENDS GLOBAL PROPERTY LLVM_LIT_DEPENDS) get_property(LLVM_LIT_EXTRA_ARGS GLOBAL PROPERTY LLVM_LIT_EXTRA_ARGS) get_property(LLVM_ADDITIONAL_TEST_TARGETS GLOBAL PROPERTY LLVM_ADDITIONAL_TEST_TARGETS) get_property(LLVM_ADDITIONAL_TEST_DEPENDS GLOBAL PROPERTY LLVM_ADDITIONAL_TEST_DEPENDS) add_lit_target(check-all "Running all regression tests" ${LLVM_LIT_TESTSUITES} PARAMS ${LLVM_LIT_PARAMS} DEPENDS ${LLVM_LIT_DEPENDS} ${LLVM_ADDITIONAL_TEST_TARGETS} ARGS ${LLVM_LIT_EXTRA_ARGS} ) if(TARGET check-runtimes) add_dependencies(check-all check-runtimes) endif() add_custom_target(test-depends DEPENDS ${LLVM_LIT_DEPENDS} ${LLVM_ADDITIONAL_TEST_DEPENDS}) set_target_properties(test-depends PROPERTIES FOLDER "Tests") endif() if (LLVM_INCLUDE_DOCS) add_subdirectory(docs) endif() add_subdirectory(cmake/modules) # Do this last so that all lit targets have already been created. if (LLVM_INCLUDE_UTILS) add_subdirectory(utils/llvm-lit) endif() if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) install(DIRECTORY include/llvm include/llvm-c DESTINATION include COMPONENT llvm-headers FILES_MATCHING PATTERN "*.def" PATTERN "*.h" PATTERN "*.td" PATTERN "*.inc" PATTERN "LICENSE.TXT" PATTERN ".svn" EXCLUDE ) install(DIRECTORY ${LLVM_INCLUDE_DIR}/llvm ${LLVM_INCLUDE_DIR}/llvm-c DESTINATION include COMPONENT llvm-headers FILES_MATCHING PATTERN "*.def" PATTERN "*.h" PATTERN "*.gen" PATTERN "*.inc" # Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def" PATTERN "CMakeFiles" EXCLUDE PATTERN "config.h" EXCLUDE PATTERN ".svn" EXCLUDE ) # Installing the headers needs to depend on generating any public # tablegen'd headers. add_custom_target(llvm-headers DEPENDS intrinsics_gen) set_target_properties(llvm-headers PROPERTIES FOLDER "Misc") if (NOT CMAKE_CONFIGURATION_TYPES) add_llvm_install_targets(install-llvm-headers DEPENDS llvm-headers COMPONENT llvm-headers) endif() endif() # This must be at the end of the LLVM root CMakeLists file because it must run # after all targets are created. 
if(LLVM_DISTRIBUTION_COMPONENTS)
  if(CMAKE_CONFIGURATION_TYPES)
    message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)")
  endif()
  add_custom_target(distribution)
  add_custom_target(install-distribution)
  add_custom_target(install-distribution-stripped)
  foreach(target ${LLVM_DISTRIBUTION_COMPONENTS} ${LLVM_RUNTIME_DISTRIBUTION_COMPONENTS})
    if(TARGET ${target})
      add_dependencies(distribution ${target})
    else()
      message(SEND_ERROR "Specified distribution component '${target}' doesn't have a target")
    endif()
    if(TARGET install-${target})
      add_dependencies(install-distribution install-${target})
    else()
      message(SEND_ERROR "Specified distribution component '${target}' doesn't have an install target")
    endif()
    if(TARGET install-${target}-stripped)
      add_dependencies(install-distribution-stripped install-${target}-stripped)
    else()
      message(SEND_ERROR "Specified distribution component '${target}' doesn't have an install-stripped target."
                         " Its installation target creation should be changed to use add_llvm_install_targets,"
                         " or you should manually create the 'install-${target}-stripped' target.")
    endif()
  endforeach()
endif()

# This allows us to deploy the Universal CRT DLLs by passing -DCMAKE_INSTALL_UCRT_LIBRARIES=ON to CMake
-if (MSVC AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+if (MSVC AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows" AND CMAKE_INSTALL_UCRT_LIBRARIES)
  include(InstallRequiredSystemLibraries)
endif()
Index: vendor/llvm/dist-release_70/docs/ReleaseNotes.rst
===================================================================
--- vendor/llvm/dist-release_70/docs/ReleaseNotes.rst	(revision 337630)
+++ vendor/llvm/dist-release_70/docs/ReleaseNotes.rst	(revision 337631)
@@ -1,212 +1,242 @@
========================
LLVM 7.0.0 Release Notes
========================

.. contents::
    :local:

.. warning::
   These are in-progress notes for the upcoming LLVM 7 release.
   Release notes for previous releases can be found on
   `the Download Page `_.

Introduction
============

This document contains the release notes for the LLVM Compiler Infrastructure,
release 7.0.0.  Here we describe the status of LLVM, including major
improvements from the previous release, improvements in various subprojects of
LLVM, and some of the current users of the code.  All LLVM releases may be
downloaded from the `LLVM releases web site `_.

For more information about LLVM, including information about the latest
release, please check out the `main LLVM web site `_.  If you have questions
or comments, the `LLVM Developer's Mailing List `_ is a good place to send
them.

Note that if you are reading this file from a Subversion checkout or the main
LLVM web page, this document applies to the *next* release, not the current
one.  To see the release notes for a specific release, please see the
`releases page `_.

Non-comprehensive list of changes in this release
=================================================

.. NOTE
   For small 1-3 sentence descriptions, just add an entry at the end of
   this list. If your description won't fit comfortably in one bullet
   point (e.g. maybe you would like to give an example of the
   functionality, or simply have a lot to talk about), see the `NOTE` below
   for adding a new subsection.

+* The Windows installer no longer includes a Visual Studio integration.
+  Instead, a new
+  `LLVM Compiler Toolchain Visual Studio extension `
+  is available on the Visual Studio Marketplace. The new integration includes
+  support for Visual Studio 2017.
+
* Libraries have been renamed from 7.0 to 7. This change also impacts
  downstream libraries like lldb.

* The LoopInstSimplify pass (-loop-instsimplify) has been removed.

* Symbols starting with ``?`` are no longer mangled by LLVM when using the
  Windows ``x`` or ``w`` IR mangling schemes.

* A new tool named :doc:`llvm-exegesis ` has been added.
  :program:`llvm-exegesis` automatically measures instruction scheduling
  properties (latency/uops) and provides a principled way to edit scheduling
  models.

* A new tool named :doc:`llvm-mca ` has been added. :program:`llvm-mca` is a
  static performance analysis tool that uses information available in LLVM to
  statically predict the performance of machine code for a specific CPU.

* The optimization flag to merge constants (-fmerge-all-constants) is no
  longer applied by default.

* Optimization of floating-point casts is improved. This may cause surprising
  results for code that is relying on the undefined behavior of overflowing
  casts. The optimization can be disabled by specifying a function attribute:
  "strict-float-cast-overflow"="false". This attribute may be created by the
  clang option ``-fno-strict-float-cast-overflow``.
  Code sanitizers can be used to detect affected patterns. The option for
  detecting this problem alone is "-fsanitize=float-cast-overflow":

  .. code-block:: c

    int main() {
      float x = 4294967296.0f;
      x = (float)((int)x);
      printf("junk in the ftrunc: %f\n", x);
      return 0;
    }

  .. code-block:: bash

    clang -O1 ftrunc.c -fsanitize=float-cast-overflow ; ./a.out
    ftrunc.c:5:15: runtime error: 4.29497e+09 is outside the range of representable values of type 'int'
    junk in the ftrunc: 0.000000

* ``LLVM_ON_WIN32`` is no longer set by ``llvm/Config/config.h`` and
  ``llvm/Config/llvm-config.h``.  If you used this macro, use the compiler-set
  ``_WIN32`` instead, which is set exactly when ``LLVM_ON_WIN32`` used to be
  set.

* The ``DEBUG`` macro has been renamed to ``LLVM_DEBUG``; the interface
  remains the same.  If you used this macro you need to migrate to the new
  one.  You should also clang-format your code to make it easier to integrate
  future changes locally.  This can be done with the following bash commands:

  .. code-block:: bash

    git grep -l 'DEBUG' | xargs perl -pi -e 's/\bDEBUG\s?\(/LLVM_DEBUG(/g'
    git diff -U0 master | ../clang/tools/clang-format/clang-format-diff.py -i -p1 -style LLVM

* Early support for UBsan, X-Ray instrumentation and libFuzzer (x86 and
  x86_64) for OpenBSD.  Support for MSan (x86_64), X-Ray instrumentation and
  libFuzzer (x86 and x86_64) for FreeBSD.

* ``SmallVector`` shrank from ``sizeof(void*) * 4 + sizeof(T)`` to
  ``sizeof(void*) + sizeof(unsigned) * 2``, smaller than ``std::vector`` on
  64-bit platforms.  The maximum capacity is now restricted to ``UINT32_MAX``.
  Since SmallVector doesn't have the exception-safety pessimizations some
  implementations saddle std::vector with and is better at using ``realloc``,
  it's now a better choice even on the heap (although when TinyPtrVector
  works, it's even smaller).

* Preliminary/experimental support for DWARF v5 debugging information,
  including the new .debug_names accelerator table.  DWARF emitted at ``-O0``
  should be fully DWARF v5 compliant.  Type units and split DWARF are known
  not to be compliant, and higher optimization levels will still emit some
  information in v4 format.

* Note..

.. NOTE
   If you would like to document a larger change, then you can add a
   subsection about it right here. You can copy the following boilerplate
   and un-indent it (the indentation causes it to be inside this comment).

   Special New Feature
   -------------------

   Makes programs 10x faster by doing Special New Thing.

Changes to the LLVM IR
----------------------

* The signatures for the builtins @llvm.memcpy, @llvm.memmove, and
  @llvm.memset have changed. Alignment is no longer an argument, and is
  instead conveyed as parameter attributes.

* invariant.group.barrier has been renamed to launder.invariant.group.

* invariant.group metadata can now refer only to empty metadata nodes.

Changes to the ARM Backend
--------------------------

During this release ...

Changes to the MIPS Target
--------------------------

During this release ...

Changes to the PowerPC Target
-----------------------------

During this release ...

+Changes to the SystemZ Target
+-----------------------------
+
+During this release the SystemZ target has:
+
+* Added support for vector registers in inline asm statements.
+
+* Added support for stackmaps, patchpoints, and the anyregcc
+  calling convention.
+
+* Changed the default function alignment to 16 bytes.
+
+* Improved codegen for condition code handling.
+
+* Improved instruction scheduling and microarchitecture tuning for z13/z14.
+
+* Fixed support for generating GCOV coverage data.
+
+* Fixed some codegen bugs.
+
Changes to the X86 Target
-------------------------

During this release ...

Changes to the AMDGPU Target
----------------------------

During this release ...

Changes to the AVR Target
-------------------------

During this release ...

Changes to the OCaml bindings
-----------------------------

* Remove ``add_bb_vectorize``.

Changes to the C API
--------------------

* Remove ``LLVMAddBBVectorizePass``. The implementation was removed and the C
  interface was made a deprecated no-op in LLVM 5.  Use
  ``LLVMAddSLPVectorizePass`` instead to get the supported SLP vectorizer.

Changes to the DAG infrastructure
---------------------------------

* ADDC/ADDE/SUBC/SUBE are now deprecated and will default to expand.  Backends
  that wish to continue to use these opcodes should explicitly request so
  using ``setOperationAction`` in their ``TargetLowering``.  New backends
  should use UADDO/ADDCARRY/USUBO/SUBCARRY instead of the deprecated opcodes.

* The SETCCE opcode has now been removed in favor of SETCCCARRY.
+
+* TableGen now supports multi-alternative pattern fragments via the PatFrags
+  class.  PatFrag is now derived from PatFrags, which may require minor
+  changes to backends that directly access PatFrag members.

External Open Source Projects Using LLVM 7
==========================================

* A project...

Additional Information
======================

A wide variety of additional information is available on the `LLVM web page
`_, in particular in the `documentation `_ section.  The web page also
contains versions of the API documentation which is up-to-date with the
Subversion version of the source code.  You can access versions of these
documents specific to this release by going into the ``llvm/docs/`` directory
in the LLVM tree.

If you have any questions or comments about LLVM, please feel free to contact
us via the `mailing lists `_.
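The ``DEBUG`` → ``LLVM_DEBUG`` rename described in the release notes above only
shows the textual migration commands; the minimal sketch below illustrates what
an updated call site looks like after running them.  It is not part of this
patch, and the pass name "my-pass" and the helper function are hypothetical.

.. code-block:: c++

    // Sketch of a debug-logging call site after the LLVM 7 rename of
    // DEBUG(...) to LLVM_DEBUG(...).  DEBUG_TYPE is still required, since
    // LLVM_DEBUG keys its output off it exactly as DEBUG did before.
    #include "llvm/IR/Function.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    #define DEBUG_TYPE "my-pass"  // hypothetical pass name

    static void logVisit(const llvm::Function &F) {
      // Before LLVM 7 this line would have been written with DEBUG(...).
      LLVM_DEBUG(llvm::dbgs() << "visiting " << F.getName() << "\n");
    }

As before the rename, the output is only produced in builds with assertions
enabled and remains gated by ``-debug`` / ``-debug-only=my-pass``, so existing
debugging workflows are unchanged apart from the macro name.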
Index: vendor/llvm/dist-release_70/lib/Analysis/InstructionSimplify.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Analysis/InstructionSimplify.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/Analysis/InstructionSimplify.cpp (revision 337631) @@ -1,5147 +1,5181 @@ //===- InstructionSimplify.cpp - Fold instruction operands ----------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements routines for folding instructions into simpler forms // that do not require creating new instructions. This does constant folding // ("add i32 1, 1" -> "2") but can also handle non-constant operands, either // returning a constant ("and i32 %x, 0" -> "0") or an already existing value // ("and i32 %x, %x" -> "%x"). All operands are assumed to have already been // simplified: This is usually true and assuming it simplifies the logic (if // they have not been simplified then results are correct but maybe suboptimal). // //===----------------------------------------------------------------------===// #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/KnownBits.h" #include using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "instsimplify" enum { RecursionLimit = 3 }; STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumReassoc, "Number of reassociations"); static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse); static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyCastInst(unsigned, Value *, Type *, const SimplifyQuery &, unsigned); static Value *SimplifyGEPInst(Type *, ArrayRef, const SimplifyQuery &, unsigned); static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal, Value *FalseVal) { BinaryOperator::BinaryOps BinOpCode; if (auto *BO = dyn_cast(Cond)) BinOpCode = BO->getOpcode(); else return nullptr; CmpInst::Predicate ExpectedPred, Pred1, Pred2; if (BinOpCode == BinaryOperator::Or) { ExpectedPred = ICmpInst::ICMP_NE; } else if (BinOpCode == BinaryOperator::And) { ExpectedPred = 
ICmpInst::ICMP_EQ; } else return nullptr; // %A = icmp eq %TV, %FV // %B = icmp eq %X, %Y (and one of these is a select operand) // %C = and %A, %B // %D = select %C, %TV, %FV // --> // %FV // %A = icmp ne %TV, %FV // %B = icmp ne %X, %Y (and one of these is a select operand) // %C = or %A, %B // %D = select %C, %TV, %FV // --> // %TV Value *X, *Y; if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal), m_Specific(FalseVal)), m_ICmp(Pred2, m_Value(X), m_Value(Y)))) || Pred1 != Pred2 || Pred1 != ExpectedPred) return nullptr; if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal) return BinOpCode == BinaryOperator::Or ? TrueVal : FalseVal; return nullptr; } /// For a boolean type or a vector of boolean type, return false or a vector /// with every element false. static Constant *getFalse(Type *Ty) { return ConstantInt::getFalse(Ty); } /// For a boolean type or a vector of boolean type, return true or a vector /// with every element true. static Constant *getTrue(Type *Ty) { return ConstantInt::getTrue(Ty); } /// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"? static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { CmpInst *Cmp = dyn_cast(V); if (!Cmp) return false; CmpInst::Predicate CPred = Cmp->getPredicate(); Value *CLHS = Cmp->getOperand(0), *CRHS = Cmp->getOperand(1); if (CPred == Pred && CLHS == LHS && CRHS == RHS) return true; return CPred == CmpInst::getSwappedPredicate(Pred) && CLHS == RHS && CRHS == LHS; } /// Does the given value dominate the specified phi node? static bool valueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { Instruction *I = dyn_cast(V); if (!I) // Arguments and constants dominate all instructions. return true; // If we are processing instructions (and/or basic blocks) that have not been // fully added to a function, the parent nodes may still be null. Simply // return the conservative answer in these cases. if (!I->getParent() || !P->getParent() || !I->getFunction()) return false; // If we have a DominatorTree then do a precise test. if (DT) return DT->dominates(I, P); // Otherwise, if the instruction is in the entry block and is not an invoke, // then it obviously dominates all phi nodes. if (I->getParent() == &I->getFunction()->getEntryBlock() && !isa(I)) return true; return false; } /// Simplify "A op (B op' C)" by distributing op over op', turning it into /// "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is /// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS. /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)". /// Returns the simplified value, or null if no simplification was performed. static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, Instruction::BinaryOps OpcodeToExpand, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Check whether the expression has the form "(A op' B) op C". if (BinaryOperator *Op0 = dyn_cast(LHS)) if (Op0->getOpcode() == OpcodeToExpand) { // It does! Try turning it into "(A op C) op' (B op C)". Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; // Do "A op C" and "B op C" both simplify? if (Value *L = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) if (Value *R = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. 
// If "L op' R" equals "A op' B" then "L op' R" is just the LHS. if ((L == A && R == B) || (Instruction::isCommutative(OpcodeToExpand) && L == B && R == A)) { ++NumExpand; return LHS; } // Otherwise return "L op' R" if it simplifies. if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } } } // Check whether the expression has the form "A op (B op' C)". if (BinaryOperator *Op1 = dyn_cast(RHS)) if (Op1->getOpcode() == OpcodeToExpand) { // It does! Try turning it into "(A op B) op' (A op C)". Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); // Do "A op B" and "A op C" both simplify? if (Value *L = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) if (Value *R = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. // If "L op' R" equals "B op' C" then "L op' R" is just the RHS. if ((L == B && R == C) || (Instruction::isCommutative(OpcodeToExpand) && L == C && R == B)) { ++NumExpand; return RHS; } // Otherwise return "L op' R" if it simplifies. if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } } } return nullptr; } /// Generic simplifications for associative binary operations. /// Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { assert(Instruction::isAssociative(Opcode) && "Not an associative operation!"); // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); // Transform: "(A op B) op C" ==> "A op (B op C)" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { Value *A = Op0->getOperand(0); Value *B = Op0->getOperand(1); Value *C = RHS; // Does "B op C" simplify? if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // It does! Return "A op V" if it simplifies or is already available. // If V equals B then "A op V" is just the LHS. if (V == B) return LHS; // Otherwise return "A op V" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // Transform: "A op (B op C)" ==> "(A op B) op C" if it simplifies completely. if (Op1 && Op1->getOpcode() == Opcode) { Value *A = LHS; Value *B = Op1->getOperand(0); Value *C = Op1->getOperand(1); // Does "A op B" simplify? if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) { // It does! Return "V op C" if it simplifies or is already available. // If V equals B then "V op C" is just the RHS. if (V == B) return RHS; // Otherwise return "V op C" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // The remaining transforms require commutativity as well as associativity. if (!Instruction::isCommutative(Opcode)) return nullptr; // Transform: "(A op B) op C" ==> "(C op A) op B" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { Value *A = Op0->getOperand(0); Value *B = Op0->getOperand(1); Value *C = RHS; // Does "C op A" simplify? if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "V op B" if it simplifies or is already available. // If V equals A then "V op B" is just the LHS. if (V == A) return LHS; // Otherwise return "V op B" if it simplifies. 
if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // Transform: "A op (B op C)" ==> "B op (C op A)" if it simplifies completely. if (Op1 && Op1->getOpcode() == Opcode) { Value *A = LHS; Value *B = Op1->getOperand(0); Value *C = Op1->getOperand(1); // Does "C op A" simplify? if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "B op V" if it simplifies or is already available. // If V equals C then "B op V" is just the RHS. if (V == C) return RHS; // Otherwise return "B op V" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) { ++NumReassoc; return W; } } } return nullptr; } /// In the case of a binary operation with a select instruction as an operand, /// try to simplify the binop by seeing whether evaluating it on both branches /// of the select results in the same value. Returns the common value if so, /// otherwise returns null. static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; SelectInst *SI; if (isa(LHS)) { SI = cast(LHS); } else { assert(isa(RHS) && "No select instruction operand!"); SI = cast(RHS); } // Evaluate the BinOp on the true and false branches of the select. Value *TV; Value *FV; if (SI == LHS) { TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse); FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse); } else { TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse); FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse); } // If they simplified to the same value, then return the common value. // If they both failed to simplify then return null. if (TV == FV) return TV; // If one branch simplified to undef, return the other one. if (TV && isa(TV)) return FV; if (FV && isa(FV)) return TV; // If applying the operation did not change the true and false select values, // then the result of the binop is the select itself. if (TV == SI->getTrueValue() && FV == SI->getFalseValue()) return SI; // If one branch simplified and the other did not, and the simplified // value is equal to the unsimplified one, return the simplified value. // For example, select (cond, X, X & Z) & Z -> X & Z. if ((FV && !TV) || (TV && !FV)) { // Check that the simplified value has the form "X op Y" where "op" is the // same as the original operation. Instruction *Simplified = dyn_cast(FV ? FV : TV); if (Simplified && Simplified->getOpcode() == unsigned(Opcode)) { // The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS". // We already know that "op" is the same as for the simplified value. See // if the operands match too. If so, return the simplified value. Value *UnsimplifiedBranch = FV ? SI->getTrueValue() : SI->getFalseValue(); Value *UnsimplifiedLHS = SI == LHS ? UnsimplifiedBranch : LHS; Value *UnsimplifiedRHS = SI == LHS ? RHS : UnsimplifiedBranch; if (Simplified->getOperand(0) == UnsimplifiedLHS && Simplified->getOperand(1) == UnsimplifiedRHS) return Simplified; if (Simplified->isCommutative() && Simplified->getOperand(1) == UnsimplifiedLHS && Simplified->getOperand(0) == UnsimplifiedRHS) return Simplified; } } return nullptr; } /// In the case of a comparison with a select instruction, try to simplify the /// comparison by seeing whether both branches of the select result in the same /// value. 
Returns the common value if so, otherwise returns null. static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Make sure the select is on the LHS. if (!isa(LHS)) { std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } assert(isa(LHS) && "Not comparing with a select instruction!"); SelectInst *SI = cast(LHS); Value *Cond = SI->getCondition(); Value *TV = SI->getTrueValue(); Value *FV = SI->getFalseValue(); // Now that we have "cmp select(Cond, TV, FV), RHS", analyse it. // Does "cmp TV, RHS" simplify? Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, Q, MaxRecurse); if (TCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'true'. TCmp = getTrue(Cond->getType()); } else if (!TCmp) { // It didn't simplify. However if "cmp TV, RHS" is equal to the select // condition then we can replace it with 'true'. Otherwise give up. if (!isSameCompare(Cond, Pred, TV, RHS)) return nullptr; TCmp = getTrue(Cond->getType()); } // Does "cmp FV, RHS" simplify? Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, Q, MaxRecurse); if (FCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'false'. FCmp = getFalse(Cond->getType()); } else if (!FCmp) { // It didn't simplify. However if "cmp FV, RHS" is equal to the select // condition then we can replace it with 'false'. Otherwise give up. if (!isSameCompare(Cond, Pred, FV, RHS)) return nullptr; FCmp = getFalse(Cond->getType()); } // If both sides simplified to the same value, then use it as the result of // the original comparison. if (TCmp == FCmp) return TCmp; // The remaining cases only make sense if the select condition has the same // type as the result of the comparison, so bail out if this is not so. if (Cond->getType()->isVectorTy() != RHS->getType()->isVectorTy()) return nullptr; // If the false value simplified to false, then the result of the compare // is equal to "Cond && TCmp". This also catches the case when the false // value simplified to false and the true value to true, returning "Cond". if (match(FCmp, m_Zero())) if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse)) return V; // If the true value simplified to true, then the result of the compare // is equal to "Cond || FCmp". if (match(TCmp, m_One())) if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse)) return V; // Finally, if the false value simplified to true and the true value to // false, then the result of the compare is equal to "!Cond". if (match(FCmp, m_One()) && match(TCmp, m_Zero())) if (Value *V = SimplifyXorInst(Cond, Constant::getAllOnesValue(Cond->getType()), Q, MaxRecurse)) return V; return nullptr; } /// In the case of a binary operation with an operand that is a PHI instruction, /// try to simplify the binop by seeing whether evaluating it on the incoming /// phi values yields the same result for every value. If so returns the common /// value, otherwise returns null. static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; PHINode *PI; if (isa(LHS)) { PI = cast(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. 
    if (!valueDominatesPHI(RHS, PI, Q.DT))
      return nullptr;
  } else {
    assert(isa<PHINode>(RHS) && "No PHI instruction operand!");
    PI = cast<PHINode>(RHS);
    // Bail out if LHS and the phi may be mutually interdependent due to a loop.
    if (!valueDominatesPHI(LHS, PI, Q.DT))
      return nullptr;
  }

  // Evaluate the BinOp on the incoming phi values.
  Value *CommonValue = nullptr;
  for (Value *Incoming : PI->incoming_values()) {
    // If the incoming value is the phi node itself, it can safely be skipped.
    if (Incoming == PI) continue;
    Value *V = PI == LHS ?
      SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) :
      SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse);
    // If the operation failed to simplify, or simplified to a different value
    // than previously, then give up.
    if (!V || (CommonValue && V != CommonValue))
      return nullptr;
    CommonValue = V;
  }

  return CommonValue;
}

/// In the case of a comparison with a PHI instruction, try to simplify the
/// comparison by seeing whether comparing with all of the incoming phi values
/// yields the same result every time. If so returns the common result,
/// otherwise returns null.
static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
                               const SimplifyQuery &Q, unsigned MaxRecurse) {
  // Recursion is always used, so bail out at once if we already hit the limit.
  if (!MaxRecurse--)
    return nullptr;

  // Make sure the phi is on the LHS.
  if (!isa<PHINode>(LHS)) {
    std::swap(LHS, RHS);
    Pred = CmpInst::getSwappedPredicate(Pred);
  }
  assert(isa<PHINode>(LHS) && "Not comparing with a phi instruction!");
  PHINode *PI = cast<PHINode>(LHS);

  // Bail out if RHS and the phi may be mutually interdependent due to a loop.
  if (!valueDominatesPHI(RHS, PI, Q.DT))
    return nullptr;

  // Evaluate the comparison on the incoming phi values.
  Value *CommonValue = nullptr;
  for (Value *Incoming : PI->incoming_values()) {
    // If the incoming value is the phi node itself, it can safely be skipped.
    if (Incoming == PI) continue;
    Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q, MaxRecurse);
    // If the operation failed to simplify, or simplified to a different value
    // than previously, then give up.
    if (!V || (CommonValue && V != CommonValue))
      return nullptr;
    CommonValue = V;
  }

  return CommonValue;
}

static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode,
                                       Value *&Op0, Value *&Op1,
                                       const SimplifyQuery &Q) {
  if (auto *CLHS = dyn_cast<Constant>(Op0)) {
    if (auto *CRHS = dyn_cast<Constant>(Op1))
      return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);

    // Canonicalize the constant to the RHS if this is a commutative operation.
    if (Instruction::isCommutative(Opcode))
      std::swap(Op0, Op1);
  }
  return nullptr;
}

/// Given operands for an Add, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
                              const SimplifyQuery &Q, unsigned MaxRecurse) {
  if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
    return C;

  // X + undef -> undef
  if (match(Op1, m_Undef()))
    return Op1;

  // X + 0 -> X
  if (match(Op1, m_Zero()))
    return Op0;

  // If the two operands are negated, return 0.
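  // Editor's note (illustrative example, not in the original source): this is
  // the fold that turns "add (sub 0, %x), %x" into 0, because the operands are
  // known to be negations of each other.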
if (isKnownNegation(Op0, Op1)) return Constant::getNullValue(Op0->getType()); // X + (Y - X) -> Y // (Y - X) + X -> Y // Eg: X + -X -> 0 Value *Y = nullptr; if (match(Op1, m_Sub(m_Value(Y), m_Specific(Op0))) || match(Op0, m_Sub(m_Value(Y), m_Specific(Op1)))) return Y; // X + ~X -> -1 since ~X = -X-1 Type *Ty = Op0->getType(); if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Ty); // add nsw/nuw (xor Y, signmask), signmask --> Y // The no-wrapping add guarantees that the top bit will be set by the add. // Therefore, the xor must be clearing the already set sign bit of Y. if ((IsNSW || IsNUW) && match(Op1, m_SignMask()) && match(Op0, m_Xor(m_Value(Y), m_SignMask()))) return Y; // add nuw %x, -1 -> -1, because %x can only be 0. if (IsNUW && match(Op1, m_AllOnes())) return Op1; // Which is -1. /// i1 add -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q, MaxRecurse)) return V; // Threading Add over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A + select(cond, B, C)" means evaluating // "A+B" and "A+C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A+B" and "A+C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Query) { return ::SimplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit); } /// Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and /// accumulates the total constant offset applied in the returned constant. It /// returns 0 if V is not a pointer, and returns the constant '0' if there are /// no constant offsets applied. /// /// This is very similar to GetPointerBaseWithConstantOffset except it doesn't /// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc. /// folding. static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, bool AllowNonInbounds = false) { assert(V->getType()->isPtrOrPtrVectorTy()); Type *IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType(); APInt Offset = APInt::getNullValue(IntPtrTy->getIntegerBitWidth()); // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. 
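  // Editor's note (illustrative example, not in the original source): for
  // "%q = getelementptr inbounds i32, i32* %p, i64 3", the loop below strips
  // the GEP, leaves V pointing at %p, and accumulates a constant offset of 12
  // bytes (assuming a 32-bit i32 in the target DataLayout).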
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if ((!AllowNonInbounds && !GEP->isInBounds()) ||
          !GEP->accumulateConstantOffset(DL, Offset))
        break;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      if (auto CS = CallSite(V))
        if (Value *RV = CS.getReturnedArgOperand()) {
          V = RV;
          continue;
        }
      break;
    }
    assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset);
  if (V->getType()->isVectorTy())
    return ConstantVector::getSplat(V->getType()->getVectorNumElements(),
                                    OffsetIntPtr);
  return OffsetIntPtr;
}

/// Compute the constant difference between two pointer values.
/// If the difference is not a constant, returns null.
static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
                                          Value *RHS) {
  Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS);
  Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS);

  // If LHS and RHS are not related via constant offsets to the same base
  // value, there is nothing we can do here.
  if (LHS != RHS)
    return nullptr;

  // Otherwise, the difference of LHS - RHS can be computed as:
  //    LHS - RHS
  //  = (LHSOffset + Base) - (RHSOffset + Base)
  //  = LHSOffset - RHSOffset
  return ConstantExpr::getSub(LHSOffset, RHSOffset);
}

/// Given operands for a Sub, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const SimplifyQuery &Q, unsigned MaxRecurse) {
  if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q))
    return C;

  // X - undef -> undef
  // undef - X -> undef
  if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
    return UndefValue::get(Op0->getType());

  // X - 0 -> X
  if (match(Op1, m_Zero()))
    return Op0;

  // X - X -> 0
  if (Op0 == Op1)
    return Constant::getNullValue(Op0->getType());

  // Is this a negation?
  if (match(Op0, m_Zero())) {
    // 0 - X -> 0 if the sub is NUW.
    if (isNUW)
      return Constant::getNullValue(Op0->getType());

    KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
    if (Known.Zero.isMaxSignedValue()) {
      // Op1 is either 0 or the minimum signed value. If the sub is NSW, then
      // Op1 must be 0 because negating the minimum signed value is undefined.
      if (isNSW)
        return Constant::getNullValue(Op0->getType());

      // 0 - X -> X if X is 0 or the minimum signed value.
      return Op1;
    }
  }

  // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
  // For example, (X + Y) - Y -> X; (Y + X) - Y -> X
  Value *X = nullptr, *Y = nullptr, *Z = Op1;
  if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z
    // See if "V === Y - Z" simplifies.
    if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1))
      // It does! Now see if "X + V" simplifies.
      if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) {
        // It does, we successfully reassociated!
        ++NumReassoc;
        return W;
      }
    // See if "V === X - Z" simplifies.
    if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1))
      // It does! Now see if "Y + V" simplifies.
      if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) {
        // It does, we successfully reassociated!
        ++NumReassoc;
        return W;
      }
  }

  // X - (Y + Z) -> (X - Y) - Z or (X - Z) - Y if everything simplifies.
// For example, X - (X + 1) -> -1 X = Op0; if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z) // See if "V === X - Y" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "V - Z" simplifies. if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "V - Y" simplifies. if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } } // Z - (X - Y) -> (Z - X) + Y if everything simplifies. // For example, X - (X - Y) -> Y. Z = Op0; if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y) // See if "V === Z - X" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1)) // It does! Now see if "V + Y" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // trunc(X) - trunc(Y) -> trunc(X - Y) if everything simplifies. if (MaxRecurse && match(Op0, m_Trunc(m_Value(X))) && match(Op1, m_Trunc(m_Value(Y)))) if (X->getType() == Y->getType()) // See if "V === X - Y" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "trunc V" simplifies. if (Value *W = SimplifyCastInst(Instruction::Trunc, V, Op0->getType(), Q, MaxRecurse - 1)) // It does, return the simplified "trunc V". return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) if (Constant *Result = computePointerDifference(Q.DL, X, Y)) return ConstantExpr::getIntegerCast(Result, Op0->getType(), true); // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Threading Sub over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A - select(cond, B, C)" means evaluating // "A-B" and "A-C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A-B" and "A-C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for a Mul, see if we can fold the result. /// If not, this returns null. static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q)) return C; // X * undef -> 0 // X * 0 -> 0 if (match(Op1, m_CombineOr(m_Undef(), m_Zero()))) return Constant::getNullValue(Op0->getType()); // X * 1 -> X if (match(Op1, m_One())) return Op0; // (X / Y) * Y -> X if the division is exact. Value *X = nullptr; if (match(Op0, m_Exact(m_IDiv(m_Value(X), m_Specific(Op1)))) || // (X / Y) * Y match(Op1, m_Exact(m_IDiv(m_Value(X), m_Specific(Op0))))) // Y * (X / Y) return X; // i1 mul -> and. 
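  // Editor's note (illustrative example, not in the original source): i1 has
  // only the values 0 and 1, so multiplication and logical AND agree on every
  // input; e.g. "mul i1 %a, %b" simplifies exactly as "and i1 %a, %b" does.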
if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // Mul distributes over Add. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit); } /// Check for common or similar folds of integer division or integer remainder. /// This applies to all 4 opcodes (sdiv/udiv/srem/urem). static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) { Type *Ty = Op0->getType(); // X / undef -> undef // X % undef -> undef if (match(Op1, m_Undef())) return Op1; // X / 0 -> undef // X % 0 -> undef // We don't need to preserve faults! if (match(Op1, m_Zero())) return UndefValue::get(Ty); // If any element of a constant divisor vector is zero or undef, the whole op // is undef. auto *Op1C = dyn_cast(Op1); if (Op1C && Ty->isVectorTy()) { unsigned NumElts = Ty->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = Op1C->getAggregateElement(i); if (Elt && (Elt->isNullValue() || isa(Elt))) return UndefValue::get(Ty); } } // undef / X -> 0 // undef % X -> 0 if (match(Op0, m_Undef())) return Constant::getNullValue(Ty); // 0 / X -> 0 // 0 % X -> 0 if (match(Op0, m_Zero())) return Constant::getNullValue(Op0->getType()); // X / X -> 1 // X % X -> 0 if (Op0 == Op1) return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty); // X / 1 -> X // X % 1 -> 0 // If this is a boolean op (single-bit element type), we can't have // division-by-zero or remainder-by-zero, so assume the divisor is 1. // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1. Value *X; if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1) || (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))) return IsDiv ? Op0 : Constant::getNullValue(Ty); return nullptr; } /// Given a predicate and two operands, return true if the comparison is true. /// This is a helper for div/rem simplification where we return some other value /// when we can prove a relationship between the operands. static bool isICmpTrue(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Value *V = SimplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse); Constant *C = dyn_cast_or_null(V); return (C && C->isAllOnesValue()); } /// Return true if we can simplify X / Y to 0. Remainder can adapt that answer /// to simplify X % Y to X. static bool isDivZero(Value *X, Value *Y, const SimplifyQuery &Q, unsigned MaxRecurse, bool IsSigned) { // Recursion is always used, so bail out at once if we already hit the limit. 
if (!MaxRecurse--) return false; if (IsSigned) { // |X| / |Y| --> 0 // // We require that 1 operand is a simple constant. That could be extended to // 2 variables if we computed the sign bit for each. // // Make sure that a constant is not the minimum signed value because taking // the abs() of that is undefined. Type *Ty = X->getType(); const APInt *C; if (match(X, m_APInt(C)) && !C->isMinSignedValue()) { // Is the variable divisor magnitude always greater than the constant // dividend magnitude? // |Y| > |C| --> Y < -abs(C) or Y > abs(C) Constant *PosDividendC = ConstantInt::get(Ty, C->abs()); Constant *NegDividendC = ConstantInt::get(Ty, -C->abs()); if (isICmpTrue(CmpInst::ICMP_SLT, Y, NegDividendC, Q, MaxRecurse) || isICmpTrue(CmpInst::ICMP_SGT, Y, PosDividendC, Q, MaxRecurse)) return true; } if (match(Y, m_APInt(C))) { // Special-case: we can't take the abs() of a minimum signed value. If // that's the divisor, then all we have to do is prove that the dividend // is also not the minimum signed value. if (C->isMinSignedValue()) return isICmpTrue(CmpInst::ICMP_NE, X, Y, Q, MaxRecurse); // Is the variable dividend magnitude always less than the constant // divisor magnitude? // |X| < |C| --> X > -abs(C) and X < abs(C) Constant *PosDivisorC = ConstantInt::get(Ty, C->abs()); Constant *NegDivisorC = ConstantInt::get(Ty, -C->abs()); if (isICmpTrue(CmpInst::ICMP_SGT, X, NegDivisorC, Q, MaxRecurse) && isICmpTrue(CmpInst::ICMP_SLT, X, PosDivisorC, Q, MaxRecurse)) return true; } return false; } // IsSigned == false. // Is the dividend unsigned less than the divisor? return isICmpTrue(ICmpInst::ICMP_ULT, X, Y, Q, MaxRecurse); } /// These are simplifications common to SDiv and UDiv. static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; if (Value *V = simplifyDivRem(Op0, Op1, true)) return V; bool IsSigned = Opcode == Instruction::SDiv; // (X * Y) / Y -> X if the multiplication does not overflow. Value *X; if (match(Op0, m_c_Mul(m_Value(X), m_Specific(Op1)))) { auto *Mul = cast(Op0); // If the Mul does not overflow, then we are good to go. if ((IsSigned && Mul->hasNoSignedWrap()) || (!IsSigned && Mul->hasNoUnsignedWrap())) return X; // If X has the form X = A / Y, then X * Y cannot overflow. if ((IsSigned && match(X, m_SDiv(m_Value(), m_Specific(Op1)))) || (!IsSigned && match(X, m_UDiv(m_Value(), m_Specific(Op1))))) return X; } // (X rem Y) / Y -> 0 if ((IsSigned && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) || (!IsSigned && match(Op0, m_URem(m_Value(), m_Specific(Op1))))) return Constant::getNullValue(Op0->getType()); // (X /u C1) /u C2 -> 0 if C1 * C2 overflow ConstantInt *C1, *C2; if (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_ConstantInt(C1))) && match(Op1, m_ConstantInt(C2))) { bool Overflow; (void)C1->getValue().umul_ov(C2->getValue(), Overflow); if (Overflow) return Constant::getNullValue(Op0->getType()); } // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. 
if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; if (isDivZero(Op0, Op1, Q, MaxRecurse, IsSigned)) return Constant::getNullValue(Op0->getType()); return nullptr; } /// These are simplifications common to SRem and URem. static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; if (Value *V = simplifyDivRem(Op0, Op1, false)) return V; // (X % Y) % Y -> X % Y if ((Opcode == Instruction::SRem && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) || (Opcode == Instruction::URem && match(Op0, m_URem(m_Value(), m_Specific(Op1))))) return Op0; // (X << Y) % X -> 0 if ((Opcode == Instruction::SRem && match(Op0, m_NSWShl(m_Specific(Op1), m_Value()))) || (Opcode == Instruction::URem && match(Op0, m_NUWShl(m_Specific(Op1), m_Value())))) return Constant::getNullValue(Op0->getType()); // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If X / Y == 0, then X % Y == X. if (isDivZero(Op0, Op1, Q, MaxRecurse, Opcode == Instruction::SRem)) return Op0; return nullptr; } /// Given operands for an SDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If two operands are negated and no signed overflow, return -1. if (isKnownNegation(Op0, Op1, /*NeedNSW=*/true)) return Constant::getAllOnesValue(Op0->getType()); return simplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a UDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for an SRem, see if we can fold the result. /// If not, this returns null. static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If the divisor is 0, the result is undefined, so assume the divisor is -1. // srem Op0, (sext i1 X) --> srem Op0, -1 --> 0 Value *X; if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return ConstantInt::getNullValue(Op0->getType()); // If the two operands are negated, return 0. if (isKnownNegation(Op0, Op1)) return ConstantInt::getNullValue(Op0->getType()); return simplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a URem, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
                               unsigned MaxRecurse) {
  return simplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse);
}

Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
  return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit);
}

/// Returns true if a shift by \c Amount always yields undef.
static bool isUndefShift(Value *Amount) {
  Constant *C = dyn_cast<Constant>(Amount);
  if (!C)
    return false;

  // X shift by undef -> undef because it may shift by the bitwidth.
  if (isa<UndefValue>(C))
    return true;

  // Shifting by the bitwidth or more is undefined.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
    if (CI->getValue().getLimitedValue() >=
        CI->getType()->getScalarSizeInBits())
      return true;

  // If all lanes of a vector shift are undefined the whole shift is.
  if (isa<ConstantVector>(C) || isa<ConstantDataVector>(C)) {
    for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I)
      if (!isUndefShift(C->getAggregateElement(I)))
        return false;
    return true;
  }

  return false;
}

/// Given operands for an Shl, LShr or AShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
                            Value *Op1, const SimplifyQuery &Q,
                            unsigned MaxRecurse) {
  if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
    return C;

  // 0 shift by X -> 0
  if (match(Op0, m_Zero()))
    return Constant::getNullValue(Op0->getType());

  // X shift by 0 -> X
  // Shift-by-sign-extended bool must be shift-by-0 because shift-by-all-ones
  // would be poison.
  Value *X;
  if (match(Op1, m_Zero()) ||
      (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
    return Op0;

  // Fold undefined shifts.
  if (isUndefShift(Op1))
    return UndefValue::get(Op0->getType());

  // If the operation is with the result of a select instruction, check whether
  // operating on either branch of the select always yields the same value.
  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
    if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
      return V;

  // If the operation is with the result of a phi instruction, check whether
  // operating on all incoming values of the phi always yields the same value.
  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
    if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
      return V;

  // If any bits in the shift amount make that value greater than or equal to
  // the number of bits in the type, the shift is undefined.
  KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
  if (Known.One.getLimitedValue() >= Known.getBitWidth())
    return UndefValue::get(Op0->getType());

  // If all valid bits in the shift amount are known zero, the first operand is
  // unchanged.
  unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth());
  if (Known.countMinTrailingZeros() >= NumValidShiftBits)
    return Op0;

  return nullptr;
}

/// Given operands for an LShr or AShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0,
                                 Value *Op1, bool isExact,
                                 const SimplifyQuery &Q, unsigned MaxRecurse) {
  if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse))
    return V;

  // X >> X -> 0
  if (Op0 == Op1)
    return Constant::getNullValue(Op0->getType());

  // undef >> X -> 0
  // undef >> X -> undef (if it's exact)
  if (match(Op0, m_Undef()))
    return isExact ? Op0 : Constant::getNullValue(Op0->getType());

  // The low bit cannot be shifted out of an exact shift if it is set.
if (isExact) { KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (Op0Known.One[0]) return Op0; } return nullptr; } /// Given operands for an Shl, see if we can fold the result. /// If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) return V; // undef << X -> 0 // undef << X -> undef if (if it's NSW/NUW) if (match(Op0, m_Undef())) return isNSW || isNUW ? Op0 : Constant::getNullValue(Op0->getType()); // (X >> A) << A -> X Value *X; if (match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1))))) return X; // shl nuw i8 C, %x -> C iff C has sign bit set. if (isNUW && match(Op0, m_Negative())) return Op0; // NOTE: could use computeKnownBits() / LazyValueInfo, // but the cost-benefit analysis suggests it isn't worth it. return nullptr; } Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for an LShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; // (X << A) >> A -> X Value *X; if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1)))) return X; // ((X << A) | Y) >> A -> X if effective width of Y is not larger than A. // We can return X as we do in the above case since OR alters no bits in X. // SimplifyDemandedBits in InstCombine can do more general optimization for // bit manipulation. This pattern aims to provide opportunities for other // optimizers by supporting a simple but common case in InstSimplify. Value *Y; const APInt *ShRAmt, *ShLAmt; if (match(Op1, m_APInt(ShRAmt)) && match(Op0, m_c_Or(m_NUWShl(m_Value(X), m_APInt(ShLAmt)), m_Value(Y))) && *ShRAmt == *ShLAmt) { const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned Width = Op0->getType()->getScalarSizeInBits(); const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros(); if (EffWidthY <= ShRAmt->getZExtValue()) return X; } return nullptr; } Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Given operands for an AShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; // all ones >>a X -> -1 // Do not return Op0 because it may contain undef elements if it's a vector. if (match(Op0, m_AllOnes())) return Constant::getAllOnesValue(Op0->getType()); // (X << A) >> A -> X Value *X; if (match(Op0, m_NSWShl(m_Value(X), m_Specific(Op1)))) return X; // Arithmetic shifting an all-sign-bit value is a no-op. 
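  // Editor's note (illustrative example, not in the original source): if Op0 is
  // "sext i1 %b to i32", every bit is a copy of the sign bit (the value is 0 or
  // -1), so "ashr i32 Op0, %n" is just Op0.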
unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (NumSignBits == Op0->getType()->getScalarSizeInBits()) return Op0; return nullptr; } Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, ICmpInst *UnsignedICmp, bool IsAnd) { Value *X, *Y; ICmpInst::Predicate EqPred; if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(Y), m_Zero())) || !ICmpInst::isEquality(EqPred)) return nullptr; ICmpInst::Predicate UnsignedPred; if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Value(X), m_Specific(Y))) && ICmpInst::isUnsigned(UnsignedPred)) ; else if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Specific(Y), m_Value(X))) && ICmpInst::isUnsigned(UnsignedPred)) UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); else return nullptr; // X < Y && Y != 0 --> X < Y // X < Y || Y != 0 --> Y != 0 if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE) return IsAnd ? UnsignedICmp : ZeroICmp; // X >= Y || Y != 0 --> true // X >= Y || Y == 0 --> X >= Y if (UnsignedPred == ICmpInst::ICMP_UGE && !IsAnd) { if (EqPred == ICmpInst::ICMP_NE) return getTrue(UnsignedICmp->getType()); return UnsignedICmp; } // X < Y && Y == 0 --> false if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ && IsAnd) return getFalse(UnsignedICmp->getType()); return nullptr; } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; Value *A ,*B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; // We have (icmp Pred0, A, B) & (icmp Pred1, A, B). // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we // can eliminate Op1 from this 'and'. if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1)) return Op0; // Check for any combination of predicates that are guaranteed to be disjoint. if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) || (Pred0 == ICmpInst::ICMP_EQ && ICmpInst::isFalseWhenEqual(Pred1)) || (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT) || (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT)) return getFalse(Op0->getType()); return nullptr; } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; Value *A ,*B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; // We have (icmp Pred0, A, B) | (icmp Pred1, A, B). // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we // can eliminate Op0 from this 'or'. if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1)) return Op1; // Check for any combination of predicates that cover the entire range of // possibilities. 
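  // Editor's note (illustrative example, not in the original source):
  // "(icmp sle %a, %b) | (icmp sge %a, %b)" covers every ordering of %a and %b,
  // so the check below folds it to true; the same holds for any pair of
  // inverse predicates over identical operands.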
if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) || (Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) || (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) || (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE)) return getTrue(Op0->getType()); return nullptr; } /// Test if a pair of compares with a shared operand and 2 constants has an /// empty set intersection, full set union, or if one compare is a superset of /// the other. static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd) { // Look for this pattern: {and/or} (icmp X, C0), (icmp X, C1)). if (Cmp0->getOperand(0) != Cmp1->getOperand(0)) return nullptr; const APInt *C0, *C1; if (!match(Cmp0->getOperand(1), m_APInt(C0)) || !match(Cmp1->getOperand(1), m_APInt(C1))) return nullptr; auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0); auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1); // For and-of-compares, check if the intersection is empty: // (icmp X, C0) && (icmp X, C1) --> empty set --> false if (IsAnd && Range0.intersectWith(Range1).isEmptySet()) return getFalse(Cmp0->getType()); // For or-of-compares, check if the union is full: // (icmp X, C0) || (icmp X, C1) --> full set --> true if (!IsAnd && Range0.unionWith(Range1).isFullSet()) return getTrue(Cmp0->getType()); // Is one range a superset of the other? // If this is and-of-compares, take the smaller set: // (icmp sgt X, 4) && (icmp sgt X, 42) --> icmp sgt X, 42 // If this is or-of-compares, take the larger set: // (icmp sgt X, 4) || (icmp sgt X, 42) --> icmp sgt X, 4 if (Range0.contains(Range1)) return IsAnd ? Cmp1 : Cmp0; if (Range1.contains(Range0)) return IsAnd ? Cmp0 : Cmp1; return nullptr; } static Value *simplifyAndOrOfICmpsWithZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd) { ICmpInst::Predicate P0 = Cmp0->getPredicate(), P1 = Cmp1->getPredicate(); if (!match(Cmp0->getOperand(1), m_Zero()) || !match(Cmp1->getOperand(1), m_Zero()) || P0 != P1) return nullptr; if ((IsAnd && P0 != ICmpInst::ICMP_NE) || (!IsAnd && P1 != ICmpInst::ICMP_EQ)) return nullptr; // We have either "(X == 0 || Y == 0)" or "(X != 0 && Y != 0)". Value *X = Cmp0->getOperand(0); Value *Y = Cmp1->getOperand(0); // If one of the compares is a masked version of a (not) null check, then // that compare implies the other, so we eliminate the other. Optionally, look // through a pointer-to-int cast to match a null check of a pointer type. // (X == 0) || (([ptrtoint] X & ?) == 0) --> ([ptrtoint] X & ?) == 0 // (X == 0) || ((? & [ptrtoint] X) == 0) --> (? & [ptrtoint] X) == 0 // (X != 0) && (([ptrtoint] X & ?) != 0) --> ([ptrtoint] X & ?) != 0 // (X != 0) && ((? & [ptrtoint] X) != 0) --> (? & [ptrtoint] X) != 0 if (match(Y, m_c_And(m_Specific(X), m_Value())) || match(Y, m_c_And(m_PtrToInt(m_Specific(X)), m_Value()))) return Cmp1; // (([ptrtoint] Y & ?) == 0) || (Y == 0) --> ([ptrtoint] Y & ?) == 0 // ((? & [ptrtoint] Y) == 0) || (Y == 0) --> (? & [ptrtoint] Y) == 0 // (([ptrtoint] Y & ?) != 0) && (Y != 0) --> ([ptrtoint] Y & ?) != 0 // ((? & [ptrtoint] Y) != 0) && (Y != 0) --> (? 
& [ptrtoint] Y) != 0 if (match(X, m_c_And(m_Specific(Y), m_Value())) || match(X, m_c_And(m_PtrToInt(m_Specific(Y)), m_Value()))) return Cmp0; return nullptr; } static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) & (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1)))) return nullptr; if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value()))) return nullptr; auto *AddInst = cast(Op0->getOperand(0)); if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; Type *ITy = Op0->getType(); bool isNSW = AddInst->hasNoSignedWrap(); bool isNUW = AddInst->hasNoUnsignedWrap(); const APInt Delta = *C1 - *C0; if (C0->isStrictlyPositive()) { if (Delta == 2) { if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_SGT) return getFalse(ITy); if (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT && isNSW) return getFalse(ITy); } if (Delta == 1) { if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_SGT) return getFalse(ITy); if (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGT && isNSW) return getFalse(ITy); } } if (C0->getBoolValue() && isNUW) { if (Delta == 2) if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Delta == 1) if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGT) return getFalse(ITy); } return nullptr; } static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) return X; if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0)) return X; if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) return X; if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0)) return X; return nullptr; } static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) | (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1)))) return nullptr; if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value()))) return nullptr; auto *AddInst = cast(Op0->getOperand(0)); if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; Type *ITy = Op0->getType(); bool isNSW = AddInst->hasNoSignedWrap(); bool isNUW = AddInst->hasNoUnsignedWrap(); const APInt Delta = *C1 - *C0; if (C0->isStrictlyPositive()) { if (Delta == 2) { if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE) return getTrue(ITy); if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW) return getTrue(ITy); } if (Delta == 1) { if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE) return getTrue(ITy); if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW) return getTrue(ITy); } } if (C0->getBoolValue() && isNUW) { if (Delta == 2) if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE) return getTrue(ITy); if (Delta == 1) if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE) return getTrue(ITy); } return nullptr; } static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) return X; if (Value *X = 
          simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false))
    return X;

  if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1))
    return X;
  if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0))
    return X;

  if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false))
    return X;

  if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false))
    return X;

  if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1))
    return X;
  if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0))
    return X;

  return nullptr;
}

static Value *simplifyAndOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) {
  Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
  Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
  if (LHS0->getType() != RHS0->getType())
    return nullptr;

  FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
  if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
      (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
    // (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y
    // (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X
    // (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y
    // (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X
    // (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y
    // (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X
    // (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y
    // (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X
    if ((isKnownNeverNaN(LHS0) && (LHS1 == RHS0 || LHS1 == RHS1)) ||
        (isKnownNeverNaN(LHS1) && (LHS0 == RHS0 || LHS0 == RHS1)))
      return RHS;

    // (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y
    // (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X
    // (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y
    // (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X
    // (fcmp uno X, Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y
    // (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X
    // (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y
    // (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X
    if ((isKnownNeverNaN(RHS0) && (RHS1 == LHS0 || RHS1 == LHS1)) ||
        (isKnownNeverNaN(RHS1) && (RHS0 == LHS0 || RHS0 == LHS1)))
      return LHS;
  }

  return nullptr;
}

static Value *simplifyAndOrOfCmps(Value *Op0, Value *Op1, bool IsAnd) {
  // Look through casts of the 'and' operands to find compares.
  auto *Cast0 = dyn_cast<CastInst>(Op0);
  auto *Cast1 = dyn_cast<CastInst>(Op1);
  if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() &&
      Cast0->getSrcTy() == Cast1->getSrcTy()) {
    Op0 = Cast0->getOperand(0);
    Op1 = Cast1->getOperand(0);
  }

  Value *V = nullptr;
  auto *ICmp0 = dyn_cast<ICmpInst>(Op0);
  auto *ICmp1 = dyn_cast<ICmpInst>(Op1);
  if (ICmp0 && ICmp1)
    V = IsAnd ? simplifyAndOfICmps(ICmp0, ICmp1) :
                simplifyOrOfICmps(ICmp0, ICmp1);

  auto *FCmp0 = dyn_cast<FCmpInst>(Op0);
  auto *FCmp1 = dyn_cast<FCmpInst>(Op1);
  if (FCmp0 && FCmp1)
    V = simplifyAndOrOfFCmps(FCmp0, FCmp1, IsAnd);

  if (!V)
    return nullptr;
  if (!Cast0)
    return V;

  // If we looked through casts, we can only handle a constant simplification
  // because we are not allowed to create a cast instruction here.
  if (auto *C = dyn_cast<Constant>(V))
    return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType());

  return nullptr;
}

/// Given operands for an And, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q)) return C; // X & undef -> 0 if (match(Op1, m_Undef())) return Constant::getNullValue(Op0->getType()); // X & X = X if (Op0 == Op1) return Op0; // X & 0 = 0 if (match(Op1, m_Zero())) return Constant::getNullValue(Op0->getType()); // X & -1 = X if (match(Op1, m_AllOnes())) return Op0; // A & ~A = ~A & A = 0 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getNullValue(Op0->getType()); // (A | ?) & A = A if (match(Op0, m_c_Or(m_Specific(Op1), m_Value()))) return Op1; // A & (A | ?) = A if (match(Op1, m_c_Or(m_Specific(Op0), m_Value()))) return Op0; // A mask that only clears known zeros of a shifted value is a no-op. Value *X; const APInt *Mask; const APInt *ShAmt; if (match(Op1, m_APInt(Mask))) { // If all bits in the inverted and shifted mask are clear: // and (shl X, ShAmt), Mask --> shl X, ShAmt if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) && (~(*Mask)).lshr(*ShAmt).isNullValue()) return Op0; // If all bits in the inverted and shifted mask are clear: // and (lshr X, ShAmt), Mask --> lshr X, ShAmt if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) && (~(*Mask)).shl(*ShAmt).isNullValue()) return Op0; } // A & (-A) = A if A is a power of two or zero. if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { if (isKnownToBeAPowerOfTwo(Op0, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Op0; if (isKnownToBeAPowerOfTwo(Op1, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Op1; } if (Value *V = simplifyAndOrOfCmps(Op0, Op1, true)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // And distributes over Or. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Or, Q, MaxRecurse)) return V; // And distributes over Xor. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Xor, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; + // Assuming the effective width of Y is not larger than A, i.e. all bits + // from X and Y are disjoint in (X << A) | Y, + // if the mask of this AND op covers all bits of X or Y, while it covers + // no bits from the other, we can bypass this AND op. E.g., + // ((X << A) | Y) & Mask -> Y, + // if Mask = ((1 << effective_width_of(Y)) - 1) + // ((X << A) | Y) & Mask -> X << A, + // if Mask = ((1 << effective_width_of(X)) - 1) << A + // SimplifyDemandedBits in InstCombine can optimize the general case. + // This pattern aims to help other passes for a common case. 
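+ // Editor's note (illustrative example, not part of the original change):
+ // with i32 operands and "%o = or (shl nuw %x, 8), %y" where %y is known to
+ // fit in the low 8 bits, "and %o, 255" folds to %y below, and if %x is also
+ // known to fit in 8 bits, "and %o, 65280" folds to the "shl nuw %x, 8" value.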
+ Value *Y, *XShifted; + if (match(Op1, m_APInt(Mask)) && + match(Op0, m_c_Or(m_CombineAnd(m_NUWShl(m_Value(X), m_APInt(ShAmt)), + m_Value(XShifted)), + m_Value(Y)))) { + const unsigned ShftCnt = ShAmt->getZExtValue(); + const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + const unsigned Width = Op0->getType()->getScalarSizeInBits(); + const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros(); + if (EffWidthY <= ShftCnt) { + const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI, + Q.DT); + const unsigned EffWidthX = Width - XKnown.countMinLeadingZeros(); + const APInt EffBitsY = APInt::getLowBitsSet(Width, EffWidthY); + const APInt EffBitsX = APInt::getLowBitsSet(Width, EffWidthX) << ShftCnt; + // If the mask is extracting all bits from X or Y as is, we can skip + // this AND op. + if (EffBitsY.isSubsetOf(*Mask) && !EffBitsX.intersects(*Mask)) + return Y; + if (EffBitsX.isSubsetOf(*Mask) && !EffBitsY.intersects(*Mask)) + return XShifted; + } + } + return nullptr; } Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for an Or, see if we can fold the result. /// If not, this returns null. static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q)) return C; // X | undef -> -1 // X | -1 = -1 // Do not return Op1 because it may contain undef elements if it's a vector. if (match(Op1, m_Undef()) || match(Op1, m_AllOnes())) return Constant::getAllOnesValue(Op0->getType()); // X | X = X // X | 0 = X if (Op0 == Op1 || match(Op1, m_Zero())) return Op0; // A | ~A = ~A | A = -1 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); // (A & ?) | A = A if (match(Op0, m_c_And(m_Specific(Op1), m_Value()))) return Op1; // A | (A & ?) = A if (match(Op1, m_c_And(m_Specific(Op0), m_Value()))) return Op0; // ~(A & ?) | A = -1 if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op1->getType()); // A | ~(A & ?) = -1 if (match(Op1, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op0->getType()); Value *A, *B; // (A & ~B) | (A ^ B) -> (A ^ B) // (~B & A) | (A ^ B) -> (A ^ B) // (A & ~B) | (B ^ A) -> (B ^ A) // (~B & A) | (B ^ A) -> (B ^ A) if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op1; // Commute the 'or' operands. 
// (A ^ B) | (A & ~B) -> (A ^ B) // (A ^ B) | (~B & A) -> (A ^ B) // (B ^ A) | (A & ~B) -> (B ^ A) // (B ^ A) | (~B & A) -> (B ^ A) if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; // (A & B) | (~A ^ B) -> (~A ^ B) // (B & A) | (~A ^ B) -> (~A ^ B) // (A & B) | (B ^ ~A) -> (B ^ ~A) // (B & A) | (B ^ ~A) -> (B ^ ~A) if (match(Op0, m_And(m_Value(A), m_Value(B))) && (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) return Op1; // (~A ^ B) | (A & B) -> (~A ^ B) // (~A ^ B) | (B & A) -> (~A ^ B) // (B ^ ~A) | (A & B) -> (B ^ ~A) // (B ^ ~A) | (B & A) -> (B ^ ~A) if (match(Op1, m_And(m_Value(A), m_Value(B))) && (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; if (Value *V = simplifyAndOrOfCmps(Op0, Op1, false)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // Or distributes over And. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // (A & C1)|(B & C2) const APInt *C1, *C2; if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && match(Op1, m_And(m_Value(B), m_APInt(C2)))) { if (*C1 == ~*C2) { // (A & C1)|(B & C2) // If we have: ((V + N) & C1) | (V & C2) // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 // replace with V+N. Value *N; if (C2->isMask() && // C2 == 0+1+ match(A, m_c_Add(m_Specific(B), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return A; } // Or commutes, try both ways. if (C1->isMask() && match(B, m_c_Add(m_Specific(A), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return B; } } } // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a Xor, see if we can fold the result. /// If not, this returns null. static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q)) return C; // A ^ undef -> undef if (match(Op1, m_Undef())) return Op1; // A ^ 0 = A if (match(Op1, m_Zero())) return Op0; // A ^ A = 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // A ^ ~A = ~A ^ A = -1 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); // Try some generic simplifications for associative operations. 
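  // Editor's note (illustrative example, not in the original source): the
  // reassociation below handles cases like "(%a ^ %b) ^ %b": "%b ^ %b" folds
  // to 0 and "%a ^ 0" folds to %a, so the whole expression becomes %a.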
if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q, MaxRecurse)) return V; // Threading Xor over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A ^ select(cond, B, C)" means evaluating // "A^B" and "A^C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A^B" and "A^C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit); } static Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } /// Rummage around inside V looking for something equivalent to the comparison /// "LHS Pred RHS". Return such a value if found, otherwise return null. /// Helper function for analyzing max/min idioms. static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast(V); if (!SI) return nullptr; CmpInst *Cmp = dyn_cast(SI->getCondition()); if (!Cmp) return nullptr; Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1); if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS) return Cmp; if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) && LHS == CmpRHS && RHS == CmpLHS) return Cmp; return nullptr; } // A significant optimization not implemented here is assuming that alloca // addresses are not equal to incoming argument values. They don't *alias*, // as we say, but that doesn't mean they aren't equal, so we take a // conservative approach. // // This is inspired in part by C++11 5.10p1: // "Two pointers of the same type compare equal if and only if they are both // null, both point to the same function, or both represent the same // address." // // This is pretty permissive. // // It's also partly due to C11 6.5.9p6: // "Two pointers compare equal if and only if both are null pointers, both are // pointers to the same object (including a pointer to an object and a // subobject at its beginning) or function, both are pointers to one past the // last element of the same array object, or one is a pointer to one past the // end of one array object and the other is a pointer to the start of a // different array object that happens to immediately follow the first array // object in the address space.) // // C11's version is more restrictive, however there's no reason why an argument // couldn't be a one-past-the-end value for a stack object in the caller and be // equal to the beginning of a stack object in the callee. // // If the C and C++ standards are ever made sufficiently restrictive in this // area, it may be possible to update LLVM's semantics accordingly and reinstate // this optimization. static Constant * computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, CmpInst::Predicate Pred, AssumptionCache *AC, const Instruction *CxtI, Value *LHS, Value *RHS) { // First, skip past any trivial no-ops. LHS = LHS->stripPointerCasts(); RHS = RHS->stripPointerCasts(); // A non-null pointer is not equal to a null pointer. 
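// Illustrative note (added commentary, not part of the original source): if
// LHS is known non-null -- e.g. the result of an alloca -- and RHS is the
// null pointer constant, then 'icmp eq' folds to false and 'icmp ne' folds
// to true, which is what the check below returns via
// !CmpInst::isTrueWhenEqual(Pred).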
if (llvm::isKnownNonZero(LHS, DL) && isa(RHS) && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // We can only fold certain predicates on pointer comparisons. switch (Pred) { default: return nullptr; // Equality comaprisons are easy to fold. case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: break; // We can only handle unsigned relational comparisons because 'inbounds' on // a GEP only protects against unsigned wrapping. case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: // However, we have to switch them to their signed variants to handle // negative indices from the base pointer. Pred = ICmpInst::getSignedPredicate(Pred); break; } // Strip off any constant offsets so that we can reason about them. // It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets // here and compare base addresses like AliasAnalysis does, however there are // numerous hazards. AliasAnalysis and its utilities rely on special rules // governing loads and stores which don't apply to icmps. Also, AliasAnalysis // doesn't need to guarantee pointer inequality when it says NoAlias. Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are related via constant offsets to the same base // value, we can replace it with an icmp which just compares the offsets. if (LHS == RHS) return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); // Various optimizations for (in)equality comparisons. if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { // Different non-empty allocations that exist at the same time have // different addresses (if the program can tell). Global variables always // exist, so they always exist during the lifetime of each other and all // allocas. Two different allocas usually have different addresses... // // However, if there's an @llvm.stackrestore dynamically in between two // allocas, they may have the same address. It's tempting to reduce the // scope of the problem by only looking at *static* allocas here. That would // cover the majority of allocas while significantly reducing the likelihood // of having an @llvm.stackrestore pop up in the middle. However, it's not // actually impossible for an @llvm.stackrestore to pop up in the middle of // an entry block. Also, if we have a block that's not attached to a // function, we can't tell if it's "static" under the current definition. // Theoretically, this problem could be fixed by creating a new kind of // instruction kind specifically for static allocas. Such a new instruction // could be required to be at the top of the entry block, thus preventing it // from being subject to a @llvm.stackrestore. Instcombine could even // convert regular allocas into these special allocas. It'd be nifty. // However, until then, this problem remains open. // // So, we'll assume that two non-empty allocas have different addresses // for now. // // With all that, if the offsets are within the bounds of their allocations // (and not one-past-the-end! so we can't use inbounds!), and their // allocations aren't the same, the pointers are not equal. // // Note that it's not necessary to check for LHS being a global variable // address, due to canonicalization and constant folding. 
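// Illustrative example (added note, not from the original comment): under the
// assumption above, a constant in-bounds offset into an alloca can never be
// equal to a constant in-bounds offset into another alloca or into a global
// variable, so a compare such as
//   icmp eq i8* (in-bounds offset of an alloca), (in-bounds offset of a global)
// folds to false, and 'icmp ne' folds to true.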
if (isa(LHS) && (isa(RHS) || isa(RHS))) { ConstantInt *LHSOffsetCI = dyn_cast(LHSOffset); ConstantInt *RHSOffsetCI = dyn_cast(RHSOffset); uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; Opts.NullIsUnknownSize = NullPointerIsDefined(cast(LHS)->getFunction()); if (LHSOffsetCI && RHSOffsetCI && getObjectSize(LHS, LHSSize, DL, TLI, Opts) && getObjectSize(RHS, RHSSize, DL, TLI, Opts)) { const APInt &LHSOffsetValue = LHSOffsetCI->getValue(); const APInt &RHSOffsetValue = RHSOffsetCI->getValue(); if (!LHSOffsetValue.isNegative() && !RHSOffsetValue.isNegative() && LHSOffsetValue.ult(LHSSize) && RHSOffsetValue.ult(RHSSize)) { return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } } // Repeat the above check but this time without depending on DataLayout // or being able to compute a precise size. if (!cast(LHS->getType())->isEmptyTy() && !cast(RHS->getType())->isEmptyTy() && LHSOffset->isNullValue() && RHSOffset->isNullValue()) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } // Even if an non-inbounds GEP occurs along the path we can still optimize // equality comparisons concerning the result. We avoid walking the whole // chain again by starting where the last calls to // stripAndComputeConstantOffsets left off and accumulate the offsets. Constant *LHSNoBound = stripAndComputeConstantOffsets(DL, LHS, true); Constant *RHSNoBound = stripAndComputeConstantOffsets(DL, RHS, true); if (LHS == RHS) return ConstantExpr::getICmp(Pred, ConstantExpr::getAdd(LHSOffset, LHSNoBound), ConstantExpr::getAdd(RHSOffset, RHSNoBound)); // If one side of the equality comparison must come from a noalias call // (meaning a system memory allocation function), and the other side must // come from a pointer that cannot overlap with dynamically-allocated // memory within the lifetime of the current function (allocas, byval // arguments, globals), then determine the comparison result here. SmallVector LHSUObjs, RHSUObjs; GetUnderlyingObjects(LHS, LHSUObjs, DL); GetUnderlyingObjects(RHS, RHSUObjs, DL); // Is the set of underlying objects all noalias calls? auto IsNAC = [](ArrayRef Objects) { return all_of(Objects, isNoAliasCall); }; // Is the set of underlying objects all things which must be disjoint from // noalias calls. For allocas, we consider only static ones (dynamic // allocas might be transformed into calls to malloc not simultaneously // live with the compared-to allocation). For globals, we exclude symbols // that might be resolve lazily to symbols in another dynamically-loaded // library (and, thus, could be malloc'ed by the implementation). auto IsAllocDisjoint = [](ArrayRef Objects) { return all_of(Objects, [](Value *V) { if (const AllocaInst *AI = dyn_cast(V)) return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); if (const GlobalValue *GV = dyn_cast(V)) return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && !GV->isThreadLocal(); if (const Argument *A = dyn_cast(V)) return A->hasByValAttr(); return false; }); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || (IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs))) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // Fold comparisons for non-escaping pointer even if the allocation call // cannot be elided. We cannot fold malloc comparison to null. Also, the // dynamic allocation call could be either of the operands. 
Value *MI = nullptr; if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, DL, 0, nullptr, CxtI, DT)) MI = LHS; else if (isAllocLikeFn(RHS, TLI) && llvm::isKnownNonZero(LHS, DL, 0, nullptr, CxtI, DT)) MI = RHS; // FIXME: We should also fold the compare when the pointer escapes, but the // compare dominates the pointer escape if (MI && !PointerMayBeCaptured(MI, true, true)) return ConstantInt::get(GetCompareTy(LHS), CmpInst::isFalseWhenEqual(Pred)); } // Otherwise, fail. return nullptr; } /// Fold an icmp when its operands have i1 scalar type. static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { Type *ITy = GetCompareTy(LHS); // The return type. Type *OpTy = LHS->getType(); // The operand type. if (!OpTy->isIntOrIntVectorTy(1)) return nullptr; // A boolean compared to true/false can be simplified in 14 out of the 20 // (10 predicates * 2 constants) possible combinations. Cases not handled here // require a 'not' of the LHS, so those must be transformed in InstCombine. if (match(RHS, m_Zero())) { switch (Pred) { case CmpInst::ICMP_NE: // X != 0 -> X case CmpInst::ICMP_UGT: // X >u 0 -> X case CmpInst::ICMP_SLT: // X X return LHS; case CmpInst::ICMP_ULT: // X false case CmpInst::ICMP_SGT: // X >s 0 -> false return getFalse(ITy); case CmpInst::ICMP_UGE: // X >=u 0 -> true case CmpInst::ICMP_SLE: // X <=s 0 -> true return getTrue(ITy); default: break; } } else if (match(RHS, m_One())) { switch (Pred) { case CmpInst::ICMP_EQ: // X == 1 -> X case CmpInst::ICMP_UGE: // X >=u 1 -> X case CmpInst::ICMP_SLE: // X <=s -1 -> X return LHS; case CmpInst::ICMP_UGT: // X >u 1 -> false case CmpInst::ICMP_SLT: // X false return getFalse(ITy); case CmpInst::ICMP_ULE: // X <=u 1 -> true case CmpInst::ICMP_SGE: // X >=s -1 -> true return getTrue(ITy); default: break; } } switch (Pred) { default: break; case ICmpInst::ICMP_UGE: if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; case ICmpInst::ICMP_SGE: /// For signed comparison, the values for an i1 are 0 and -1 /// respectively. This maps into a truth table of: /// LHS | RHS | LHS >=s RHS | LHS implies RHS /// 0 | 0 | 1 (0 >= 0) | 1 /// 0 | 1 | 1 (0 >= -1) | 1 /// 1 | 0 | 0 (-1 >= 0) | 0 /// 1 | 1 | 1 (-1 >= -1) | 1 if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; case ICmpInst::ICMP_ULE: if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; } return nullptr; } /// Try hard to fold icmp with zero RHS because this is a common case. static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { if (!match(RHS, m_Zero())) return nullptr; Type *ITy = GetCompareTy(LHS); // The return type. 
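// Illustrative examples (added commentary, not in the original source):
//   icmp ult i32 %x, 0  --> false   (nothing is unsigned-less-than zero)
//   icmp uge i32 %x, 0  --> true
//   icmp eq  i32 %x, 0  --> false   when %x is known non-zero
//   icmp slt i32 %x, 0  --> true or false when the sign bit of %x is known
// The switch below implements these cases predicate by predicate.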
switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_ULT: return getFalse(ITy); case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; case ICmpInst::ICMP_SLT: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getTrue(ITy); if (LHSKnown.isNonNegative()) return getFalse(ITy); break; } case ICmpInst::ICMP_SLE: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getTrue(ITy); if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; } case ICmpInst::ICMP_SGE: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getFalse(ITy); if (LHSKnown.isNonNegative()) return getTrue(ITy); break; } case ICmpInst::ICMP_SGT: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getFalse(ITy); if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; } } return nullptr; } /// Many binary operators with a constant operand have an easy-to-compute /// range of outputs. This can be used to fold a comparison to always true or /// always false. static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper) { unsigned Width = Lower.getBitWidth(); const APInt *C; switch (BO.getOpcode()) { case Instruction::Add: if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) { // FIXME: If we have both nuw and nsw, we should reduce the range further. if (BO.hasNoUnsignedWrap()) { // 'add nuw x, C' produces [C, UINT_MAX]. Lower = *C; } else if (BO.hasNoSignedWrap()) { if (C->isNegative()) { // 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C]. Lower = APInt::getSignedMinValue(Width); Upper = APInt::getSignedMaxValue(Width) + *C + 1; } else { // 'add nsw x, +C' produces [SINT_MIN + C, SINT_MAX]. Lower = APInt::getSignedMinValue(Width) + *C; Upper = APInt::getSignedMaxValue(Width) + 1; } } } break; case Instruction::And: if (match(BO.getOperand(1), m_APInt(C))) // 'and x, C' produces [0, C]. Upper = *C + 1; break; case Instruction::Or: if (match(BO.getOperand(1), m_APInt(C))) // 'or x, C' produces [C, UINT_MAX]. Lower = *C; break; case Instruction::AShr: if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) { // 'ashr x, C' produces [INT_MIN >> C, INT_MAX >> C]. Lower = APInt::getSignedMinValue(Width).ashr(*C); Upper = APInt::getSignedMaxValue(Width).ashr(*C) + 1; } else if (match(BO.getOperand(0), m_APInt(C))) { unsigned ShiftAmount = Width - 1; if (!C->isNullValue() && BO.isExact()) ShiftAmount = C->countTrailingZeros(); if (C->isNegative()) { // 'ashr C, x' produces [C, C >> (Width-1)] Lower = *C; Upper = C->ashr(ShiftAmount) + 1; } else { // 'ashr C, x' produces [C >> (Width-1), C] Lower = C->ashr(ShiftAmount); Upper = *C + 1; } } break; case Instruction::LShr: if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) { // 'lshr x, C' produces [0, UINT_MAX >> C]. Upper = APInt::getAllOnesValue(Width).lshr(*C) + 1; } else if (match(BO.getOperand(0), m_APInt(C))) { // 'lshr C, x' produces [C >> (Width-1), C]. 
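// Worked example (illustrative, not from the original source): for i8 and
// C = 12, 'lshr i8 12, %x' always lies in [12 >> 7, 12] = [0, 12], so a
// later compare such as 'icmp ugt (lshr i8 12, %x), 12' folds to false.
// The code below computes this [C >> (Width-1), C] range, using a tighter
// lower bound when the shift is marked 'exact'.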
unsigned ShiftAmount = Width - 1; if (!C->isNullValue() && BO.isExact()) ShiftAmount = C->countTrailingZeros(); Lower = C->lshr(ShiftAmount); Upper = *C + 1; } break; case Instruction::Shl: if (match(BO.getOperand(0), m_APInt(C))) { if (BO.hasNoUnsignedWrap()) { // 'shl nuw C, x' produces [C, C << CLZ(C)] Lower = *C; Upper = Lower.shl(Lower.countLeadingZeros()) + 1; } else if (BO.hasNoSignedWrap()) { // TODO: What if both nuw+nsw? if (C->isNegative()) { // 'shl nsw C, x' produces [C << CLO(C)-1, C] unsigned ShiftAmount = C->countLeadingOnes() - 1; Lower = C->shl(ShiftAmount); Upper = *C + 1; } else { // 'shl nsw C, x' produces [C, C << CLZ(C)-1] unsigned ShiftAmount = C->countLeadingZeros() - 1; Lower = *C; Upper = C->shl(ShiftAmount) + 1; } } } break; case Instruction::SDiv: if (match(BO.getOperand(1), m_APInt(C))) { APInt IntMin = APInt::getSignedMinValue(Width); APInt IntMax = APInt::getSignedMaxValue(Width); if (C->isAllOnesValue()) { // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX] // where C != -1 and C != 0 and C != 1 Lower = IntMin + 1; Upper = IntMax + 1; } else if (C->countLeadingZeros() < Width - 1) { // 'sdiv x, C' produces [INT_MIN / C, INT_MAX / C] // where C != -1 and C != 0 and C != 1 Lower = IntMin.sdiv(*C); Upper = IntMax.sdiv(*C); if (Lower.sgt(Upper)) std::swap(Lower, Upper); Upper = Upper + 1; assert(Upper != Lower && "Upper part of range has wrapped!"); } } else if (match(BO.getOperand(0), m_APInt(C))) { if (C->isMinSignedValue()) { // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2]. Lower = *C; Upper = Lower.lshr(1) + 1; } else { // 'sdiv C, x' produces [-|C|, |C|]. Upper = C->abs() + 1; Lower = (-Upper) + 1; } } break; case Instruction::UDiv: if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) { // 'udiv x, C' produces [0, UINT_MAX / C]. Upper = APInt::getMaxValue(Width).udiv(*C) + 1; } else if (match(BO.getOperand(0), m_APInt(C))) { // 'udiv C, x' produces [0, C]. Upper = *C + 1; } break; case Instruction::SRem: if (match(BO.getOperand(1), m_APInt(C))) { // 'srem x, C' produces (-|C|, |C|). Upper = C->abs(); Lower = (-Upper) + 1; } break; case Instruction::URem: if (match(BO.getOperand(1), m_APInt(C))) // 'urem x, C' produces [0, C). Upper = *C; break; default: break; } } static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, Value *RHS) { Type *ITy = GetCompareTy(RHS); // The return type. Value *X; // Sign-bit checks can be optimized to true/false after unsigned // floating-point casts: // icmp slt (bitcast (uitofp X)), 0 --> false // icmp sgt (bitcast (uitofp X)), -1 --> true if (match(LHS, m_BitCast(m_UIToFP(m_Value(X))))) { if (Pred == ICmpInst::ICMP_SLT && match(RHS, m_Zero())) return ConstantInt::getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT && match(RHS, m_AllOnes())) return ConstantInt::getTrue(ITy); } const APInt *C; if (!match(RHS, m_APInt(C))) return nullptr; // Rule out tautological comparisons (eg., ult 0 or uge 0). ConstantRange RHS_CR = ConstantRange::makeExactICmpRegion(Pred, *C); if (RHS_CR.isEmptySet()) return ConstantInt::getFalse(ITy); if (RHS_CR.isFullSet()) return ConstantInt::getTrue(ITy); // Find the range of possible values for binary operators. unsigned Width = C->getBitWidth(); APInt Lower = APInt(Width, 0); APInt Upper = APInt(Width, 0); if (auto *BO = dyn_cast(LHS)) setLimitsForBinOp(*BO, Lower, Upper); ConstantRange LHS_CR = Lower != Upper ? 
ConstantRange(Lower, Upper) : ConstantRange(Width, true); if (auto *I = dyn_cast(LHS)) if (auto *Ranges = I->getMetadata(LLVMContext::MD_range)) LHS_CR = LHS_CR.intersectWith(getConstantRangeFromMetadata(*Ranges)); if (!LHS_CR.isFullSet()) { if (RHS_CR.contains(LHS_CR)) return ConstantInt::getTrue(ITy); if (RHS_CR.inverse().contains(LHS_CR)) return ConstantInt::getFalse(ITy); } return nullptr; } /// TODO: A large part of this logic is duplicated in InstCombine's /// foldICmpBinOp(). We should be able to share that and avoid the code /// duplication. static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. BinaryOperator *LBO = dyn_cast(LHS); BinaryOperator *RBO = dyn_cast(RHS); if (MaxRecurse && (LBO || RBO)) { // Analyze the case when either LHS or RHS is an add instruction. Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // LHS = A + B (or A and B are null); RHS = C + D (or C and D are null). bool NoLHSWrapProblem = false, NoRHSWrapProblem = false; if (LBO && LBO->getOpcode() == Instruction::Add) { A = LBO->getOperand(0); B = LBO->getOperand(1); NoLHSWrapProblem = ICmpInst::isEquality(Pred) || (CmpInst::isUnsigned(Pred) && LBO->hasNoUnsignedWrap()) || (CmpInst::isSigned(Pred) && LBO->hasNoSignedWrap()); } if (RBO && RBO->getOpcode() == Instruction::Add) { C = RBO->getOperand(0); D = RBO->getOperand(1); NoRHSWrapProblem = ICmpInst::isEquality(Pred) || (CmpInst::isUnsigned(Pred) && RBO->hasNoUnsignedWrap()) || (CmpInst::isSigned(Pred) && RBO->hasNoSignedWrap()); } // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. if ((A == RHS || B == RHS) && NoLHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A, Constant::getNullValue(RHS->getType()), Q, MaxRecurse - 1)) return V; // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. if ((C == LHS || D == LHS) && NoRHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()), C == LHS ? D : C, Q, MaxRecurse - 1)) return V; // icmp (X+Y), (X+Z) -> icmp Y,Z for equalities or if there is no overflow. if (A && C && (A == C || A == D || B == C || B == D) && NoLHSWrapProblem && NoRHSWrapProblem) { // Determine Y and Z in the form icmp (X+Y), (X+Z). Value *Y, *Z; if (A == C) { // C + B == C + D -> B == D Y = B; Z = D; } else if (A == D) { // D + B == C + D -> B == C Y = B; Z = C; } else if (B == C) { // A + C == C + D -> A == D Y = A; Z = D; } else { assert(B == D); // A + D == C + D -> A == C Y = A; Z = C; } if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse - 1)) return V; } } { Value *Y = nullptr; // icmp pred (or X, Y), X if (LBO && match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (RHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); if (RHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SLT ? 
getFalse(ITy) : getTrue(ITy); } } // icmp pred X, (or X, Y) if (RBO && match(RBO, m_c_Or(m_Value(Y), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy); if (LHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy); } } } // icmp pred (and X, Y), X if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // icmp pred X, (and X, Y) if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); } // 0 - (zext X) pred C if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) { if (ConstantInt *RHSC = dyn_cast(RHS)) { if (RHSC->getValue().isStrictlyPositive()) { if (Pred == ICmpInst::ICMP_SLT) return ConstantInt::getTrue(RHSC->getContext()); if (Pred == ICmpInst::ICMP_SGE) return ConstantInt::getFalse(RHSC->getContext()); if (Pred == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(RHSC->getContext()); if (Pred == ICmpInst::ICMP_NE) return ConstantInt::getTrue(RHSC->getContext()); } if (RHSC->getValue().isNonNegative()) { if (Pred == ICmpInst::ICMP_SLE) return ConstantInt::getTrue(RHSC->getContext()); if (Pred == ICmpInst::ICMP_SGT) return ConstantInt::getFalse(RHSC->getContext()); } } } // icmp pred (urem X, Y), Y if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { switch (Pred) { default: break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: { KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getFalse(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return getTrue(ITy); } } // icmp pred X, (urem Y, X) if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { switch (Pred) { default: break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: { KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return getFalse(ITy); } } // x >> y <=u x // x udiv y <=u x. if (LBO && (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || match(LBO, m_UDiv(m_Specific(RHS), m_Value())))) { // icmp pred (X op Y), X if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // x >=u x >> y // x >=u x udiv y. 
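// Illustrative instance (added note): a logical shift right or an unsigned
// divide can never increase an unsigned value, so
//   icmp uge i32 %x, (lshr i32 %x, %y)  --> true
//   icmp ult i32 %x, (udiv i32 %x, %y)  --> false
// which is what the symmetric check below returns for the RHS form.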
if (RBO && (match(RBO, m_LShr(m_Specific(LHS), m_Value())) || match(RBO, m_UDiv(m_Specific(LHS), m_Value())))) { // icmp pred X, (X op Y) if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); } // handle: // CI2 << X == CI // CI2 << X != CI // // where CI2 is a power of 2 and CI isn't if (auto *CI = dyn_cast(RHS)) { const APInt *CI2Val, *CIVal = &CI->getValue(); if (LBO && match(LBO, m_Shl(m_APInt(CI2Val), m_Value())) && CI2Val->isPowerOf2()) { if (!CIVal->isPowerOf2()) { // CI2 << X can equal zero in some circumstances, // this simplification is unsafe if CI is zero. // // We know it is safe if: // - The shift is nsw, we can't shift out the one bit. // - The shift is nuw, we can't shift out the one bit. // - CI2 is one // - CI isn't zero if (LBO->hasNoSignedWrap() || LBO->hasNoUnsignedWrap() || CI2Val->isOneValue() || !CI->isZero()) { if (Pred == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(RHS->getContext()); if (Pred == ICmpInst::ICMP_NE) return ConstantInt::getTrue(RHS->getContext()); } } if (CIVal->isSignMask() && CI2Val->isOneValue()) { if (Pred == ICmpInst::ICMP_UGT) return ConstantInt::getFalse(RHS->getContext()); if (Pred == ICmpInst::ICMP_ULE) return ConstantInt::getTrue(RHS->getContext()); } } } if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() && LBO->getOperand(1) == RBO->getOperand(1)) { switch (LBO->getOpcode()) { default: break; case Instruction::UDiv: case Instruction::LShr: if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::SDiv: if (!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::AShr: if (!LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::Shl: { bool NUW = LBO->hasNoUnsignedWrap() && RBO->hasNoUnsignedWrap(); bool NSW = LBO->hasNoSignedWrap() && RBO->hasNoSignedWrap(); if (!NUW && !NSW) break; if (!NSW && ICmpInst::isSigned(Pred)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; } } } return nullptr; } /// Simplify integer comparisons where at least one operand of the compare /// matches an integer min/max idiom. static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. Value *A, *B; CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE; CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B". // Signed variants on "max(a,b)>=a -> true". if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // smax(A, B) pred A. EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". // We analyze this as smax(A, B) pred A. P = Pred; } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred smax(A, B). EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". // We analyze this as smax(A, B) swapped-pred A. 
P = CmpInst::getSwappedPredicate(Pred); } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // smin(A, B) pred A. EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". // We analyze this as smax(-A, -B) swapped-pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = CmpInst::getSwappedPredicate(Pred); } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred smin(A, B). EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". // We analyze this as smax(-A, -B) pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = Pred; } if (P != CmpInst::BAD_ICMP_PREDICATE) { // Cases correspond to "max(A, B) p A". switch (P) { default: break; case CmpInst::ICMP_EQ: case CmpInst::ICMP_SLE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: case CmpInst::ICMP_SGT: { CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } case CmpInst::ICMP_SGE: // Always true. return getTrue(ITy); case CmpInst::ICMP_SLT: // Always false. return getFalse(ITy); } } // Unsigned variants on "max(a,b)>=a -> true". P = CmpInst::BAD_ICMP_PREDICATE; if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // umax(A, B) pred A. EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". // We analyze this as umax(A, B) pred A. P = Pred; } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred umax(A, B). EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". // We analyze this as umax(A, B) swapped-pred A. P = CmpInst::getSwappedPredicate(Pred); } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // umin(A, B) pred A. EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". // We analyze this as umax(-A, -B) swapped-pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = CmpInst::getSwappedPredicate(Pred); } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred umin(A, B). EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". // We analyze this as umax(-A, -B) pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = Pred; } if (P != CmpInst::BAD_ICMP_PREDICATE) { // Cases correspond to "max(A, B) p A". switch (P) { default: break; case CmpInst::ICMP_EQ: case CmpInst::ICMP_ULE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. 
if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT: { CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } case CmpInst::ICMP_UGE: // Always true. return getTrue(ITy); case CmpInst::ICMP_ULT: // Always false. return getFalse(ITy); } } // Variants on "max(x,y) >= min(x,z)". Value *C, *D; if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && match(RHS, m_SMin(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // max(x, ?) pred min(x, ?). if (Pred == CmpInst::ICMP_SGE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_SLT) // Always false. return getFalse(ITy); } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && match(RHS, m_SMax(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // min(x, ?) pred max(x, ?). if (Pred == CmpInst::ICMP_SLE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_SGT) // Always false. return getFalse(ITy); } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && match(RHS, m_UMin(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // max(x, ?) pred min(x, ?). if (Pred == CmpInst::ICMP_UGE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_ULT) // Always false. return getFalse(ITy); } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && match(RHS, m_UMax(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // min(x, ?) pred max(x, ?). if (Pred == CmpInst::ICMP_ULE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_UGT) // Always false. return getFalse(ITy); } return nullptr; } /// Given operands for an ICmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } Type *ITy = GetCompareTy(LHS); // The return type. // icmp X, X -> true/false // icmp X, undef -> true/false because undef could be X. if (LHS == RHS || isa(RHS)) return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred)); if (Value *V = simplifyICmpOfBools(Pred, LHS, RHS, Q)) return V; if (Value *V = simplifyICmpWithZero(Pred, LHS, RHS, Q)) return V; if (Value *V = simplifyICmpWithConstant(Pred, LHS, RHS)) return V; // If both operands have range metadata, use the metadata // to simplify the comparison. 
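// Hypothetical example (added commentary): if %a carries !range [0, 8)
// metadata and %b carries !range [16, 32), then every possible value of %a
// is below every possible value of %b, so 'icmp ult i32 %a, %b' folds to
// true via the satisfying-region check below.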
if (isa(RHS) && isa(LHS)) { auto RHS_Instr = cast(RHS); auto LHS_Instr = cast(LHS); if (RHS_Instr->getMetadata(LLVMContext::MD_range) && LHS_Instr->getMetadata(LLVMContext::MD_range)) { auto RHS_CR = getConstantRangeFromMetadata( *RHS_Instr->getMetadata(LLVMContext::MD_range)); auto LHS_CR = getConstantRangeFromMetadata( *LHS_Instr->getMetadata(LLVMContext::MD_range)); auto Satisfied_CR = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS_CR); if (Satisfied_CR.contains(LHS_CR)) return ConstantInt::getTrue(RHS->getContext()); auto InversedSatisfied_CR = ConstantRange::makeSatisfyingICmpRegion( CmpInst::getInversePredicate(Pred), RHS_CR); if (InversedSatisfied_CR.contains(LHS_CR)) return ConstantInt::getFalse(RHS->getContext()); } } // Compare of cast, for example (zext X) != 0 -> X != 0 if (isa(LHS) && (isa(RHS) || isa(RHS))) { Instruction *LI = cast(LHS); Value *SrcOp = LI->getOperand(0); Type *SrcTy = SrcOp->getType(); Type *DstTy = LI->getType(); // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input // if the integer type is the same size as the pointer type. if (MaxRecurse && isa(LI) && Q.DL.getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) { if (Constant *RHSC = dyn_cast(RHS)) { // Transfer the cast to the constant. if (Value *V = SimplifyICmpInst(Pred, SrcOp, ConstantExpr::getIntToPtr(RHSC, SrcTy), Q, MaxRecurse-1)) return V; } else if (PtrToIntInst *RI = dyn_cast(RHS)) { if (RI->getOperand(0)->getType() == SrcTy) // Compare without the cast. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } } if (isa(LHS)) { // Turn icmp (zext X), (zext Y) into a compare of X and Y if they have the // same type. if (ZExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that signed predicates become unsigned. if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } // Turn icmp (zext X), Cst into a compare of X and Cst if Cst is extended // too. If not, then try to deduce the result of the comparison. else if (ConstantInt *CI = dyn_cast(RHS)) { // Compute the constant that would happen if we truncated to SrcTy then // reextended to DstTy. Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy); Constant *RExt = ConstantExpr::getCast(CastInst::ZExt, Trunc, DstTy); // If the re-extended constant didn't change then this is effectively // also a case of comparing two zero-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are zero while RHS has a non-zero bit // there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); // LHS getContext()); case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return ConstantInt::getTrue(CI->getContext()); // LHS is non-negative. If RHS is negative then LHS >s LHS. If RHS // is non-negative then LHS getValue().isNegative() ? ConstantInt::getTrue(CI->getContext()) : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return CI->getValue().isNegative() ? ConstantInt::getFalse(CI->getContext()) : ConstantInt::getTrue(CI->getContext()); } } } } if (isa(LHS)) { // Turn icmp (sext X), (sext Y) into a compare of X and Y if they have the // same type. 
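// Added note (illustrative): for example, 'icmp slt (sext i8 %a to i32),
// (sext i8 %b to i32)' is equivalent to 'icmp slt i8 %a, %b'; since
// InstSimplify never creates new instructions, the fold below only fires
// when that narrower compare itself simplifies (e.g. when %a and %b are the
// same value, or when it folds to a constant).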
if (SExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that the predicate does not change. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } // Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended // too. If not, then try to deduce the result of the comparison. else if (ConstantInt *CI = dyn_cast(RHS)) { // Compute the constant that would happen if we truncated to SrcTy then // reextended to DstTy. Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy); Constant *RExt = ConstantExpr::getCast(CastInst::SExt, Trunc, DstTy); // If the re-extended constant didn't change then this is effectively // also a case of comparing two sign-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are all equal, while RHS has varying // bits there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_EQ: return ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_NE: return ConstantInt::getTrue(CI->getContext()); // If RHS is non-negative then LHS s RHS. case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: return CI->getValue().isNegative() ? ConstantInt::getTrue(CI->getContext()) : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return CI->getValue().isNegative() ? ConstantInt::getFalse(CI->getContext()) : ConstantInt::getTrue(CI->getContext()); // If LHS is non-negative then LHS u RHS. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: // Comparison is true iff the LHS =s 0. if (MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp, Constant::getNullValue(SrcTy), Q, MaxRecurse-1)) return V; break; } } } } } // icmp eq|ne X, Y -> false|true if X != Y if (ICmpInst::isEquality(Pred) && isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT)) { return Pred == ICmpInst::ICMP_NE ? getTrue(ITy) : getFalse(ITy); } if (Value *V = simplifyICmpWithBinOp(Pred, LHS, RHS, Q, MaxRecurse)) return V; if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse)) return V; // Simplify comparisons of related pointers using a powerful, recursive // GEP-walk when we have target data available.. if (LHS->getType()->isPointerTy()) if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI, LHS, RHS)) return C; if (auto *CLHS = dyn_cast(LHS)) if (auto *CRHS = dyn_cast(RHS)) if (Q.DL.getTypeSizeInBits(CLHS->getPointerOperandType()) == Q.DL.getTypeSizeInBits(CLHS->getType()) && Q.DL.getTypeSizeInBits(CRHS->getPointerOperandType()) == Q.DL.getTypeSizeInBits(CRHS->getType())) if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI, CLHS->getPointerOperand(), CRHS->getPointerOperand())) return C; if (GetElementPtrInst *GLHS = dyn_cast(LHS)) { if (GEPOperator *GRHS = dyn_cast(RHS)) { if (GLHS->getPointerOperand() == GRHS->getPointerOperand() && GLHS->hasAllConstantIndices() && GRHS->hasAllConstantIndices() && (ICmpInst::isEquality(Pred) || (GLHS->isInBounds() && GRHS->isInBounds() && Pred == ICmpInst::getSignedPredicate(Pred)))) { // The bases are equal and the indices are constant. Build a constant // expression GEP with the same indices and a null base pointer to see // what constant folding can make out of it. 
Constant *Null = Constant::getNullValue(GLHS->getPointerOperandType()); SmallVector IndicesLHS(GLHS->idx_begin(), GLHS->idx_end()); Constant *NewLHS = ConstantExpr::getGetElementPtr( GLHS->getSourceElementType(), Null, IndicesLHS); SmallVector IndicesRHS(GRHS->idx_begin(), GRHS->idx_end()); Constant *NewRHS = ConstantExpr::getGetElementPtr( GLHS->getSourceElementType(), Null, IndicesRHS); return ConstantExpr::getICmp(Pred, NewLHS, NewRHS); } } } // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } /// Given operands for an FCmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!"); if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } // Fold trivial predicates. Type *RetTy = GetCompareTy(LHS); if (Pred == FCmpInst::FCMP_FALSE) return getFalse(RetTy); if (Pred == FCmpInst::FCMP_TRUE) return getTrue(RetTy); // UNO/ORD predicates can be trivially folded if NaNs are ignored. if (FMF.noNaNs()) { if (Pred == FCmpInst::FCMP_UNO) return getFalse(RetTy); if (Pred == FCmpInst::FCMP_ORD) return getTrue(RetTy); } // NaN is unordered; NaN is not ordered. assert((FCmpInst::isOrdered(Pred) || FCmpInst::isUnordered(Pred)) && "Comparison must be either ordered or unordered"); if (match(RHS, m_NaN())) return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred)); // fcmp pred x, undef and fcmp pred undef, x // fold to true if unordered, false if ordered if (isa(LHS) || isa(RHS)) { // Choosing NaN for the undef will always make unordered comparison succeed // and ordered comparison fail. return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred)); } // fcmp x,x -> true/false. Not all compares are foldable. if (LHS == RHS) { if (CmpInst::isTrueWhenEqual(Pred)) return getTrue(RetTy); if (CmpInst::isFalseWhenEqual(Pred)) return getFalse(RetTy); } // Handle fcmp with constant RHS. const APFloat *C; if (match(RHS, m_APFloat(C))) { // Check whether the constant is an infinity. if (C->isInfinity()) { if (C->isNegative()) { switch (Pred) { case FCmpInst::FCMP_OLT: // No value is ordered and less than negative infinity. return getFalse(RetTy); case FCmpInst::FCMP_UGE: // All values are unordered with or at least negative infinity. return getTrue(RetTy); default: break; } } else { switch (Pred) { case FCmpInst::FCMP_OGT: // No value is ordered and greater than infinity. return getFalse(RetTy); case FCmpInst::FCMP_ULE: // All values are unordered with and at most infinity. 
return getTrue(RetTy); default: break; } } } if (C->isZero()) { switch (Pred) { case FCmpInst::FCMP_UGE: if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getTrue(RetTy); break; case FCmpInst::FCMP_OLT: // X < 0 if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getFalse(RetTy); break; default: break; } } else if (C->isNegative()) { assert(!C->isNaN() && "Unexpected NaN constant!"); // TODO: We can catch more cases by using a range check rather than // relying on CannotBeOrderedLessThanZero. switch (Pred) { case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_UGT: case FCmpInst::FCMP_UNE: // (X >= 0) implies (X > C) when (C < 0) if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getTrue(RetTy); break; case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_OLE: case FCmpInst::FCMP_OLT: // (X >= 0) implies !(X < C) when (C < 0) if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getFalse(RetTy); break; default: break; } } } // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } /// See if V simplifies when its operand Op is replaced with RepOp. static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, unsigned MaxRecurse) { // Trivial replacement. if (V == Op) return RepOp; // We cannot replace a constant, and shouldn't even try. if (isa(Op)) return nullptr; auto *I = dyn_cast(V); if (!I) return nullptr; // If this is a binary operator, try to simplify it with the replaced op. if (auto *B = dyn_cast(I)) { // Consider: // %cmp = icmp eq i32 %x, 2147483647 // %add = add nsw i32 %x, 1 // %sel = select i1 %cmp, i32 -2147483648, i32 %add // // We can't replace %sel with %add unless we strip away the flags. if (isa(B)) if (B->hasNoSignedWrap() || B->hasNoUnsignedWrap()) return nullptr; if (isa(B)) if (B->isExact()) return nullptr; if (MaxRecurse) { if (B->getOperand(0) == Op) return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), Q, MaxRecurse - 1); if (B->getOperand(1) == Op) return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, Q, MaxRecurse - 1); } } // Same for CmpInsts. if (CmpInst *C = dyn_cast(I)) { if (MaxRecurse) { if (C->getOperand(0) == Op) return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), Q, MaxRecurse - 1); if (C->getOperand(1) == Op) return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, Q, MaxRecurse - 1); } } // Same for GEPs. if (auto *GEP = dyn_cast(I)) { if (MaxRecurse) { SmallVector NewOps(GEP->getNumOperands()); transform(GEP->operands(), NewOps.begin(), [&](Value *V) { return V == Op ? RepOp : V; }); return SimplifyGEPInst(GEP->getSourceElementType(), NewOps, Q, MaxRecurse - 1); } } // TODO: We could hand off more cases to instsimplify here. // If all operands are constant after substituting Op for RepOp then we can // constant fold the instruction. if (Constant *CRepOp = dyn_cast(RepOp)) { // Build a list of all constant operands. 
SmallVector ConstOps; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { if (I->getOperand(i) == Op) ConstOps.push_back(CRepOp); else if (Constant *COp = dyn_cast(I->getOperand(i))) ConstOps.push_back(COp); else break; } // All operands were constants, fold it. if (ConstOps.size() == I->getNumOperands()) { if (CmpInst *C = dyn_cast(I)) return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], ConstOps[1], Q.DL, Q.TLI); if (LoadInst *LI = dyn_cast(I)) if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(ConstOps[0], LI->getType(), Q.DL); return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI); } } return nullptr; } /// Try to simplify a select instruction when its condition operand is an /// integer comparison where one operand of the compare is a constant. static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, const APInt *Y, bool TrueWhenUnset) { const APInt *C; // (X & Y) == 0 ? X & ~Y : X --> X // (X & Y) != 0 ? X & ~Y : X --> X & ~Y if (FalseVal == X && match(TrueVal, m_And(m_Specific(X), m_APInt(C))) && *Y == ~*C) return TrueWhenUnset ? FalseVal : TrueVal; // (X & Y) == 0 ? X : X & ~Y --> X & ~Y // (X & Y) != 0 ? X : X & ~Y --> X if (TrueVal == X && match(FalseVal, m_And(m_Specific(X), m_APInt(C))) && *Y == ~*C) return TrueWhenUnset ? FalseVal : TrueVal; if (Y->isPowerOf2()) { // (X & Y) == 0 ? X | Y : X --> X | Y // (X & Y) != 0 ? X | Y : X --> X if (FalseVal == X && match(TrueVal, m_Or(m_Specific(X), m_APInt(C))) && *Y == *C) return TrueWhenUnset ? TrueVal : FalseVal; // (X & Y) == 0 ? X : X | Y --> X // (X & Y) != 0 ? X : X | Y --> X | Y if (TrueVal == X && match(FalseVal, m_Or(m_Specific(X), m_APInt(C))) && *Y == *C) return TrueWhenUnset ? TrueVal : FalseVal; } return nullptr; } /// An alternative way to test if a bit is set or not uses sgt/slt instead of /// eq/ne. static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, ICmpInst::Predicate Pred, Value *TrueVal, Value *FalseVal) { Value *X; APInt Mask; if (!decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask)) return nullptr; return simplifySelectBitTest(TrueVal, FalseVal, X, &Mask, Pred == ICmpInst::ICMP_EQ); } /// Try to simplify a select instruction when its condition operand is an /// integer comparison. static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { ICmpInst::Predicate Pred; Value *CmpLHS, *CmpRHS; if (!match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) return nullptr; if (ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero())) { Value *X; const APInt *Y; if (match(CmpLHS, m_And(m_Value(X), m_APInt(Y)))) if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y, Pred == ICmpInst::ICMP_EQ)) return V; } // Check for other compares that behave like bit test. if (Value *V = simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred, TrueVal, FalseVal)) return V; // If we have an equality comparison, then we know the value in one of the // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. 
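// Illustrative example (added commentary, not from the original source):
//   %c = icmp eq i32 %x, 0
//   %s = select i1 %c, i32 %y, i32 %f    ; where %f = or i32 %x, %y
// Substituting 0 for %x in the false arm gives 'or i32 0, %y' == %y, which
// matches the true arm, so the whole select simplifies to %f. The checks
// below perform this substitution for both the true and the false arm.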
if (Pred == ICmpInst::ICMP_EQ) { if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == TrueVal) return FalseVal; if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return FalseVal; } else if (Pred == ICmpInst::ICMP_NE) { if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return TrueVal; if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == TrueVal) return TrueVal; } return nullptr; } /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *CondC = dyn_cast(Cond)) { if (auto *TrueC = dyn_cast(TrueVal)) if (auto *FalseC = dyn_cast(FalseVal)) return ConstantFoldSelectInstruction(CondC, TrueC, FalseC); // select undef, X, Y -> X or Y if (isa(CondC)) return isa(FalseVal) ? FalseVal : TrueVal; // TODO: Vector constants with undef elements don't simplify. // select true, X, Y -> X if (CondC->isAllOnesValue()) return TrueVal; // select false, X, Y -> Y if (CondC->isNullValue()) return FalseVal; } // select ?, X, X -> X if (TrueVal == FalseVal) return TrueVal; if (isa(TrueVal)) // select ?, undef, X -> X return FalseVal; if (isa(FalseVal)) // select ?, X, undef -> X return TrueVal; if (Value *V = simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse)) return V; if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal)) return V; return nullptr; } Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q) { return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit); } /// Given operands for an GetElementPtrInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q, unsigned) { // The type of the GEP pointer operand. unsigned AS = cast(Ops[0]->getType()->getScalarType())->getAddressSpace(); // getelementptr P -> P. if (Ops.size() == 1) return Ops[0]; // Compute the (pointer) type returned by the GEP instruction. Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1)); Type *GEPTy = PointerType::get(LastType, AS); if (VectorType *VT = dyn_cast(Ops[0]->getType())) GEPTy = VectorType::get(GEPTy, VT->getNumElements()); else if (VectorType *VT = dyn_cast(Ops[1]->getType())) GEPTy = VectorType::get(GEPTy, VT->getNumElements()); if (isa(Ops[0])) return UndefValue::get(GEPTy); if (Ops.size() == 2) { // getelementptr P, 0 -> P. if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy) return Ops[0]; Type *Ty = SrcTy; if (Ty->isSized()) { Value *P; uint64_t C; uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty); // getelementptr P, N -> P if P points to a type of zero size. if (TyAllocSize == 0 && Ops[0]->getType() == GEPTy) return Ops[0]; // The following transforms are only safe if the ptrtoint cast // doesn't truncate the pointers. 
if (Ops[1]->getType()->getScalarSizeInBits() == Q.DL.getIndexSizeInBits(AS)) { auto PtrToIntOrZero = [GEPTy](Value *P) -> Value * { if (match(P, m_Zero())) return Constant::getNullValue(GEPTy); Value *Temp; if (match(P, m_PtrToInt(m_Value(Temp)))) if (Temp->getType() == GEPTy) return Temp; return nullptr; }; // getelementptr V, (sub P, V) -> P if P points to a type of size 1. if (TyAllocSize == 1 && match(Ops[1], m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))))) if (Value *R = PtrToIntOrZero(P)) return R; // getelementptr V, (ashr (sub P, V), C) -> Q // if P points to a type of size 1 << C. if (match(Ops[1], m_AShr(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))), m_ConstantInt(C))) && TyAllocSize == 1ULL << C) if (Value *R = PtrToIntOrZero(P)) return R; // getelementptr V, (sdiv (sub P, V), C) -> Q // if P points to a type of size C. if (match(Ops[1], m_SDiv(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))), m_SpecificInt(TyAllocSize)))) if (Value *R = PtrToIntOrZero(P)) return R; } } } if (Q.DL.getTypeAllocSize(LastType) == 1 && all_of(Ops.slice(1).drop_back(1), [](Value *Idx) { return match(Idx, m_Zero()); })) { unsigned IdxWidth = Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace()); if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) { APInt BasePtrOffset(IdxWidth, 0); Value *StrippedBasePtr = Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL, BasePtrOffset); // gep (gep V, C), (sub 0, V) -> C if (match(Ops.back(), m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr))))) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset); return ConstantExpr::getIntToPtr(CI, GEPTy); } // gep (gep V, C), (xor V, -1) -> C-1 if (match(Ops.back(), m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes()))) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1); return ConstantExpr::getIntToPtr(CI, GEPTy); } } } // Check to see if this is constant foldable. if (!all_of(Ops, [](Value *V) { return isa(V); })) return nullptr; auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast(Ops[0]), Ops.slice(1)); if (auto *CEFolded = ConstantFoldConstant(CE, Q.DL)) return CEFolded; return CE; } Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q) { return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit); } /// Given operands for an InsertValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q, unsigned) { if (Constant *CAgg = dyn_cast(Agg)) if (Constant *CVal = dyn_cast(Val)) return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs); // insertvalue x, undef, n -> x if (match(Val, m_Undef())) return Agg; // insertvalue x, (extractvalue y, n), n if (ExtractValueInst *EV = dyn_cast(Val)) if (EV->getAggregateOperand()->getType() == Agg->getType() && EV->getIndices() == Idxs) { // insertvalue undef, (extractvalue y, n), n -> y if (match(Agg, m_Undef())) return EV->getAggregateOperand(); // insertvalue y, (extractvalue y, n), n -> y if (Agg == EV->getAggregateOperand()) return Agg; } return nullptr; } Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q) { return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit); } Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, const SimplifyQuery &Q) { // Try to constant fold. 
auto *VecC = dyn_cast(Vec); auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); // Fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { uint64_t NumElements = cast(Vec->getType())->getNumElements(); if (CI->uge(NumElements)) return UndefValue::get(Vec->getType()); } // If index is undef, it might be out of bounds (see above case) if (isa(Idx)) return UndefValue::get(Vec->getType()); return nullptr; } /// Given operands for an ExtractValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &, unsigned) { if (auto *CAgg = dyn_cast(Agg)) return ConstantFoldExtractValueInstruction(CAgg, Idxs); // extractvalue x, (insertvalue y, elt, n), n -> elt unsigned NumIdxs = Idxs.size(); for (auto *IVI = dyn_cast(Agg); IVI != nullptr; IVI = dyn_cast(IVI->getAggregateOperand())) { ArrayRef InsertValueIdxs = IVI->getIndices(); unsigned NumInsertValueIdxs = InsertValueIdxs.size(); unsigned NumCommonIdxs = std::min(NumInsertValueIdxs, NumIdxs); if (InsertValueIdxs.slice(0, NumCommonIdxs) == Idxs.slice(0, NumCommonIdxs)) { if (NumIdxs == NumInsertValueIdxs) return IVI->getInsertedValueOperand(); break; } } return nullptr; } Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &Q) { return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit); } /// Given operands for an ExtractElementInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &, unsigned) { if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) return ConstantFoldExtractElementInstruction(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) return Splat; if (isa(Vec)) return UndefValue::get(Vec->getType()->getVectorElementType()); } // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. if (auto *IdxC = dyn_cast(Idx)) { if (IdxC->getValue().uge(Vec->getType()->getVectorNumElements())) // definitely out of bounds, thus undefined result return UndefValue::get(Vec->getType()->getVectorElementType()); if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) return Elt; } // An undef extract index can be arbitrarily chosen to be an out-of-range // index value, which would result in the instruction being undef. if (isa(Idx)) return UndefValue::get(Vec->getType()->getVectorElementType()); return nullptr; } Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q) { return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit); } /// See if we can fold the given phi. If not, returns null. static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. Value *CommonValue = nullptr; bool HasUndefInput = false; for (Value *Incoming : PN->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PN) continue; if (isa(Incoming)) { // Remember that we saw an undef value, but otherwise ignore them. HasUndefInput = true; continue; } if (CommonValue && Incoming != CommonValue) return nullptr; // Not the same, bail out. 
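// Editorial note (illustrative example, not part of the upstream source):
// a node such as
//   %p = phi i32 [ %x, %bb1 ], [ undef, %bb2 ], [ %x, %bb3 ]
// collapses to %x here; because an undef input was seen, the code after the
// loop additionally requires that %x dominates the phi's block.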
CommonValue = Incoming; } // If CommonValue is null then all of the incoming values were either undef or // equal to the phi node itself. if (!CommonValue) return UndefValue::get(PN->getType()); // If we have a PHI node like phi(X, undef, X), where X is defined by some // instruction, we cannot return X as the result of the PHI node unless it // dominates the PHI block. if (HasUndefInput) return valueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : nullptr; return CommonValue; } static Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *C = dyn_cast(Op)) return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL); if (auto *CI = dyn_cast(Op)) { auto *Src = CI->getOperand(0); Type *SrcTy = Src->getType(); Type *MidTy = CI->getType(); Type *DstTy = Ty; if (Src->getType() == Ty) { auto FirstOp = static_cast(CI->getOpcode()); auto SecondOp = static_cast(CastOpc); Type *SrcIntPtrTy = SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr; Type *MidIntPtrTy = MidTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(MidTy) : nullptr; Type *DstIntPtrTy = DstTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(DstTy) : nullptr; if (CastInst::isEliminableCastPair(FirstOp, SecondOp, SrcTy, MidTy, DstTy, SrcIntPtrTy, MidIntPtrTy, DstIntPtrTy) == Instruction::BitCast) return Src; } } // bitcast x -> x if (CastOpc == Instruction::BitCast) if (Op->getType() == Ty) return Op; return nullptr; } Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q) { return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); } /// For the given destination element of a shuffle, peek through shuffles to /// match a root vector source operand that contains that element in the same /// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, int MaskVal, Value *RootVec, unsigned MaxRecurse) { if (!MaxRecurse--) return nullptr; // Bail out if any mask value is undefined. That kind of shuffle may be // simplified further based on demanded bits or other folds. if (MaskVal == -1) return nullptr; // The mask value chooses which source operand we need to look at next. int InVecNumElts = Op0->getType()->getVectorNumElements(); int RootElt = MaskVal; Value *SourceOp = Op0; if (MaskVal >= InVecNumElts) { RootElt = MaskVal - InVecNumElts; SourceOp = Op1; } // If the source operand is a shuffle itself, look through it to find the // matching root vector. if (auto *SourceShuf = dyn_cast(SourceOp)) { return foldIdentityShuffles( DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1), SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse); } // TODO: Look through bitcasts? What if the bitcast changes the vector element // size? // The source operand is not a shuffle. Initialize the root vector value for // this shuffle if that has not been done yet. if (!RootVec) RootVec = SourceOp; // Give up as soon as a source operand does not match the existing root value. if (RootVec != SourceOp) return nullptr; // The element must be coming from the same lane in the source vector // (although it may have crossed lanes in intermediate shuffles). 
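// Editorial note (illustrative example, not part of the upstream source):
// this recursion lets a widen-then-narrow pair cancel, e.g.
//   %w = shufflevector <2 x i32> %v, <2 x i32> undef,
//                      <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
//   %n = shufflevector <4 x i32> %w, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
// folds %n to %v, because every lane of %n traces back to the same lane of %v.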
if (RootElt != DestElt) return nullptr; return RootVec; } static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q, unsigned MaxRecurse) { if (isa(Mask)) return UndefValue::get(RetTy); Type *InVecTy = Op0->getType(); unsigned MaskNumElts = Mask->getType()->getVectorNumElements(); unsigned InVecNumElts = InVecTy->getVectorNumElements(); SmallVector Indices; ShuffleVectorInst::getShuffleMask(Mask, Indices); assert(MaskNumElts == Indices.size() && "Size of Indices not same as number of mask elements?"); // Canonicalization: If mask does not select elements from an input vector, // replace that input vector with undef. bool MaskSelects0 = false, MaskSelects1 = false; for (unsigned i = 0; i != MaskNumElts; ++i) { if (Indices[i] == -1) continue; if ((unsigned)Indices[i] < InVecNumElts) MaskSelects0 = true; else MaskSelects1 = true; } if (!MaskSelects0) Op0 = UndefValue::get(InVecTy); if (!MaskSelects1) Op1 = UndefValue::get(InVecTy); auto *Op0Const = dyn_cast(Op0); auto *Op1Const = dyn_cast(Op1); // If all operands are constant, constant fold the shuffle. if (Op0Const && Op1Const) return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. if (Op0Const && !Op1Const) { std::swap(Op0, Op1); ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts); } // A shuffle of a splat is always the splat itself. Legal if the shuffle's // value type is same as the input vectors' type. if (auto *OpShuf = dyn_cast(Op0)) if (isa(Op1) && RetTy == InVecTy && OpShuf->getMask()->getSplatValue()) return Op0; // Don't fold a shuffle with undef mask elements. This may get folded in a // better way using demanded bits or other analysis. // TODO: Should we allow this? if (find(Indices, -1) != Indices.end()) return nullptr; // Check if every element of this shuffle can be mapped back to the // corresponding element of a single root vector. If so, we don't need this // shuffle. This handles simple identity shuffles as well as chains of // shuffles that may widen/narrow and/or move elements across lanes and back. Value *RootVec = nullptr; for (unsigned i = 0; i != MaskNumElts; ++i) { // Note that recursion is limited for each vector element, so if any element // exceeds the limit, this will fail to simplify. RootVec = foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse); // We can't replace a widening/narrowing shuffle with one of its operands. if (!RootVec || RootVec->getType() != RetTy) return nullptr; } return RootVec; } /// Given operands for a ShuffleVectorInst, fold the result or return null. Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q) { return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit); } static Constant *propagateNaN(Constant *In) { // If the input is a vector with undef elements, just return a default NaN. if (!In->isNaN()) return ConstantFP::getNaN(In->getType()); // Propagate the existing NaN constant when possible. // TODO: Should we quiet a signaling NaN? return In; } static Constant *simplifyFPBinop(Value *Op0, Value *Op1) { if (isa(Op0) || isa(Op1)) return ConstantFP::getNaN(Op0->getType()); if (match(Op0, m_NaN())) return propagateNaN(cast(Op0)); if (match(Op1, m_NaN())) return propagateNaN(cast(Op1)); return nullptr; } /// Given operands for an FAdd, see if we can fold the result. If not, this /// returns null. 
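// Editorial note (illustrative examples, not part of the upstream source):
// via the NaN helpers above, any FP binop with an undef operand folds to NaN
// (e.g. fadd float %x, undef --> NaN), and the identity folds below give
// fadd float %x, -0.0 --> %x.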
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fadd X, -0 ==> X if (match(Op1, m_NegZeroFP())) return Op0; // fadd X, 0 ==> X, when we know X is not -0 if (match(Op1, m_PosZeroFP()) && (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; // With nnan: (+/-0.0 - X) + X --> 0.0 (and commuted variant) // We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN. // Negative zeros are allowed because we always end up with positive zero: // X = -0.0: (-0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = -0.0: ( 0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = 0.0: (-0.0 - ( 0.0)) + ( 0.0) == (-0.0) + ( 0.0) == 0.0 // X = 0.0: ( 0.0 - ( 0.0)) + ( 0.0) == ( 0.0) + ( 0.0) == 0.0 if (FMF.noNaNs() && (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) || match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0))))) return ConstantFP::getNullValue(Op0->getType()); return nullptr; } /// Given operands for an FSub, see if we can fold the result. If not, this /// returns null. static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fsub X, +0 ==> X if (match(Op1, m_PosZeroFP())) return Op0; // fsub X, -0 ==> X, when we know X is not -0 if (match(Op1, m_NegZeroFP()) && (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; // fsub -0.0, (fsub -0.0, X) ==> X Value *X; if (match(Op0, m_NegZeroFP()) && match(Op1, m_FSub(m_NegZeroFP(), m_Value(X)))) return X; // fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored. if (FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()) && match(Op1, m_FSub(m_AnyZeroFP(), m_Value(X)))) return X; // fsub nnan x, x ==> 0.0 if (FMF.noNaNs() && Op0 == Op1) return Constant::getNullValue(Op0->getType()); return nullptr; } /// Given the operands for an FMul, see if we can fold the result static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fmul X, 1.0 ==> X if (match(Op1, m_FPOne())) return Op0; // fmul nnan nsz X, 0 ==> 0 if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); // sqrt(X) * sqrt(X) --> X, if we can: // 1. Remove the intermediate rounding (reassociate). // 2. Ignore non-zero negative numbers because sqrt would produce NAN. // 3. Ignore -0.0 because sqrt(-0.0) == -0.0, but -0.0 * -0.0 == 0.0. 
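// Editorial note (illustrative example, not part of the upstream source):
//   %s = call float @llvm.sqrt.f32(float %x)
//   %m = fmul reassoc nnan nsz float %s, %s
// folds to %x under the three conditions above (the flags on the fmul supply
// reassoc, nnan and nsz).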
Value *X; if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X))) && FMF.allowReassoc() && FMF.noNaNs() && FMF.noSignedZeros()) return X; return nullptr; } Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit); } Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit); } Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit); } static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // X / 1.0 -> X if (match(Op1, m_FPOne())) return Op0; // 0 / X -> 0 // Requires that NaNs are off (X could be zero) and signed zeroes are // ignored (X could be positive or negative, so the output sign is unknown). if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); if (FMF.noNaNs()) { // X / X -> 1.0 is legal when NaNs are ignored. // We can ignore infinities because INF/INF is NaN. if (Op0 == Op1) return ConstantFP::get(Op0->getType(), 1.0); // (X * Y) / Y --> X if we can reassociate to the above form. Value *X; if (FMF.allowReassoc() && match(Op0, m_c_FMul(m_Value(X), m_Specific(Op1)))) return X; // -X / X -> -1.0 and // X / -X -> -1.0 are legal when NaNs are ignored. // We can ignore signed zeros because +-0.0/+-0.0 is NaN and ignored. if ((BinaryOperator::isFNeg(Op0, /*IgnoreZeroSign=*/true) && BinaryOperator::getFNegArgument(Op0) == Op1) || (BinaryOperator::isFNeg(Op1, /*IgnoreZeroSign=*/true) && BinaryOperator::getFNegArgument(Op1) == Op0)) return ConstantFP::get(Op0->getType(), -1.0); } return nullptr; } Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit); } static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // Unlike fdiv, the result of frem always matches the sign of the dividend. // The constant match may include undef elements in a vector, so return a full // zero constant as the result. if (FMF.noNaNs()) { // +0 % X -> 0 if (match(Op0, m_PosZeroFP())) return ConstantFP::getNullValue(Op0->getType()); // -0 % X -> -0 if (match(Op0, m_NegZeroFP())) return ConstantFP::getNegativeZero(Op0->getType()); } return nullptr; } Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit); } //=== Helper functions for higher up the class hierarchy. /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::Add: return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Sub: return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Mul: return SimplifyMulInst(LHS, RHS, Q, MaxRecurse); case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse); case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse); case Instruction::Shl: return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::LShr: return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::AShr: return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse); case Instruction::Or: return SimplifyOrInst(LHS, RHS, Q, MaxRecurse); case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse); case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FSub: return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FMul: return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); default: llvm_unreachable("Unexpected opcode"); } } /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FSub: return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FMul: return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse); default: return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse); } } Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); } Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); } /// Given operands for a CmpInst, see if we can fold the result. 
static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); } Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } static bool IsIdempotent(Intrinsic::ID ID) { switch (ID) { default: return false; // Unary idempotent: f(f(x)) = f(x) case Intrinsic::fabs: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: case Intrinsic::round: case Intrinsic::canonicalize: return true; } } static Value *SimplifyRelativeLoad(Constant *Ptr, Constant *Offset, const DataLayout &DL) { GlobalValue *PtrSym; APInt PtrOffset; if (!IsConstantOffsetFromGlobal(Ptr, PtrSym, PtrOffset, DL)) return nullptr; Type *Int8PtrTy = Type::getInt8PtrTy(Ptr->getContext()); Type *Int32Ty = Type::getInt32Ty(Ptr->getContext()); Type *Int32PtrTy = Int32Ty->getPointerTo(); Type *Int64Ty = Type::getInt64Ty(Ptr->getContext()); auto *OffsetConstInt = dyn_cast(Offset); if (!OffsetConstInt || OffsetConstInt->getType()->getBitWidth() > 64) return nullptr; uint64_t OffsetInt = OffsetConstInt->getSExtValue(); if (OffsetInt % 4 != 0) return nullptr; Constant *C = ConstantExpr::getGetElementPtr( Int32Ty, ConstantExpr::getBitCast(Ptr, Int32PtrTy), ConstantInt::get(Int64Ty, OffsetInt / 4)); Constant *Loaded = ConstantFoldLoadFromConstPtr(C, Int32Ty, DL); if (!Loaded) return nullptr; auto *LoadedCE = dyn_cast(Loaded); if (!LoadedCE) return nullptr; if (LoadedCE->getOpcode() == Instruction::Trunc) { LoadedCE = dyn_cast(LoadedCE->getOperand(0)); if (!LoadedCE) return nullptr; } if (LoadedCE->getOpcode() != Instruction::Sub) return nullptr; auto *LoadedLHS = dyn_cast(LoadedCE->getOperand(0)); if (!LoadedLHS || LoadedLHS->getOpcode() != Instruction::PtrToInt) return nullptr; auto *LoadedLHSPtr = LoadedLHS->getOperand(0); Constant *LoadedRHS = LoadedCE->getOperand(1); GlobalValue *LoadedRHSSym; APInt LoadedRHSOffset; if (!IsConstantOffsetFromGlobal(LoadedRHS, LoadedRHSSym, LoadedRHSOffset, DL) || PtrSym != LoadedRHSSym || PtrOffset != LoadedRHSOffset) return nullptr; return ConstantExpr::getBitCast(LoadedLHSPtr, Int8PtrTy); } static bool maskIsAllZeroOrUndef(Value *Mask) { auto *ConstMask = dyn_cast(Mask); if (!ConstMask) return false; if (ConstMask->isNullValue() || isa(ConstMask)) return true; for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E; ++I) { if (auto *MaskElt = ConstMask->getAggregateElement(I)) if (MaskElt->isNullValue() || isa(MaskElt)) continue; return false; } return true; } static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, const SimplifyQuery &Q) { // Idempotent functions return the same result when called repeatedly. 
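// Editorial note (illustrative example, not part of the upstream source):
//   %a = call float @llvm.fabs.f32(float %x)
//   %b = call float @llvm.fabs.f32(float %a)
// folds %b to %a, since a second application of an idempotent intrinsic
// cannot change the result.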
Intrinsic::ID IID = F->getIntrinsicID(); if (IsIdempotent(IID)) if (auto *II = dyn_cast(Op0)) if (II->getIntrinsicID() == IID) return II; Value *X; switch (IID) { case Intrinsic::fabs: if (SignBitMustBeZero(Op0, Q.TLI)) return Op0; break; case Intrinsic::bswap: // bswap(bswap(x)) -> x if (match(Op0, m_BSwap(m_Value(X)))) return X; break; case Intrinsic::bitreverse: // bitreverse(bitreverse(x)) -> x if (match(Op0, m_BitReverse(m_Value(X)))) return X; break; case Intrinsic::exp: // exp(log(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::exp2: // exp2(log2(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::log: // log(exp(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::log2: // log2(exp2(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; default: break; } return nullptr; } static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, const SimplifyQuery &Q) { Intrinsic::ID IID = F->getIntrinsicID(); Type *ReturnType = F->getReturnType(); switch (IID) { case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: // X - X -> { 0, false } if (Op0 == Op1) return Constant::getNullValue(ReturnType); // X - undef -> undef // undef - X -> undef if (isa(Op0) || isa(Op1)) return UndefValue::get(ReturnType); break; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: // X + undef -> undef if (isa(Op0) || isa(Op1)) return UndefValue::get(ReturnType); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: // 0 * X -> { 0, false } // X * 0 -> { 0, false } if (match(Op0, m_Zero()) || match(Op1, m_Zero())) return Constant::getNullValue(ReturnType); // undef * X -> { 0, false } // X * undef -> { 0, false } if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return Constant::getNullValue(ReturnType); break; case Intrinsic::load_relative: if (auto *C0 = dyn_cast(Op0)) if (auto *C1 = dyn_cast(Op1)) return SimplifyRelativeLoad(C0, C1, Q.DL); break; case Intrinsic::powi: if (auto *Power = dyn_cast(Op1)) { // powi(x, 0) -> 1.0 if (Power->isZero()) return ConstantFP::get(Op0->getType(), 1.0); // powi(x, 1) -> x if (Power->isOne()) return Op0; } break; case Intrinsic::maxnum: case Intrinsic::minnum: // If one argument is NaN, return the other argument. if (match(Op0, m_NaN())) return Op1; if (match(Op1, m_NaN())) return Op0; break; default: break; } return nullptr; } template static Value *simplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, const SimplifyQuery &Q) { // Intrinsics with no operands have some kind of side effect. Don't simplify. unsigned NumOperands = std::distance(ArgBegin, ArgEnd); if (NumOperands == 0) return nullptr; Intrinsic::ID IID = F->getIntrinsicID(); if (NumOperands == 1) return simplifyUnaryIntrinsic(F, ArgBegin[0], Q); if (NumOperands == 2) return simplifyBinaryIntrinsic(F, ArgBegin[0], ArgBegin[1], Q); // Handle intrinsics with 3 or more arguments. switch (IID) { case Intrinsic::masked_load: { Value *MaskArg = ArgBegin[2]; Value *PassthruArg = ArgBegin[3]; // If the mask is all zeros or undef, the "passthru" argument is the result. 
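// Editorial note (illustrative examples, not part of the upstream source;
// value names are hypothetical):
//   call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %p, i32 4,
//        <4 x i1> zeroinitializer, <4 x i32> %t)        --> %t
//   call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 32)     --> %a
// The first is the all-zero-mask case handled just below; the second is the
// funnel-shift case below, where a shift amount that is a multiple of the
// bit width leaves the first operand unchanged.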
if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; return nullptr; } case Intrinsic::fshl: case Intrinsic::fshr: { Value *ShAmtArg = ArgBegin[2]; const APInt *ShAmtC; if (match(ShAmtArg, m_APInt(ShAmtC))) { // If there's effectively no shift, return the 1st arg or 2nd arg. // TODO: For vectors, we could check each element of a non-splat constant. APInt BitWidth = APInt(ShAmtC->getBitWidth(), ShAmtC->getBitWidth()); if (ShAmtC->urem(BitWidth).isNullValue()) return ArgBegin[IID == Intrinsic::fshl ? 0 : 1]; } return nullptr; } default: return nullptr; } } template static Value *SimplifyCall(ImmutableCallSite CS, Value *V, IterTy ArgBegin, IterTy ArgEnd, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *Ty = V->getType(); if (PointerType *PTy = dyn_cast(Ty)) Ty = PTy->getElementType(); FunctionType *FTy = cast(Ty); // call undef -> undef // call null -> undef if (isa(V) || isa(V)) return UndefValue::get(FTy->getReturnType()); Function *F = dyn_cast(V); if (!F) return nullptr; if (F->isIntrinsic()) if (Value *Ret = simplifyIntrinsic(F, ArgBegin, ArgEnd, Q)) return Ret; if (!canConstantFoldCallTo(CS, F)) return nullptr; SmallVector ConstantArgs; ConstantArgs.reserve(ArgEnd - ArgBegin); for (IterTy I = ArgBegin, E = ArgEnd; I != E; ++I) { Constant *C = dyn_cast(*I); if (!C) return nullptr; ConstantArgs.push_back(C); } return ConstantFoldCall(CS, F, ConstantArgs, Q.TLI); } Value *llvm::SimplifyCall(ImmutableCallSite CS, Value *V, User::op_iterator ArgBegin, User::op_iterator ArgEnd, const SimplifyQuery &Q) { return ::SimplifyCall(CS, V, ArgBegin, ArgEnd, Q, RecursionLimit); } Value *llvm::SimplifyCall(ImmutableCallSite CS, Value *V, ArrayRef Args, const SimplifyQuery &Q) { return ::SimplifyCall(CS, V, Args.begin(), Args.end(), Q, RecursionLimit); } Value *llvm::SimplifyCall(ImmutableCallSite ICS, const SimplifyQuery &Q) { CallSite CS(const_cast(ICS.getInstruction())); return ::SimplifyCall(CS, CS.getCalledValue(), CS.arg_begin(), CS.arg_end(), Q, RecursionLimit); } /// See if we can compute a simplified version of this instruction. /// If not, this returns null. Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, OptimizationRemarkEmitter *ORE) { const SimplifyQuery Q = SQ.CxtI ? 
SQ : SQ.getWithInstruction(I); Value *Result; switch (I->getOpcode()) { default: Result = ConstantFoldInstruction(I, Q.DL, Q.TLI); break; case Instruction::FAdd: Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Add: Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::FSub: Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Sub: Result = SimplifySubInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::FMul: Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Mul: Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::SDiv: Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::UDiv: Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FDiv: Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::SRem: Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::URem: Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FRem: Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Shl: Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::LShr: Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1), cast(I)->isExact(), Q); break; case Instruction::AShr: Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1), cast(I)->isExact(), Q); break; case Instruction::And: Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Or: Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Xor: Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::ICmp: Result = SimplifyICmpInst(cast(I)->getPredicate(), I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FCmp: Result = SimplifyFCmpInst(cast(I)->getPredicate(), I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Select: Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1), I->getOperand(2), Q); break; case Instruction::GetElementPtr: { SmallVector Ops(I->op_begin(), I->op_end()); Result = SimplifyGEPInst(cast(I)->getSourceElementType(), Ops, Q); break; } case Instruction::InsertValue: { InsertValueInst *IV = cast(I); Result = SimplifyInsertValueInst(IV->getAggregateOperand(), IV->getInsertedValueOperand(), IV->getIndices(), Q); break; } case Instruction::InsertElement: { auto *IE = cast(I); Result = SimplifyInsertElementInst(IE->getOperand(0), IE->getOperand(1), IE->getOperand(2), Q); break; } case Instruction::ExtractValue: { auto *EVI = cast(I); Result = SimplifyExtractValueInst(EVI->getAggregateOperand(), EVI->getIndices(), Q); break; } case Instruction::ExtractElement: { auto *EEI = cast(I); Result = SimplifyExtractElementInst(EEI->getVectorOperand(), EEI->getIndexOperand(), Q); break; } case Instruction::ShuffleVector: { auto *SVI = cast(I); Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), SVI->getMask(), SVI->getType(), 
Q); break; } case Instruction::PHI: Result = SimplifyPHINode(cast(I), Q); break; case Instruction::Call: { CallSite CS(cast(I)); Result = SimplifyCall(CS, Q); break; } #define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc: #include "llvm/IR/Instruction.def" #undef HANDLE_CAST_INST Result = SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), Q); break; case Instruction::Alloca: // No simplifications for Alloca and it can't be constant folded. Result = nullptr; break; } // In general, it is possible for computeKnownBits to determine all bits in a // value even when the operands are not all constants. if (!Result && I->getType()->isIntOrIntVectorTy()) { KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); if (Known.isConstant()) Result = ConstantInt::get(I->getType(), Known.getConstant()); } /// If called on unreachable code, the above logic may report that the /// instruction simplified to itself. Make life easier for users by /// detecting that case here, returning a safe value instead. return Result == I ? UndefValue::get(I->getType()) : Result; } /// Implementation of recursive simplification through an instruction's /// uses. /// /// This is the common implementation of the recursive simplification routines. /// If we have a pre-simplified value in 'SimpleV', that is forcibly used to /// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of /// instructions to process and attempt to simplify it using /// InstructionSimplify. /// /// This routine returns 'true' only when *it* simplifies something. The passed /// in simplified value does not count toward this. static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { bool Simplified = false; SmallSetVector Worklist; const DataLayout &DL = I->getModule()->getDataLayout(); // If we have an explicit value to collapse to, do that round of the // simplification loop by hand initially. if (SimpleV) { for (User *U : I->users()) if (U != I) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // Gracefully handle edge cases where the instruction is not wired into any // parent block. if (I->getParent() && !I->isEHPad() && !isa(I) && !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); } // Note that we must test the size on each iteration, the worklist can grow. for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { I = Worklist[Idx]; // See if this instruction simplifies. SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC}); if (!SimpleV) continue; Simplified = true; // Stash away all the uses of the old instruction so we can check them for // recursive simplifications after a RAUW. This is cheaper than checking all // uses of To on the recursive step in most cases. for (User *U : I->users()) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // Gracefully handle edge cases where the instruction is not wired into any // parent block. 
if (I->getParent() && !I->isEHPad() && !isa(I) && !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; } bool llvm::recursivelySimplifyInstruction(Instruction *I, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC); } bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!"); assert(SimpleV && "Must provide a simplified value."); return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC); } namespace llvm { const SimplifyQuery getBestSimplifyQuery(Pass &P, Function &F) { auto *DTWP = P.getAnalysisIfAvailable(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *TLIWP = P.getAnalysisIfAvailable(); auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr; auto *ACWP = P.getAnalysisIfAvailable(); auto *AC = ACWP ? &ACWP->getAssumptionCache(F) : nullptr; return {F.getParent()->getDataLayout(), TLI, DT, AC}; } const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &AR, const DataLayout &DL) { return {DL, &AR.TLI, &AR.DT, &AR.AC}; } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &AM, Function &F) { auto *DT = AM.template getCachedResult(F); auto *TLI = AM.template getCachedResult(F); auto *AC = AM.template getCachedResult(F); return {F.getParent()->getDataLayout(), TLI, DT, AC}; } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, Function &); } Index: vendor/llvm/dist-release_70/lib/Analysis/ValueTracking.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Analysis/ValueTracking.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/Analysis/ValueTracking.cpp (revision 337631) @@ -1,5132 +1,5135 @@ //===- ValueTracking.cpp - Walk computations to compute properties --------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains routines that help analyze properties that chains of // computations have. 
// //===----------------------------------------------------------------------===// #include "llvm/Analysis/ValueTracking.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include #include #include #include #include #include using namespace llvm; using namespace llvm::PatternMatch; const unsigned MaxDepth = 6; // Controls the number of uses of the value searched for possible // dominating comparisons. static cl::opt DomConditionsMaxUses("dom-conditions-max-uses", cl::Hidden, cl::init(20)); /// Returns the bitwidth of the given scalar or pointer type. For vector types, /// returns the element type's bitwidth. static unsigned getBitWidth(Type *Ty, const DataLayout &DL) { if (unsigned BitWidth = Ty->getScalarSizeInBits()) return BitWidth; return DL.getIndexTypeSizeInBits(Ty); } namespace { // Simplifying using an assume can only be done in a particular control-flow // context (the context instruction provides that context). If an assume and // the context instruction are not in the same block then the DT helps in // figuring out if we can use it. struct Query { const DataLayout &DL; AssumptionCache *AC; const Instruction *CxtI; const DominatorTree *DT; // Unlike the other analyses, this may be a nullptr because not all clients // provide it currently. OptimizationRemarkEmitter *ORE; /// Set of assumptions that should be excluded from further queries. /// This is because of the potential for mutual recursion to cause /// computeKnownBits to repeatedly visit the same assume intrinsic. The /// classic case of this is assume(x = y), which will attempt to determine /// bits in x from bits in y, which will attempt to determine bits in y from /// bits in x, etc. 
Regarding the mutual recursion, computeKnownBits can call /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo /// (all of which can call computeKnownBits), and so on. std::array Excluded; unsigned NumExcluded = 0; Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr) : DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE) {} Query(const Query &Q, const Value *NewExcl) : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE), NumExcluded(Q.NumExcluded) { Excluded = Q.Excluded; Excluded[NumExcluded++] = NewExcl; assert(NumExcluded <= Excluded.size()); } bool isExcluded(const Value *Value) const { if (NumExcluded == 0) return false; auto End = Excluded.begin() + NumExcluded; return std::find(Excluded.begin(), End, Value) != End; } }; } // end anonymous namespace // Given the provided Value and, potentially, a context instruction, return // the preferred context instruction (if any). static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) { // If we've been provided with a context instruction, then use that (provided // it has been inserted). if (CxtI && CxtI->getParent()) return CxtI; // If the value is really an already-inserted instruction, then use that. CxtI = dyn_cast(V); if (CxtI && CxtI->getParent()) return CxtI; return nullptr; } static void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q); void llvm::computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE) { ::computeKnownBits(V, Known, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } static KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q); KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE) { return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { assert(LHS->getType() == RHS->getType() && "LHS and RHS should have the same type"); assert(LHS->getType()->isIntOrIntVectorTy() && "LHS and RHS should be integers"); // Look for an inverted mask: (X & ~M) op (Y & M). 
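// Editorial note (illustrative example, not part of the upstream source):
//   %nm = xor i8 %m, -1
//   %a  = and i8 %x, %nm
//   %b  = and i8 %y, %m
// %a and %b can never have a set bit in common, so this returns true and a
// caller may, for instance, rewrite add i8 %a, %b as or i8 %a, %b.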
  Value *M;
  if (match(LHS, m_c_And(m_Not(m_Value(M)), m_Value())) &&
      match(RHS, m_c_And(m_Specific(M), m_Value())))
    return true;
  if (match(RHS, m_c_And(m_Not(m_Value(M)), m_Value())) &&
      match(LHS, m_c_And(m_Specific(M), m_Value())))
    return true;
  IntegerType *IT = cast<IntegerType>(LHS->getType()->getScalarType());
  KnownBits LHSKnown(IT->getBitWidth());
  KnownBits RHSKnown(IT->getBitWidth());
  computeKnownBits(LHS, LHSKnown, DL, 0, AC, CxtI, DT);
  computeKnownBits(RHS, RHSKnown, DL, 0, AC, CxtI, DT);
  return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue();
}

bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
  for (const User *U : CxtI->users()) {
    if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
      if (IC->isEquality())
        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
          if (C->isNullValue())
            continue;
    return false;
  }
  return true;
}

static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
                                   const Query &Q);

bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
                                  bool OrZero, unsigned Depth,
                                  AssumptionCache *AC, const Instruction *CxtI,
                                  const DominatorTree *DT) {
  return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
                                  Query(DL, AC, safeCxtI(V, CxtI), DT));
}

static bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q);

bool llvm::isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth,
                          AssumptionCache *AC, const Instruction *CxtI,
                          const DominatorTree *DT) {
  return ::isKnownNonZero(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
}

bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
                              unsigned Depth, AssumptionCache *AC,
                              const Instruction *CxtI,
                              const DominatorTree *DT) {
  KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
  return Known.isNonNegative();
}

bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
                           AssumptionCache *AC, const Instruction *CxtI,
                           const DominatorTree *DT) {
  if (auto *CI = dyn_cast<ConstantInt>(V))
    return CI->getValue().isStrictlyPositive();

  // TODO: We're doing two recursive queries here.  We should factor this such
  // that only a single query is needed.
return isKnownNonNegative(V, DL, Depth, AC, CxtI, DT) && isKnownNonZero(V, DL, Depth, AC, CxtI, DT); } bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); return Known.isNegative(); } static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q); bool llvm::isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownNonEqual(V1, V2, Query(DL, AC, safeCxtI(V1, safeCxtI(V2, CxtI)), DT)); } static bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth, const Query &Q); bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::MaskedValueIsZero(V, Mask, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q); unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::ComputeNumSignBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, bool NSW, KnownBits &KnownOut, KnownBits &Known2, unsigned Depth, const Query &Q) { unsigned BitWidth = KnownOut.getBitWidth(); // If an initial sequence of bits in the result is not needed, the // corresponding bits in the operands are not needed. KnownBits LHSKnown(BitWidth); computeKnownBits(Op0, LHSKnown, Depth + 1, Q); computeKnownBits(Op1, Known2, Depth + 1, Q); KnownOut = KnownBits::computeForAddSub(Add, NSW, LHSKnown, Known2); } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, KnownBits &Known, KnownBits &Known2, unsigned Depth, const Query &Q) { unsigned BitWidth = Known.getBitWidth(); computeKnownBits(Op1, Known, Depth + 1, Q); computeKnownBits(Op0, Known2, Depth + 1, Q); bool isKnownNegative = false; bool isKnownNonNegative = false; // If the multiplication is known not to overflow, compute the sign bit. if (NSW) { if (Op0 == Op1) { // The product of a number with itself is non-negative. isKnownNonNegative = true; } else { bool isKnownNonNegativeOp1 = Known.isNonNegative(); bool isKnownNonNegativeOp0 = Known2.isNonNegative(); bool isKnownNegativeOp1 = Known.isNegative(); bool isKnownNegativeOp0 = Known2.isNegative(); // The product of two numbers with the same sign is non-negative. isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) || (isKnownNonNegativeOp1 && isKnownNonNegativeOp0); // The product of a negative number and a non-negative number is either // negative or zero. if (!isKnownNonNegative) isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 && isKnownNonZero(Op0, Depth, Q)) || (isKnownNegativeOp0 && isKnownNonNegativeOp1 && isKnownNonZero(Op1, Depth, Q)); } } assert(!Known.hasConflict() && !Known2.hasConflict()); // Compute a conservative estimate for high known-0 bits. unsigned LeadZ = std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; LeadZ = std::min(LeadZ, BitWidth); // The result of the bottom bits of an integer multiply can be // inferred by looking at the bottom bits of both operands and // multiplying them together. 
// We can infer at least the minimum number of known trailing bits // of both operands. Depending on number of trailing zeros, we can // infer more bits, because (a*b) <=> ((a/m) * (b/n)) * (m*n) assuming // a and b are divisible by m and n respectively. // We then calculate how many of those bits are inferrable and set // the output. For example, the i8 mul: // a = XXXX1100 (12) // b = XXXX1110 (14) // We know the bottom 3 bits are zero since the first can be divided by // 4 and the second by 2, thus having ((12/4) * (14/2)) * (2*4). // Applying the multiplication to the trimmed arguments gets: // XX11 (3) // X111 (7) // ------- // XX11 // XX11 // XX11 // XX11 // ------- // XXXXX01 // Which allows us to infer the 2 LSBs. Since we're multiplying the result // by 8, the bottom 3 bits will be 0, so we can infer a total of 5 bits. // The proof for this can be described as: // Pre: (C1 >= 0) && (C1 < (1 << C5)) && (C2 >= 0) && (C2 < (1 << C6)) && // (C7 == (1 << (umin(countTrailingZeros(C1), C5) + // umin(countTrailingZeros(C2), C6) + // umin(C5 - umin(countTrailingZeros(C1), C5), // C6 - umin(countTrailingZeros(C2), C6)))) - 1) // %aa = shl i8 %a, C5 // %bb = shl i8 %b, C6 // %aaa = or i8 %aa, C1 // %bbb = or i8 %bb, C2 // %mul = mul i8 %aaa, %bbb // %mask = and i8 %mul, C7 // => // %mask = i8 ((C1*C2)&C7) // Where C5, C6 describe the known bits of %a, %b // C1, C2 describe the known bottom bits of %a, %b. // C7 describes the mask of the known bits of the result. APInt Bottom0 = Known.One; APInt Bottom1 = Known2.One; // How many times we'd be able to divide each argument by 2 (shr by 1). // This gives us the number of trailing zeros on the multiplication result. unsigned TrailBitsKnown0 = (Known.Zero | Known.One).countTrailingOnes(); unsigned TrailBitsKnown1 = (Known2.Zero | Known2.One).countTrailingOnes(); unsigned TrailZero0 = Known.countMinTrailingZeros(); unsigned TrailZero1 = Known2.countMinTrailingZeros(); unsigned TrailZ = TrailZero0 + TrailZero1; // Figure out the fewest known-bits operand. unsigned SmallestOperand = std::min(TrailBitsKnown0 - TrailZero0, TrailBitsKnown1 - TrailZero1); unsigned ResultBitsKnown = std::min(SmallestOperand + TrailZ, BitWidth); APInt BottomKnown = Bottom0.getLoBits(TrailBitsKnown0) * Bottom1.getLoBits(TrailBitsKnown1); Known.resetAll(); Known.Zero.setHighBits(LeadZ); Known.Zero |= (~BottomKnown).getLoBits(ResultBitsKnown); Known.One |= BottomKnown.getLoBits(ResultBitsKnown); // Only make use of no-wrap flags if we failed to compute the sign bit // directly. This matters if the multiplication always overflows, in // which case we prefer to follow the result of the direct computation, // though as the program is invoking undefined behaviour we can choose // whatever we like here. if (isKnownNonNegative && !Known.isNegative()) Known.makeNonNegative(); else if (isKnownNegative && !Known.isNonNegative()) Known.makeNegative(); } void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, KnownBits &Known) { unsigned BitWidth = Known.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract(Ranges.getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract(Ranges.getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); // The first CommonPrefixBits of all values in Range are equal. 
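// Editorial note (illustrative example, not part of the upstream source):
// for !range !{i32 64, i32 68} the possible values are 64..67; 64 xor 67 == 3
// has 30 leading zeros in i32, so the top 30 bits are common to all values
// and the loaded value is known to look like 0b10000xx.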
unsigned CommonPrefixBits = (Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros(); APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits); Known.One &= Range.getUnsignedMax() & Mask; Known.Zero &= ~Range.getUnsignedMax() & Mask; } } static bool isEphemeralValueOf(const Instruction *I, const Value *E) { SmallVector WorkSet(1, I); SmallPtrSet Visited; SmallPtrSet EphValues; // The instruction defining an assumption's condition itself is always // considered ephemeral to that assumption (even if it has other // non-ephemeral users). See r246696's test case for an example. if (is_contained(I->operands(), E)) return true; while (!WorkSet.empty()) { const Value *V = WorkSet.pop_back_val(); if (!Visited.insert(V).second) continue; // If all uses of this value are ephemeral, then so is this value. if (llvm::all_of(V->users(), [&](const User *U) { return EphValues.count(U); })) { if (V == E) return true; if (V == I || isSafeToSpeculativelyExecute(V)) { EphValues.insert(V); if (const User *U = dyn_cast(V)) for (User::const_op_iterator J = U->op_begin(), JE = U->op_end(); J != JE; ++J) WorkSet.push_back(*J); } } } return false; } // Is this an intrinsic that cannot be speculated but also cannot trap? bool llvm::isAssumeLikeIntrinsic(const Instruction *I) { if (const CallInst *CI = dyn_cast(I)) if (Function *F = CI->getCalledFunction()) switch (F->getIntrinsicID()) { default: break; // FIXME: This list is repeated from NoTTI::getIntrinsicCost. case Intrinsic::assume: case Intrinsic::sideeffect: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: case Intrinsic::dbg_label: case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::objectsize: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: return true; } return false; } bool llvm::isValidAssumeForContext(const Instruction *Inv, const Instruction *CxtI, const DominatorTree *DT) { // There are two restrictions on the use of an assume: // 1. The assume must dominate the context (or the control flow must // reach the assume whenever it reaches the context). // 2. The context must not be in the assume's set of ephemeral values // (otherwise we will use the assume to prove that the condition // feeding the assume is trivially true, thus causing the removal of // the assume). if (DT) { if (DT->dominates(Inv, CxtI)) return true; } else if (Inv->getParent() == CxtI->getParent()->getSinglePredecessor()) { // We don't have a DT, but this trivially dominates. return true; } // With or without a DT, the only remaining case we will check is if the // instructions are in the same BB. Give up if that is not the case. if (Inv->getParent() != CxtI->getParent()) return false; // If we have a dom tree, then we now know that the assume doesn't dominate // the other instruction. If we don't have a dom tree then we can check if // the assume is first in the BB. if (!DT) { // Search forward from the assume until we reach the context (or the end // of the block); the common case is that the assume will come first. for (auto I = std::next(BasicBlock::const_iterator(Inv)), IE = Inv->getParent()->end(); I != IE; ++I) if (&*I == CxtI) return true; } // The context comes first, but they're both in the same block. Make sure // there is nothing in between that might interrupt the control flow. 
for (BasicBlock::const_iterator I = std::next(BasicBlock::const_iterator(CxtI)), IE(Inv); I != IE; ++I) if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I)) return false; return !isEphemeralValueOf(Inv, CxtI); } static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q) { // Use of assumptions is context-sensitive. If we don't have a context, we // cannot use them! if (!Q.AC || !Q.CxtI) return; unsigned BitWidth = Known.getBitWidth(); // Note that the patterns below need to be kept in sync with the code // in AssumptionCache::updateAffectedValues. for (auto &AssumeVH : Q.AC->assumptionsFor(V)) { if (!AssumeVH) continue; CallInst *I = cast(AssumeVH); assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() && "Got assumption for the wrong function!"); if (Q.isExcluded(I)) continue; // Warning: This loop can end up being somewhat performance sensitive. // We're running this loop for once for each value queried resulting in a // runtime of ~O(#assumes * #values). assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume && "must be an assume intrinsic"); Value *Arg = I->getArgOperand(0); if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); Known.setAllOnes(); return; } if (match(Arg, m_Not(m_Specific(V))) && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); Known.setAllZero(); return; } // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth == MaxDepth) continue; Value *A, *B; auto m_V = m_CombineOr(m_Specific(V), m_CombineOr(m_PtrToInt(m_Specific(V)), m_BitCast(m_Specific(V)))); CmpInst::Predicate Pred; uint64_t C; // assume(v = a) if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); Known.Zero |= RHSKnown.Zero; Known.One |= RHSKnown.One; // assume(v & b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits MaskKnown(BitWidth); computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // known bits from the RHS to V. Known.Zero |= RHSKnown.Zero & MaskKnown.One; Known.One |= RHSKnown.One & MaskKnown.One; // assume(~(v & b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits MaskKnown(BitWidth); computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // inverted known bits from the RHS to V. 
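      // Worked example (illustrative, not from the original source): on i8,
      // assume(~(%v & 0xF0) == 0xFF) implies %v & 0xF0 == 0x00.  RHSKnown.One
      // is 0xFF and MaskKnown.One is 0xF0, so the two lines below add 0xF0 to
      // Known.Zero and nothing to Known.One: bits 7..4 of %v are zero.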
Known.Zero |= RHSKnown.One & MaskKnown.One; Known.One |= RHSKnown.Zero & MaskKnown.One; // assume(v | b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. Known.Zero |= RHSKnown.Zero & BKnown.Zero; Known.One |= RHSKnown.One & BKnown.Zero; // assume(~(v | b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. Known.Zero |= RHSKnown.One & BKnown.Zero; Known.One |= RHSKnown.Zero & BKnown.Zero; // assume(v ^ b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. For those bits in B that are known to be one, // we can propagate inverted known bits from the RHS to V. Known.Zero |= RHSKnown.Zero & BKnown.Zero; Known.One |= RHSKnown.One & BKnown.Zero; Known.Zero |= RHSKnown.One & BKnown.One; Known.One |= RHSKnown.Zero & BKnown.One; // assume(~(v ^ b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. For those bits in B that are // known to be one, we can propagate known bits from the RHS to V. Known.Zero |= RHSKnown.One & BKnown.Zero; Known.One |= RHSKnown.Zero & BKnown.Zero; Known.Zero |= RHSKnown.Zero & BKnown.One; Known.One |= RHSKnown.One & BKnown.One; // assume(v << c = a) } else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. RHSKnown.Zero.lshrInPlace(C); Known.Zero |= RHSKnown.Zero; RHSKnown.One.lshrInPlace(C); Known.One |= RHSKnown.One; // assume(~(v << c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. 
RHSKnown.One.lshrInPlace(C); Known.Zero |= RHSKnown.One; RHSKnown.Zero.lshrInPlace(C); Known.One |= RHSKnown.Zero; // assume(v >> c = a) } else if (match(Arg, m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. Known.Zero |= RHSKnown.Zero << C; Known.One |= RHSKnown.One << C; // assume(~(v >> c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. Known.Zero |= RHSKnown.One << C; Known.One |= RHSKnown.Zero << C; // assume(v >=_s c) where c is non-negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isNonNegative()) { // We know that the sign bit is zero. Known.makeNonNegative(); } // assume(v >_s c) where c is at least -1. } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isAllOnes() || RHSKnown.isNonNegative()) { // We know that the sign bit is zero. Known.makeNonNegative(); } // assume(v <=_s c) where c is negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isNegative()) { // We know that the sign bit is one. Known.makeNegative(); } // assume(v <_s c) where c is non-positive } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isZero() || RHSKnown.isNegative()) { // We know that the sign bit is one. Known.makeNegative(); } // assume(v <=_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero. Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); // assume(v <_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // If the RHS is known zero, then this assumption must be wrong (nothing // is unsigned less than zero). Signal a conflict and get out of here. if (RHSKnown.isZero()) { Known.Zero.setAllBits(); Known.One.setAllBits(); break; } // Whatever high bits in c are zero are known to be zero (if c is a power // of 2, then one more). 
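      // Worked example (illustrative, not from the original source): on i8,
      // assume(%v u< 16) gives an RHS of 0b00010000 with three known leading
      // zeros; because 16 is a power of two the strict comparison also bounds
      // %v by 15, so one extra high bit is zero and the call below becomes
      // Known.Zero.setHighBits(3 + 1).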
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I))) Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1); else Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); } } // If assumptions conflict with each other or previous known bits, then we // have a logical fallacy. It's possible that the assumption is not reachable, // so this isn't a real bug. On the other hand, the program may have undefined // behavior, or we might have a bug in the compiler. We can't assert/crash, so // clear out the known bits, try to warn the user, and hope for the best. if (Known.Zero.intersects(Known.One)) { Known.resetAll(); if (Q.ORE) Q.ORE->emit([&]() { auto *CxtI = const_cast(Q.CxtI); return OptimizationRemarkAnalysis("value-tracking", "BadAssumption", CxtI) << "Detected conflicting code assumptions. Program may " "have undefined behavior, or compiler may have " "internal error."; }); } } /// Compute known bits from a shift operator, including those with a /// non-constant shift amount. Known is the output of this function. Known2 is a /// pre-allocated temporary with the same bit width as Known. KZF and KOF are /// operator-specific functions that, given the known-zero or known-one bits /// respectively, and a shift amount, compute the implied known-zero or /// known-one bits of the shift operator's result respectively for that shift /// amount. The results from calling KZF and KOF are conservatively combined for /// all permitted shift amounts. static void computeKnownBitsFromShiftOperator( const Operator *I, KnownBits &Known, KnownBits &Known2, unsigned Depth, const Query &Q, function_ref KZF, function_ref KOF) { unsigned BitWidth = Known.getBitWidth(); if (auto *SA = dyn_cast(I->getOperand(1))) { unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); Known.Zero = KZF(Known.Zero, ShiftAmt); Known.One = KOF(Known.One, ShiftAmt); // If the known bits conflict, this must be an overflowing left shift, so // the shift result is poison. We can return anything we want. Choose 0 for // the best folding opportunity. if (Known.hasConflict()) Known.setAllZero(); return; } computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); // If the shift amount could be greater than or equal to the bit-width of the // LHS, the value could be poison, but bail out because the check below is // expensive. TODO: Should we just carry on? if ((~Known.Zero).uge(BitWidth)) { Known.resetAll(); return; } // Note: We cannot use Known.Zero.getLimitedValue() here, because if // BitWidth > 64 and any upper bits are known, we'll end up returning the // limit value (which implies all bits are known). uint64_t ShiftAmtKZ = Known.Zero.zextOrTrunc(64).getZExtValue(); uint64_t ShiftAmtKO = Known.One.zextOrTrunc(64).getZExtValue(); // It would be more-clearly correct to use the two temporaries for this // calculation. Reusing the APInts here to prevent unnecessary allocations. Known.resetAll(); // If we know the shifter operand is nonzero, we can sometimes infer more // known bits. However this is expensive to compute, so be lazy about it and // only compute it when absolutely necessary. Optional ShifterOperandIsNonZero; // Early exit if we can't constrain any well-defined shift amount. 
if (!(ShiftAmtKZ & (PowerOf2Ceil(BitWidth) - 1)) && !(ShiftAmtKO & (PowerOf2Ceil(BitWidth) - 1))) { ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); if (!*ShifterOperandIsNonZero) return; } computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) { // Combine the shifted known input bits only for those shift amounts // compatible with its known constraints. if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt) continue; if ((ShiftAmt | ShiftAmtKO) != ShiftAmt) continue; // If we know the shifter is nonzero, we may be able to infer more known // bits. This check is sunk down as far as possible to avoid the expensive // call to isKnownNonZero if the cheaper checks above fail. if (ShiftAmt == 0) { if (!ShifterOperandIsNonZero.hasValue()) ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); if (*ShifterOperandIsNonZero) continue; } Known.Zero &= KZF(Known2.Zero, ShiftAmt); Known.One &= KOF(Known2.One, ShiftAmt); } // If the known bits conflict, the result is poison. Return a 0 and hope the // caller can further optimize that. if (Known.hasConflict()) Known.setAllZero(); } static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, unsigned Depth, const Query &Q) { unsigned BitWidth = Known.getBitWidth(); KnownBits Known2(Known); switch (I->getOpcode()) { default: break; case Instruction::Load: if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, Known); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-1 bits are only known if set in both the LHS & RHS. Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. Known.Zero |= Known2.Zero; // and(x, add (x, -1)) is a common idiom that always clears the low bit; // here we handle the more general case of adding any odd number by // matching the form add(x, add(x, y)) where y is odd. // TODO: This could be generalized to clearing any bit set in y where the // following bit is known to be unset in y. Value *X = nullptr, *Y = nullptr; if (!Known.Zero[0] && !Known.One[0] && match(I, m_c_BinOp(m_Value(X), m_Add(m_Deferred(X), m_Value(Y))))) { Known2.resetAll(); computeKnownBits(Y, Known2, Depth + 1, Q); if (Known2.countMinTrailingOnes() > 0) Known.Zero.setBit(0); } break; } case Instruction::Or: computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-0 bits are only known if clear in both the LHS & RHS. Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. Known.One |= Known2.One; break; case Instruction::Xor: { computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. 
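    // Worked example (illustrative, not from the original source): with
    // operand 0 fully known as 0b0011 and operand 1 as 0b0101, KnownZeroOut
    // above is (0b1010 & 0b1100) | (0b0101 & 0b0011) = 0b1001, and the line
    // below gives Known.One = (0b1010 & 0b0011) | (0b0101 & 0b1100) = 0b0110,
    // recovering the exact xor 0b0011 ^ 0b0101 = 0b0110.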
Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); Known.Zero = std::move(KnownZeroOut); break; } case Instruction::Mul: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known, Known2, Depth, Q); break; } case Instruction::UDiv: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); unsigned LeadZ = Known2.countMinLeadingZeros(); Known2.resetAll(); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); if (RHSMaxLeadingZeros != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; } case Instruction::Select: { const Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; if (SelectPatternResult::isMinOrMax(SPF)) { computeKnownBits(RHS, Known, Depth + 1, Q); computeKnownBits(LHS, Known2, Depth + 1, Q); } else { computeKnownBits(I->getOperand(2), Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); } unsigned MaxHighOnes = 0; unsigned MaxHighZeros = 0; if (SPF == SPF_SMAX) { // If both sides are negative, the result is negative. if (Known.isNegative() && Known2.isNegative()) // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); // If either side is non-negative, the result is non-negative. else if (Known.isNonNegative() || Known2.isNonNegative()) MaxHighZeros = 1; } else if (SPF == SPF_SMIN) { // If both sides are non-negative, the result is non-negative. if (Known.isNonNegative() && Known2.isNonNegative()) // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); // If either side is negative, the result is negative. else if (Known.isNegative() || Known2.isNegative()) MaxHighOnes = 1; } else if (SPF == SPF_UMAX) { // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); } else if (SPF == SPF_UMIN) { // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); } else if (SPF == SPF_ABS) { // RHS from matchSelectPattern returns the negation part of abs pattern. // If the negate has an NSW flag we can assume the sign bit of the result // will be 0 because that makes abs(INT_MIN) undefined. if (cast(RHS)->hasNoSignedWrap()) MaxHighZeros = 1; } // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; if (MaxHighOnes > 0) Known.One.setHighBits(MaxHighOnes); if (MaxHighZeros > 0) Known.Zero.setHighBits(MaxHighZeros); break; } case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::SIToFP: case Instruction::UIToFP: break; // Can't work with floating point. case Instruction::PtrToInt: case Instruction::IntToPtr: // Fall through and handle them the same as zext/trunc. 
LLVM_FALLTHROUGH; case Instruction::ZExt: case Instruction::Trunc: { Type *SrcTy = I->getOperand(0)->getType(); unsigned SrcBitWidth; // Note that we handle pointer operands here because of inttoptr/ptrtoint // which fall through here. Type *ScalarTy = SrcTy->getScalarType(); SrcBitWidth = ScalarTy->isPointerTy() ? Q.DL.getIndexTypeSizeInBits(ScalarTy) : Q.DL.getTypeSizeInBits(ScalarTy); assert(SrcBitWidth && "SrcBitWidth can't be zero"); Known = Known.zextOrTrunc(SrcBitWidth); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); Known = Known.zextOrTrunc(BitWidth); // Any top bits are known to be zero. if (BitWidth > SrcBitWidth) Known.Zero.setBitsFrom(SrcBitWidth); break; } case Instruction::BitCast: { Type *SrcTy = I->getOperand(0)->getType(); if (SrcTy->isIntOrPtrTy() && // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); break; } break; } case Instruction::SExt: { // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); Known = Known.trunc(SrcBitWidth); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. Known = Known.sext(BitWidth); break; } case Instruction::Shl: { // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 bool NSW = cast(I)->hasNoSignedWrap(); auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) { APInt KZResult = KnownZero << ShiftAmt; KZResult.setLowBits(ShiftAmt); // Low bits known 0. // If this shift has "nsw" keyword, then the result is either a poison // value or has the same sign bit as the first operand. if (NSW && KnownZero.isSignBitSet()) KZResult.setSignBit(); return KZResult; }; auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) { APInt KOResult = KnownOne << ShiftAmt; if (NSW && KnownOne.isSignBitSet()) KOResult.setSignBit(); return KOResult; }; computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::LShr: { // (lshr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) { APInt KZResult = KnownZero.lshr(ShiftAmt); // High bits known zero. KZResult.setHighBits(ShiftAmt); return KZResult; }; auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) { return KnownOne.lshr(ShiftAmt); }; computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::AShr: { // (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) { return KnownZero.ashr(ShiftAmt); }; auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) { return KnownOne.ashr(ShiftAmt); }; computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::Sub: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, Known, Known2, Depth, Q); break; } case Instruction::Add: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, Known, Known2, Depth, Q); break; } case Instruction::SRem: if (ConstantInt *Rem = dyn_cast(I->getOperand(1))) { APInt RA = Rem->getValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // The low bits of the first operand are unchanged by the srem. 
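        // Worked example (illustrative, not from the original source): for
        // srem by 8, LowBits is 7 and X srem 8 == X (mod 8), so the result's
        // low three bits equal X's.  With X known to be -5 (0b11111011) the
        // code below keeps the low bits 011 and, because X is negative with a
        // non-zero low part, sets the remaining high bits to one,
        // reconstructing -5 exactly.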
Known.Zero = Known2.Zero & LowBits; Known.One = Known2.One & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. if (Known2.isNonNegative() || LowBits.isSubsetOf(Known2.Zero)) Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. if (Known2.isNegative() && LowBits.intersects(Known2.One)) Known.One |= ~LowBits; assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); break; } } // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If it's known zero, our sign bit is also zero. if (Known2.isNonNegative()) Known.makeNonNegative(); break; case Instruction::URem: { if (ConstantInt *Rem = dyn_cast(I->getOperand(1))) { const APInt &RA = Rem->getValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); Known.Zero |= ~LowBits; Known.One &= LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); unsigned Leaders = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; } case Instruction::Alloca: { const AllocaInst *AI = cast(I); unsigned Align = AI->getAlignment(); if (Align == 0) Align = Q.DL.getABITypeAlignment(AI->getAllocatedType()); if (Align > 0) Known.Zero.setLowBits(countTrailingZeros(Align)); break; } case Instruction::GetElementPtr: { // Analyze all of the subscripts of this getelementptr instruction // to determine if we can prove known low zero bits. KnownBits LocalKnown(BitWidth); computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q); unsigned TrailZ = LocalKnown.countMinTrailingZeros(); gep_type_iterator GTI = gep_type_begin(I); for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) { Value *Index = I->getOperand(i); if (StructType *STy = GTI.getStructTypeOrNull()) { // Handle struct member offset arithmetic. // Handle case when index is vector zeroinitializer Constant *CIndex = cast(Index); if (CIndex->isZeroValue()) continue; if (CIndex->getType()->isVectorTy()) Index = CIndex->getSplatValue(); unsigned Idx = cast(Index)->getZExtValue(); const StructLayout *SL = Q.DL.getStructLayout(STy); uint64_t Offset = SL->getElementOffset(Idx); TrailZ = std::min(TrailZ, countTrailingZeros(Offset)); } else { // Handle array index arithmetic. Type *IndexedTy = GTI.getIndexedType(); if (!IndexedTy->isSized()) { TrailZ = 0; break; } unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits(); uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy); LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0); computeKnownBits(Index, LocalKnown, Depth + 1, Q); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + LocalKnown.countMinTrailingZeros())); } } Known.Zero.setLowBits(TrailZ); break; } case Instruction::PHI: { const PHINode *P = cast(I); // Handle the case of a simple two-predecessor recurrence PHI. // There's a lot more that could theoretically be done here, but // this is sufficient to catch some interesting cases. 
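    // Illustrative sketch (not from the original source; the value names are
    // hypothetical).  For the common induction-variable shape
    //
    //   %iv      = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
    //   %iv.next = add nsw i32 %iv, 4
    //
    // both the start value 0 and the step 4 have at least two trailing zeros,
    // so the code below marks the low two bits of %iv as zero, and since both
    // are non-negative and the add is nsw, %iv is also made non-negative.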
if (P->getNumIncomingValues() == 2) { for (unsigned i = 0; i != 2; ++i) { Value *L = P->getIncomingValue(i); Value *R = P->getIncomingValue(!i); Operator *LU = dyn_cast(L); if (!LU) continue; unsigned Opcode = LU->getOpcode(); // Check for operations that have the property that if // both their operands have low zero bits, the result // will have low zero bits. if (Opcode == Instruction::Add || Opcode == Instruction::Sub || Opcode == Instruction::And || Opcode == Instruction::Or || Opcode == Instruction::Mul) { Value *LL = LU->getOperand(0); Value *LR = LU->getOperand(1); // Find a recurrence. if (LL == I) L = LR; else if (LR == I) L = LL; else break; // Ok, we have a PHI of the form L op= R. Check for low // zero bits. computeKnownBits(R, Known2, Depth + 1, Q); // We need to take the minimum number of known bits KnownBits Known3(Known); computeKnownBits(L, Known3, Depth + 1, Q); Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(), Known3.countMinTrailingZeros())); auto *OverflowOp = dyn_cast(LU); if (OverflowOp && OverflowOp->hasNoSignedWrap()) { // If initial value of recurrence is nonnegative, and we are adding // a nonnegative number with nsw, the result can only be nonnegative // or poison value regardless of the number of times we execute the // add in phi recurrence. If initial value is negative and we are // adding a negative number with nsw, the result can only be // negative or poison value. Similar arguments apply to sub and mul. // // (add non-negative, non-negative) --> non-negative // (add negative, negative) --> negative if (Opcode == Instruction::Add) { if (Known2.isNonNegative() && Known3.isNonNegative()) Known.makeNonNegative(); else if (Known2.isNegative() && Known3.isNegative()) Known.makeNegative(); } // (sub nsw non-negative, negative) --> non-negative // (sub nsw negative, non-negative) --> negative else if (Opcode == Instruction::Sub && LL == I) { if (Known2.isNonNegative() && Known3.isNegative()) Known.makeNonNegative(); else if (Known2.isNegative() && Known3.isNonNegative()) Known.makeNegative(); } // (mul nsw non-negative, non-negative) --> non-negative else if (Opcode == Instruction::Mul && Known2.isNonNegative() && Known3.isNonNegative()) Known.makeNonNegative(); } break; } } } // Unreachable blocks may have zero-operand PHI nodes. if (P->getNumIncomingValues() == 0) break; // Otherwise take the unions of the known bit sets of the operands, // taking conservative care to avoid excessive recursion. if (Depth < MaxDepth - 1 && !Known.Zero && !Known.One) { // Skip if every incoming value references to ourself. if (dyn_cast_or_null(P->hasConstantValue())) break; Known.Zero.setAllBits(); Known.One.setAllBits(); for (Value *IncValue : P->incoming_values()) { // Skip direct self references. if (IncValue == P) continue; Known2 = KnownBits(BitWidth); // Recurse, but cap the recursion to one level, because we don't // want to waste time spinning around in loops. computeKnownBits(IncValue, Known2, MaxDepth - 1, Q); Known.Zero &= Known2.Zero; Known.One &= Known2.One; // If all bits have been ruled out, there's no need to check // more operands. if (!Known.Zero && !Known.One) break; } } break; } case Instruction::Call: case Instruction::Invoke: // If range metadata is attached to this call, set known bits from that, // and then intersect with known bits based on other properties of the // function. 
if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, Known); if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) { computeKnownBits(RV, Known2, Depth + 1, Q); Known.Zero |= Known2.Zero; Known.One |= Known2.One; } if (const IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::bitreverse: computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.reverseBits(); Known.One |= Known2.One.reverseBits(); break; case Intrinsic::bswap: computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.byteSwap(); Known.One |= Known2.One.byteSwap(); break; case Intrinsic::ctlz: { computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleLZ = Known2.One.countLeadingZeros(); // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) PossibleLZ = std::min(PossibleLZ, BitWidth - 1); unsigned LowBits = Log2_32(PossibleLZ)+1; Known.Zero.setBitsFrom(LowBits); break; } case Intrinsic::cttz: { computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleTZ = Known2.One.countTrailingZeros(); // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) PossibleTZ = std::min(PossibleTZ, BitWidth - 1); unsigned LowBits = Log2_32(PossibleTZ)+1; Known.Zero.setBitsFrom(LowBits); break; } case Intrinsic::ctpop: { computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. unsigned BitsPossiblySet = Known2.countMaxPopulation(); unsigned LowBits = Log2_32(BitsPossiblySet)+1; Known.Zero.setBitsFrom(LowBits); // TODO: we could bound KnownOne using the lower bound on the number // of bits which might be set provided by popcnt KnownOne2. break; } case Intrinsic::x86_sse42_crc32_64_64: Known.Zero.setBitsFrom(32); break; } } break; case Instruction::ExtractElement: // Look through extract element. At the moment we keep this simple and skip // tracking the specific element. But at least we might find information // valid for all elements of the vector (for example if vector is sign // extended, shifted, etc). computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); break; case Instruction::ExtractValue: if (IntrinsicInst *II = dyn_cast(I->getOperand(0))) { const ExtractValueInst *EVI = cast(I); if (EVI->getNumIndices() != 1) break; if (EVI->getIndices()[0] == 0) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: computeKnownBitsAddSub(true, II->getArgOperand(0), II->getArgOperand(1), false, Known, Known2, Depth, Q); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: computeKnownBitsAddSub(false, II->getArgOperand(0), II->getArgOperand(1), false, Known, Known2, Depth, Q); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false, Known, Known2, Depth, Q); break; } } } } } /// Determine which bits of V are known to be either zero or one and return /// them. 
KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) { KnownBits Known(getBitWidth(V->getType(), Q.DL)); computeKnownBits(V, Known, Depth, Q); return Known; } /// Determine which bits of V are known to be either zero or one and return /// them in the Known bit set. /// /// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that /// we cannot optimize based on the assumption that it is zero without changing /// it to be an explicit zero. If we don't change it to zero, other code could /// optimized based on the contradictory assumption that it is non-zero. /// Because instcombine aggressively folds operations with undef args anyway, /// this won't lose us code quality. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case /// where V is a vector, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q) { assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); unsigned BitWidth = Known.getBitWidth(); assert((V->getType()->isIntOrIntVectorTy(BitWidth) || V->getType()->isPtrOrPtrVectorTy()) && "Not integer or pointer type!"); Type *ScalarTy = V->getType()->getScalarType(); unsigned ExpectedWidth = ScalarTy->isPointerTy() ? Q.DL.getIndexTypeSizeInBits(ScalarTy) : Q.DL.getTypeSizeInBits(ScalarTy); assert(ExpectedWidth == BitWidth && "V and Known should have same BitWidth"); (void)BitWidth; (void)ExpectedWidth; const APInt *C; if (match(V, m_APInt(C))) { // We know all of the bits for a scalar constant or a splat vector constant! Known.One = *C; Known.Zero = ~Known.One; return; } // Null and aggregate-zero are all-zeros. if (isa(V) || isa(V)) { Known.setAllZero(); return; } // Handle a constant vector by taking the intersection of the known bits of // each element. if (const ConstantDataSequential *CDS = dyn_cast(V)) { // We know that CDS must be a vector of integers. Take the intersection of // each element. Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { APInt Elt = CDS->getElementAsAPInt(i); Known.Zero &= ~Elt; Known.One &= Elt; } return; } if (const auto *CV = dyn_cast(V)) { // We know that CV must be a vector of integers. Take the intersection of // each element. Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { Constant *Element = CV->getAggregateElement(i); auto *ElementCI = dyn_cast_or_null(Element); if (!ElementCI) { Known.resetAll(); return; } const APInt &Elt = ElementCI->getValue(); Known.Zero &= ~Elt; Known.One &= Elt; } return; } // Start out not knowing anything. Known.resetAll(); // We can't imply anything about undefs. if (isa(V)) return; // There's no point in looking through other users of ConstantData for // assumptions. Confirm that we've handled them all. assert(!isa(V) && "Unhandled constant data!"); // Limit search depth. // All recursive calls that increase depth must come after this. if (Depth == MaxDepth) return; // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has // the bits of its aliasee. 
if (const GlobalAlias *GA = dyn_cast(V)) { if (!GA->isInterposable()) computeKnownBits(GA->getAliasee(), Known, Depth + 1, Q); return; } if (const Operator *I = dyn_cast(V)) computeKnownBitsFromOperator(I, Known, Depth, Q); // Aligned pointers have trailing zeros - refine Known.Zero set if (V->getType()->isPointerTy()) { unsigned Align = V->getPointerAlignment(Q.DL); if (Align) Known.Zero.setLowBits(countTrailingZeros(Align)); } // computeKnownBitsFromAssume strictly refines Known. // Therefore, we run them after computeKnownBitsFromOperator. // Check whether a nearby assume intrinsic can determine some known bits. computeKnownBitsFromAssume(V, Known, Depth, Q); assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } /// Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer /// types and vectors of integers. bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q) { assert(Depth <= MaxDepth && "Limit Search Depth"); // Attempt to match against constants. if (OrZero && match(V, m_Power2OrZero())) return true; if (match(V, m_Power2())) return true; // 1 << X is clearly a power of two if the one is not shifted off the end. If // it is shifted off the end then the result is undefined. if (match(V, m_Shl(m_One(), m_Value()))) return true; // (signmask) >>l X is clearly a power of two if the one is not shifted off // the bottom. If it is shifted off the bottom then the result is undefined. if (match(V, m_LShr(m_SignMask(), m_Value()))) return true; // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth++ == MaxDepth) return false; Value *X = nullptr, *Y = nullptr; // A shift left or a logical shift right of a power of two is a power of two // or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || match(V, m_LShr(m_Value(X), m_Value())))) return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q); if (const ZExtInst *ZI = dyn_cast(V)) return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q); if (const SelectInst *SI = dyn_cast(V)) return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) && isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q); if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) { // A power of two and'd with anything is a power of two or zero. if (isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q) || isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, Depth, Q)) return true; // X & (-X) is always a power of two or zero. if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X)))) return true; return false; } // Adding a power-of-two or zero to the same power-of-two or zero yields // either the original power-of-two, a larger power-of-two or zero. 
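  // Worked example (illustrative, not from the original source): if %y is a
  // power of two (or zero) and %x = and %y, %m, then %x is either %y or 0, so
  // %x + %y is either 2*%y or %y -- e.g. %y == 8 gives a sum of 16 or 8,
  // still a power of two (or zero).  That is the and/specific pattern the
  // code below matches, subject to the OrZero/nuw/nsw guard.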
if (match(V, m_Add(m_Value(X), m_Value(Y)))) { const OverflowingBinaryOperator *VOBO = cast(V); if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) { if (match(X, m_And(m_Specific(Y), m_Value())) || match(X, m_And(m_Value(), m_Specific(Y)))) if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q)) return true; if (match(Y, m_And(m_Specific(X), m_Value())) || match(Y, m_And(m_Value(), m_Specific(X)))) if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q)) return true; unsigned BitWidth = V->getType()->getScalarSizeInBits(); KnownBits LHSBits(BitWidth); computeKnownBits(X, LHSBits, Depth, Q); KnownBits RHSBits(BitWidth); computeKnownBits(Y, RHSBits, Depth, Q); // If i8 V is a power of two or zero: // ZeroBits: 1 1 1 0 1 1 1 1 // ~ZeroBits: 0 0 0 1 0 0 0 0 if ((~(LHSBits.Zero & RHSBits.Zero)).isPowerOf2()) // If OrZero isn't set, we cannot give back a zero result. // Make sure either the LHS or RHS has a bit set. if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue()) return true; } } // An exact divide or right shift can only shift off zero bits, so the result // is a power of two only if the first operand is a power of two and not // copying a sign bit (sdiv int_min, 2). if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) || match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) { return isKnownToBeAPowerOfTwo(cast(V)->getOperand(0), OrZero, Depth, Q); } return false; } /// Test whether a GEP's result is known to be non-null. /// /// Uses properties inherent in a GEP to try to determine whether it is known /// to be non-null. /// /// Currently this routine does not support vector GEPs. static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, const Query &Q) { const Function *F = nullptr; if (const Instruction *I = dyn_cast(GEP)) F = I->getFunction(); if (!GEP->isInBounds() || NullPointerIsDefined(F, GEP->getPointerAddressSpace())) return false; // FIXME: Support vector-GEPs. assert(GEP->getType()->isPointerTy() && "We only support plain pointer GEP"); // If the base pointer is non-null, we cannot walk to a null address with an // inbounds GEP in address space zero. if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q)) return true; // Walk the GEP operands and see if any operand introduces a non-zero offset. // If so, then the GEP cannot produce a null pointer, as doing so would // inherently violate the inbounds contract within address space zero. for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); GTI != GTE; ++GTI) { // Struct types are easy -- they must always be indexed by a constant. if (StructType *STy = GTI.getStructTypeOrNull()) { ConstantInt *OpC = cast(GTI.getOperand()); unsigned ElementIdx = OpC->getZExtValue(); const StructLayout *SL = Q.DL.getStructLayout(STy); uint64_t ElementOffset = SL->getElementOffset(ElementIdx); if (ElementOffset > 0) return true; continue; } // If we have a zero-sized type, the index doesn't matter. Keep looping. if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0) continue; // Fast path the constant operand case both for efficiency and so we don't // increment Depth when just zipping down an all-constant GEP. if (ConstantInt *OpC = dyn_cast(GTI.getOperand())) { if (!OpC->isZero()) return true; continue; } // We post-increment Depth here because while isKnownNonZero increments it // as well, when we pop back up that increment won't persist. We don't want // to recurse 10k times just because we have 10k GEP operands. 
We don't // bail completely out because we want to handle constant GEPs regardless // of depth. if (Depth++ >= MaxDepth) continue; if (isKnownNonZero(GTI.getOperand(), Depth, Q)) return true; } return false; } static bool isKnownNonNullFromDominatingCondition(const Value *V, const Instruction *CtxI, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "V must be pointer type"); assert(!isa(V) && "Did not expect ConstantPointerNull"); if (!CtxI || !DT) return false; unsigned NumUsesExplored = 0; for (auto *U : V->users()) { // Avoid massive lists if (NumUsesExplored >= DomConditionsMaxUses) break; NumUsesExplored++; // If the value is used as an argument to a call or invoke, then argument // attributes may provide an answer about null-ness. if (auto CS = ImmutableCallSite(U)) if (auto *CalledFunc = CS.getCalledFunction()) for (const Argument &Arg : CalledFunc->args()) if (CS.getArgOperand(Arg.getArgNo()) == V && Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI)) return true; // Consider only compare instructions uniquely controlling a branch CmpInst::Predicate Pred; if (!match(const_cast(U), m_c_ICmp(Pred, m_Specific(V), m_Zero())) || (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)) continue; for (auto *CmpU : U->users()) { if (const BranchInst *BI = dyn_cast(CmpU)) { assert(BI->isConditional() && "uses a comparison!"); BasicBlock *NonNullSuccessor = BI->getSuccessor(Pred == ICmpInst::ICMP_EQ ? 1 : 0); BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor); if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent())) return true; } else if (Pred == ICmpInst::ICMP_NE && match(CmpU, m_Intrinsic()) && DT->dominates(cast(CmpU), CtxI)) { return true; } } } return false; } /// Does the 'Range' metadata (which must be a valid MD_range operand list) /// ensure that the value it's attached to is never Value? 'RangeType' is /// is the type of the value described by the range. static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value) { const unsigned NumRanges = Ranges->getNumOperands() / 2; assert(NumRanges >= 1); for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract(Ranges->getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract(Ranges->getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); if (Range.contains(Value)) return false; } return true; } /// Return true if the given value is known to be non-zero when defined. For /// vectors, return true if every element is known to be non-zero when /// defined. For pointers, if the context instruction and dominator tree are /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (auto *C = dyn_cast(V)) { if (C->isNullValue()) return false; if (isa(C)) // Must be non-zero due to null test above. return true; // For constant vectors, check that all elements are undefined or known // non-zero to determine that the whole vector is known non-zero. if (auto *VecTy = dyn_cast(C->getType())) { for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) { Constant *Elt = C->getAggregateElement(i); if (!Elt || Elt->isNullValue()) return false; if (!isa(Elt) && !isa(Elt)) return false; } return true; } // A global variable in address space 0 is non null unless extern weak // or an absolute symbol reference. 
Other address spaces may have null as a // valid address for a global, so we can't assume anything. if (const GlobalValue *GV = dyn_cast(V)) { if (!GV->isAbsoluteSymbolRef() && !GV->hasExternalWeakLinkage() && GV->getType()->getAddressSpace() == 0) return true; } else return false; } if (auto *I = dyn_cast(V)) { if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) { // If the possible ranges don't contain zero, then the value is // definitely non-zero. if (auto *Ty = dyn_cast(V->getType())) { const APInt ZeroValue(Ty->getBitWidth(), 0); if (rangeMetadataExcludesValue(Ranges, ZeroValue)) return true; } } } // Some of the tests below are recursive, so bail out if we hit the limit. if (Depth++ >= MaxDepth) return false; // Check for pointer simplifications. if (V->getType()->isPointerTy()) { // Alloca never returns null, malloc might. if (isa(V) && Q.DL.getAllocaAddrSpace() == 0) return true; // A byval, inalloca, or nonnull argument is never null. if (const Argument *A = dyn_cast(V)) if (A->hasByValOrInAllocaAttr() || A->hasNonNullAttr()) return true; // A Load tagged with nonnull metadata is never null. if (const LoadInst *LI = dyn_cast(V)) if (LI->getMetadata(LLVMContext::MD_nonnull)) return true; if (auto CS = ImmutableCallSite(V)) { if (CS.isReturnNonNull()) return true; if (const auto *RP = getArgumentAliasingToReturnedPointer(CS)) return isKnownNonZero(RP, Depth, Q); } } // Check for recursive pointer simplifications. if (V->getType()->isPointerTy()) { if (isKnownNonNullFromDominatingCondition(V, Q.CxtI, Q.DT)) return true; if (const GEPOperator *GEP = dyn_cast(V)) if (isGEPKnownNonNull(GEP, Depth, Q)) return true; } unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL); // X | Y != 0 if X != 0 or Y != 0. Value *X = nullptr, *Y = nullptr; if (match(V, m_Or(m_Value(X), m_Value(Y)))) return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q); // ext X != 0 if X != 0. if (isa(V) || isa(V)) return isKnownNonZero(cast(V)->getOperand(0), Depth, Q); // shl X, Y != 0 if X is odd. Note that the value of the shift is undefined // if the lowest bit is shifted off the end. if (match(V, m_Shl(m_Value(X), m_Value(Y)))) { // shl nuw can't remove any non-zero bits. const OverflowingBinaryOperator *BO = cast(V); if (BO->hasNoUnsignedWrap()) return isKnownNonZero(X, Depth, Q); KnownBits Known(BitWidth); computeKnownBits(X, Known, Depth, Q); if (Known.One[0]) return true; } // shr X, Y != 0 if X is negative. Note that the value of the shift is not // defined if the sign bit is shifted off the end. else if (match(V, m_Shr(m_Value(X), m_Value(Y)))) { // shr exact can only shift out zero bits. const PossiblyExactOperator *BO = cast(V); if (BO->isExact()) return isKnownNonZero(X, Depth, Q); KnownBits Known = computeKnownBits(X, Depth, Q); if (Known.isNegative()) return true; // If the shifter operand is a constant, and all of the bits shifted // out are known to be zero, and X is known non-zero then at least one // non-zero bit must remain. if (ConstantInt *Shift = dyn_cast(Y)) { auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); // Is there a known one in the portion not shifted out? if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal) return true; // Are all the bits to be shifted out known zero? if (Known.countMinTrailingZeros() >= ShiftVal) return isKnownNonZero(X, Depth, Q); } } // div exact can only produce a zero if the dividend is zero. else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) { return isKnownNonZero(X, Depth, Q); } // X + Y. 
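  // Illustrative reasoning (not from the original source): if X and Y are
  // both known non-negative on iN, each is at most 2^(N-1) - 1, so X + Y is
  // at most 2^N - 2 and cannot wrap around to zero; the sum is therefore zero
  // only when both addends are zero, which the first check below exploits.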
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) { KnownBits XKnown = computeKnownBits(X, Depth, Q); KnownBits YKnown = computeKnownBits(Y, Depth, Q); // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. if (XKnown.isNonNegative() && YKnown.isNonNegative()) if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q)) return true; // If X and Y are both negative (as signed values) then their sum is not // zero unless both X and Y equal INT_MIN. if (XKnown.isNegative() && YKnown.isNegative()) { APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. if (XKnown.One.intersects(Mask)) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. if (YKnown.One.intersects(Mask)) return true; } // The sum of a non-negative number and a power of two is not zero. if (XKnown.isNonNegative() && isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q)) return true; if (YKnown.isNonNegative() && isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q)) return true; } // X * Y. else if (match(V, m_Mul(m_Value(X), m_Value(Y)))) { const OverflowingBinaryOperator *BO = cast(V); // If X and Y are non-zero then so is X * Y as long as the multiplication // does not overflow. if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) && isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q)) return true; } // (C ? X : Y) != 0 if X != 0 and Y != 0. else if (const SelectInst *SI = dyn_cast(V)) { if (isKnownNonZero(SI->getTrueValue(), Depth, Q) && isKnownNonZero(SI->getFalseValue(), Depth, Q)) return true; } // PHI else if (const PHINode *PN = dyn_cast(V)) { // Try and detect a recurrence that monotonically increases from a // starting value, as these are common as induction variables. if (PN->getNumIncomingValues() == 2) { Value *Start = PN->getIncomingValue(0); Value *Induction = PN->getIncomingValue(1); if (isa(Induction) && !isa(Start)) std::swap(Start, Induction); if (ConstantInt *C = dyn_cast(Start)) { if (!C->isZero() && !C->isNegative()) { ConstantInt *X; if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) || match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) && !X->isNegative()) return true; } } } // Check if all incoming values are non-zero constant. bool AllNonZeroConstants = llvm::all_of(PN->operands(), [](Value *V) { return isa(V) && !cast(V)->isZero(); }); if (AllNonZeroConstants) return true; } KnownBits Known(BitWidth); computeKnownBits(V, Known, Depth, Q); return Known.One != 0; } /// Return true if V2 == V1 + X, where X is known non-zero. static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) { const BinaryOperator *BO = dyn_cast(V1); if (!BO || BO->getOpcode() != Instruction::Add) return false; Value *Op = nullptr; if (V2 == BO->getOperand(0)) Op = BO->getOperand(1); else if (V2 == BO->getOperand(1)) Op = BO->getOperand(0); else return false; return isKnownNonZero(Op, 0, Q); } /// Return true if it is known that V1 != V2. static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) { if (V1 == V2) return false; if (V1->getType() != V2->getType()) // We can't look through casts yet. return false; if (isAddOfNonZero(V1, V2, Q) || isAddOfNonZero(V2, V1, Q)) return true; if (V1->getType()->isIntOrIntVectorTy()) { // Are any known bits in V1 contradictory to known bits in V2? If V1 // has a known zero where V2 has a known one, they must not be equal. 
KnownBits Known1 = computeKnownBits(V1, 0, Q); KnownBits Known2 = computeKnownBits(V2, 0, Q); if (Known1.Zero.intersects(Known2.One) || Known2.Zero.intersects(Known1.One)) return true; } return false; } /// Return true if 'V & Mask' is known to be zero. We use this predicate to /// simplify operations downstream. Mask is known to be zero for bits that V /// cannot have. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case /// where V is a vector, the mask, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth, const Query &Q) { KnownBits Known(Mask.getBitWidth()); computeKnownBits(V, Known, Depth, Q); return Mask.isSubsetOf(Known.Zero); } /// For vector constants, loop over the elements and find the constant with the /// minimum number of sign bits. Return 0 if the value is not a vector constant /// or if any element was not analyzed; otherwise, return the count for the /// element with the minimum number of sign bits. static unsigned computeNumSignBitsVectorConstant(const Value *V, unsigned TyBits) { const auto *CV = dyn_cast(V); if (!CV || !CV->getType()->isVectorTy()) return 0; unsigned MinSignBits = TyBits; unsigned NumElts = CV->getType()->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { // If we find a non-ConstantInt, bail out. auto *Elt = dyn_cast_or_null(CV->getAggregateElement(i)); if (!Elt) return 0; MinSignBits = std::min(MinSignBits, Elt->getValue().getNumSignBits()); } return MinSignBits; } static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, const Query &Q); static unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) { unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q); assert(Result > 0 && "At least one sign bit needs to be present!"); return Result; } /// Return the number of times the sign bit of the register is replicated into /// the other bits. We know that at least 1 bit is always equal to the sign bit /// (itself), but other cases can give us information. For example, immediately /// after an "ashr X, 2", we know that the top 3 bits are all equal to each /// other, so we return 3. For vectors, return the number of sign bits for the /// vector element with the minimum number of known sign bits. static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, const Query &Q) { assert(Depth <= MaxDepth && "Limit Search Depth"); // We return the minimum number of sign bits that are guaranteed to be present // in V, so for undef we have to conservatively return 1. We don't have the // same behavior for poison though -- that's a FIXME today. Type *ScalarTy = V->getType()->getScalarType(); unsigned TyBits = ScalarTy->isPointerTy() ? Q.DL.getIndexTypeSizeInBits(ScalarTy) : Q.DL.getTypeSizeInBits(ScalarTy); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; // Note that ConstantInt is handled by the general computeKnownBits case // below. if (Depth == MaxDepth) return 1; // Limit search depth. const Operator *U = dyn_cast(V); switch (Operator::getOpcode(V)) { default: break; case Instruction::SExt: Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits(); return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp; case Instruction::SDiv: { const APInt *Denominator; // sdiv X, C -> adds log(C) sign bits. 
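    // Worked example (illustrative, not from the original source): an i16
    // numerator with five known sign bits lies in [-2048, 2047]; sdiv by the
    // positive constant 4 (logBase2 == 2) yields a value in [-512, 511],
    // which has at least 5 + 2 == 7 sign bits, matching the
    // min(TyBits, NumBits + logBase2) computed below.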
if (match(U->getOperand(1), m_APInt(Denominator))) { // Ignore non-positive denominator. if (!Denominator->isStrictlyPositive()) break; // Calculate the incoming numerator bits. unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // Add floor(log(C)) bits to the numerator bits. return std::min(TyBits, NumBits + Denominator->logBase2()); } break; } case Instruction::SRem: { const APInt *Denominator; // srem X, C -> we know that the result is within [-C+1,C) when C is a // positive constant. This let us put a lower bound on the number of sign // bits. if (match(U->getOperand(1), m_APInt(Denominator))) { // Ignore non-positive denominator. if (!Denominator->isStrictlyPositive()) break; // Calculate the incoming numerator bits. SRem by a positive constant // can't lower the number of sign bits. unsigned NumrBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // Calculate the leading sign bit constraints by examining the // denominator. Given that the denominator is positive, there are two // cases: // // 1. the numerator is positive. The result range is [0,C) and [0,C) u< // (1 << ceilLogBase2(C)). // // 2. the numerator is negative. Then the result range is (-C,0] and // integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)). // // Thus a lower bound on the number of sign bits is `TyBits - // ceilLogBase2(C)`. unsigned ResBits = TyBits - Denominator->ceilLogBase2(); return std::max(NumrBits, ResBits); } break; } case Instruction::AShr: { Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // ashr X, C -> adds C sign bits. Vectors too. const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { if (ShAmt->uge(TyBits)) break; // Bad shift. unsigned ShAmtLimited = ShAmt->getZExtValue(); Tmp += ShAmtLimited; if (Tmp > TyBits) Tmp = TyBits; } return Tmp; } case Instruction::Shl: { const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { // shl destroys sign bits. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (ShAmt->uge(TyBits) || // Bad shift. ShAmt->uge(Tmp)) break; // Shifted all sign bits out. Tmp2 = ShAmt->getZExtValue(); return Tmp - Tmp2; } break; } case Instruction::And: case Instruction::Or: case Instruction::Xor: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp != 1) { Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses // computeKnownBits, and pick whichever answer is better. } break; case Instruction::Select: Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp == 1) break; Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q); return std::min(Tmp, Tmp2); case Instruction::Add: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp == 1) break; // Special case decrementing a value (ADD X, -1): if (const auto *CRHS = dyn_cast(U->getOperand(1))) if (CRHS->isAllOnesValue()) { KnownBits Known(TyBits); computeKnownBits(U->getOperand(0), Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((Known.Zero | 1).isAllOnesValue()) return TyBits; // If we are subtracting one from a positive number, there is no carry // out of the result. 
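// Standalone illustration, using only standard C++ (not part of this patch):
// two of the sign-bit rules above, checked exhaustively at 8 bits --
// "ashr X, C" adds C sign bits and "sdiv X, C" (C > 0) adds floor(log2(C))
// sign bits. numSignBits is a hypothetical helper mirroring ComputeNumSignBits.
#include <cassert>
#include <cstdint>

// Leading bits equal to the sign bit, including the sign bit itself.
static unsigned numSignBits(int8_t v) {
  uint8_t u = static_cast<uint8_t>(v), sign = u >> 7;
  unsigned n = 0;
  for (int bit = 7; bit >= 0 && ((u >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

int main() {
  for (int i = -128; i <= 127; ++i) {
    unsigned n = numSignBits(static_cast<int8_t>(i));
    unsigned bound = n + 2 > 8 ? 8 : n + 2;
    int8_t ashr2 = static_cast<int8_t>(i < 0 ? ~((~i) >> 2) : i >> 2); // arithmetic shift by 2
    assert(numSignBits(ashr2) >= bound);
    int8_t sdiv4 = static_cast<int8_t>(i / 4);  // divide by 4: floor(log2(4)) == 2 extra bits
    assert(numSignBits(sdiv4) >= bound);
  }
  return 0;
}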
if (Known.isNonNegative()) return Tmp; } Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp2 == 1) break; return std::min(Tmp, Tmp2)-1; case Instruction::Sub: Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp2 == 1) break; // Handle NEG. if (const auto *CLHS = dyn_cast(U->getOperand(0))) if (CLHS->isNullValue()) { KnownBits Known(TyBits); computeKnownBits(U->getOperand(1), Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((Known.Zero | 1).isAllOnesValue()) return TyBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. if (Known.isNonNegative()) return Tmp2; // Otherwise, we treat this like a SUB. } // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp == 1) break; return std::min(Tmp, Tmp2)-1; case Instruction::Mul: { // The output of the Mul can be at most twice the valid bits in the inputs. unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (SignBitsOp0 == 1) break; unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (SignBitsOp1 == 1) break; unsigned OutValidBits = (TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1); return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1; } case Instruction::PHI: { const PHINode *PN = cast(U); unsigned NumIncomingValues = PN->getNumIncomingValues(); // Don't analyze large in-degree PHIs. if (NumIncomingValues > 4) break; // Unreachable blocks may have zero-operand PHI nodes. if (NumIncomingValues == 0) break; // Take the minimum of all incoming values. This can't infinitely loop // because of our depth threshold. Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q); for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) { if (Tmp == 1) return Tmp; Tmp = std::min( Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q)); } return Tmp; } case Instruction::Trunc: // FIXME: it's tricky to do anything useful for this, but it is an important // case for targets like X86. break; case Instruction::ExtractElement: // Look through extract element. At the moment we keep this simple and skip // tracking the specific element. But at least we might find information // valid for all elements of the vector (for example if vector is sign // extended, shifted, etc). return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. // If we can examine all elements of a vector constant successfully, we're // done (we can't do any better than that). If not, keep trying. if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits)) return VecSignBits; KnownBits Known(TyBits); computeKnownBits(V, Known, Depth, Q); // If we know that the sign bit is either zero or one, determine the number of // identical bits in the top of the input value. return std::max(FirstAnswer, Known.countMinSignBits()); } /// This function computes the integer multiple of Base that equals V. /// If successful, it returns true and returns the multiple in /// Multiple. If unsuccessful, it returns false. It looks /// through SExt instructions only if LookThroughSExt is true. 
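// Standalone illustration, using only standard C++ (not part of this patch):
// "Add can have at most one carry bit" -- when both operands have at least two
// sign bits, the sum keeps min(n0, n1) - 1 of them. Checked exhaustively at
// 8 bits; numSignBits is a hypothetical helper mirroring ComputeNumSignBits.
#include <cassert>
#include <cstdint>

static unsigned numSignBits(int8_t v) {
  uint8_t u = static_cast<uint8_t>(v), sign = u >> 7;
  unsigned n = 0;
  for (int bit = 7; bit >= 0 && ((u >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

int main() {
  for (int x = -128; x <= 127; ++x)
    for (int y = -128; y <= 127; ++y) {
      unsigned n0 = numSignBits(static_cast<int8_t>(x));
      unsigned n1 = numSignBits(static_cast<int8_t>(y));
      unsigned lo = n0 < n1 ? n0 : n1;
      if (lo < 2)
        continue;                             // the analysis gives up in this case
      int8_t sum = static_cast<int8_t>(x + y); // cannot wrap when both are in [-64, 63]
      assert(numSignBits(sum) >= lo - 1);
    }
  return 0;
}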
bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, bool LookThroughSExt, unsigned Depth) { const unsigned MaxDepth = 6; assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); assert(V->getType()->isIntegerTy() && "Not integer or pointer type!"); Type *T = V->getType(); ConstantInt *CI = dyn_cast(V); if (Base == 0) return false; if (Base == 1) { Multiple = V; return true; } ConstantExpr *CO = dyn_cast(V); Constant *BaseVal = ConstantInt::get(T, Base); if (CO && CO == BaseVal) { // Multiple is 1. Multiple = ConstantInt::get(T, 1); return true; } if (CI && CI->getZExtValue() % Base == 0) { Multiple = ConstantInt::get(T, CI->getZExtValue() / Base); return true; } if (Depth == MaxDepth) return false; // Limit search depth. Operator *I = dyn_cast(V); if (!I) return false; switch (I->getOpcode()) { default: break; case Instruction::SExt: if (!LookThroughSExt) return false; // otherwise fall through to ZExt LLVM_FALLTHROUGH; case Instruction::ZExt: return ComputeMultiple(I->getOperand(0), Base, Multiple, LookThroughSExt, Depth+1); case Instruction::Shl: case Instruction::Mul: { Value *Op0 = I->getOperand(0); Value *Op1 = I->getOperand(1); if (I->getOpcode() == Instruction::Shl) { ConstantInt *Op1CI = dyn_cast(Op1); if (!Op1CI) return false; // Turn Op0 << Op1 into Op0 * 2^Op1 APInt Op1Int = Op1CI->getValue(); uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1); APInt API(Op1Int.getBitWidth(), 0); API.setBit(BitToSet); Op1 = ConstantInt::get(V->getContext(), API); } Value *Mul0 = nullptr; if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) { if (Constant *Op1C = dyn_cast(Op1)) if (Constant *MulC = dyn_cast(Mul0)) { if (Op1C->getType()->getPrimitiveSizeInBits() < MulC->getType()->getPrimitiveSizeInBits()) Op1C = ConstantExpr::getZExt(Op1C, MulC->getType()); if (Op1C->getType()->getPrimitiveSizeInBits() > MulC->getType()->getPrimitiveSizeInBits()) MulC = ConstantExpr::getZExt(MulC, Op1C->getType()); // V == Base * (Mul0 * Op1), so return (Mul0 * Op1) Multiple = ConstantExpr::getMul(MulC, Op1C); return true; } if (ConstantInt *Mul0CI = dyn_cast(Mul0)) if (Mul0CI->getValue() == 1) { // V == Base * Op1, so return Op1 Multiple = Op1; return true; } } Value *Mul1 = nullptr; if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) { if (Constant *Op0C = dyn_cast(Op0)) if (Constant *MulC = dyn_cast(Mul1)) { if (Op0C->getType()->getPrimitiveSizeInBits() < MulC->getType()->getPrimitiveSizeInBits()) Op0C = ConstantExpr::getZExt(Op0C, MulC->getType()); if (Op0C->getType()->getPrimitiveSizeInBits() > MulC->getType()->getPrimitiveSizeInBits()) MulC = ConstantExpr::getZExt(MulC, Op0C->getType()); // V == Base * (Mul1 * Op0), so return (Mul1 * Op0) Multiple = ConstantExpr::getMul(MulC, Op0C); return true; } if (ConstantInt *Mul1CI = dyn_cast(Mul1)) if (Mul1CI->getValue() == 1) { // V == Base * Op0, so return Op0 Multiple = Op0; return true; } } } } // We could not determine if V is a multiple of Base. return false; } Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS, const TargetLibraryInfo *TLI) { const Function *F = ICS.getCalledFunction(); if (!F) return Intrinsic::not_intrinsic; if (F->isIntrinsic()) return F->getIntrinsicID(); if (!TLI) return Intrinsic::not_intrinsic; LibFunc Func; // We're going to make assumptions on the semantics of the functions, check // that the target knows that it's available in this environment and it does // not have local linkage. 
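// Standalone illustration, using only standard C++ (not part of this patch):
// the identity behind ComputeMultiple's Shl case -- "Op0 << Op1" equals
// "Op0 * 2^Op1" modulo 2^N, which is why a shift by a constant can reuse the
// Mul handling. Checked for sampled 32-bit values, including wrapping ones.
#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x = 0; x < (1u << 20); x += 4999)
    for (unsigned s = 0; s < 31; ++s) {
      uint32_t lhs = static_cast<uint32_t>(x) << s;
      uint32_t rhs = static_cast<uint32_t>(x) * (1u << s);
      assert(lhs == rhs);                     // shl == mul by 2^s (mod 2^32)
    }
  return 0;
}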
if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(*F, Func)) return Intrinsic::not_intrinsic; if (!ICS.onlyReadsMemory()) return Intrinsic::not_intrinsic; // Otherwise check if we have a call to a function that can be turned into a // vector intrinsic. switch (Func) { default: break; case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl: return Intrinsic::sin; case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: return Intrinsic::cos; case LibFunc_exp: case LibFunc_expf: case LibFunc_expl: return Intrinsic::exp; case LibFunc_exp2: case LibFunc_exp2f: case LibFunc_exp2l: return Intrinsic::exp2; case LibFunc_log: case LibFunc_logf: case LibFunc_logl: return Intrinsic::log; case LibFunc_log10: case LibFunc_log10f: case LibFunc_log10l: return Intrinsic::log10; case LibFunc_log2: case LibFunc_log2f: case LibFunc_log2l: return Intrinsic::log2; case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl: return Intrinsic::fabs; case LibFunc_fmin: case LibFunc_fminf: case LibFunc_fminl: return Intrinsic::minnum; case LibFunc_fmax: case LibFunc_fmaxf: case LibFunc_fmaxl: return Intrinsic::maxnum; case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl: return Intrinsic::copysign; case LibFunc_floor: case LibFunc_floorf: case LibFunc_floorl: return Intrinsic::floor; case LibFunc_ceil: case LibFunc_ceilf: case LibFunc_ceill: return Intrinsic::ceil; case LibFunc_trunc: case LibFunc_truncf: case LibFunc_truncl: return Intrinsic::trunc; case LibFunc_rint: case LibFunc_rintf: case LibFunc_rintl: return Intrinsic::rint; case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_nearbyintl: return Intrinsic::nearbyint; case LibFunc_round: case LibFunc_roundf: case LibFunc_roundl: return Intrinsic::round; case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: return Intrinsic::pow; case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: return Intrinsic::sqrt; } return Intrinsic::not_intrinsic; } /// Return true if we can prove that the specified FP value is never equal to /// -0.0. /// /// NOTE: this function will need to be revisited when we support non-default /// rounding modes! bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI, unsigned Depth) { if (auto *CFP = dyn_cast(V)) return !CFP->getValueAPF().isNegZero(); // Limit search depth. if (Depth == MaxDepth) return false; auto *Op = dyn_cast(V); if (!Op) return false; // Check if the nsz fast-math flag is set. if (auto *FPO = dyn_cast(Op)) if (FPO->hasNoSignedZeros()) return true; // (fadd x, 0.0) is guaranteed to return +0.0, not -0.0. if (match(Op, m_FAdd(m_Value(), m_PosZeroFP()))) return true; // sitofp and uitofp turn into +0.0 for zero. if (isa(Op) || isa(Op)) return true; if (auto *Call = dyn_cast(Op)) { Intrinsic::ID IID = getIntrinsicForCallSite(Call, TLI); switch (IID) { default: break; // sqrt(-0.0) = -0.0, no other negative results are possible. case Intrinsic::sqrt: return CannotBeNegativeZero(Call->getArgOperand(0), TLI, Depth + 1); // fabs(x) != -0.0 case Intrinsic::fabs: return true; } } return false; } /// If \p SignBitOnly is true, test for a known 0 sign bit rather than a /// standard ordered compare. e.g. make -0.0 olt 0.0 be true because of the sign /// bit despite comparing equal. 
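// Standalone illustration, using only standard C++ (not part of this patch):
// the IEEE-754 facts CannotBeNegativeZero relies on, checked with the host's
// double arithmetic (default rounding mode assumed).
#include <cassert>
#include <cmath>

int main() {
  assert(!std::signbit(-0.0 + 0.0));                  // fadd x, +0.0 yields +0.0, never -0.0
  assert(!std::signbit(static_cast<double>(0)));      // sitofp/uitofp of 0 is +0.0
  assert(std::signbit(std::sqrt(-0.0)));              // sqrt(-0.0) == -0.0 ...
  assert(!std::signbit(std::sqrt(0.25)));             // ... and no other negative results
  assert(!std::signbit(std::fabs(-0.0)));             // fabs never returns -0.0
  return 0;
}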
static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
                                            const TargetLibraryInfo *TLI,
                                            bool SignBitOnly,
                                            unsigned Depth) {
  // TODO: This function does not do the right thing when SignBitOnly is true
  // and we're lowering to a hypothetical IEEE 754-compliant-but-evil platform
  // which flips the sign bits of NaNs. See
  // https://llvm.org/bugs/show_bug.cgi?id=31702.

  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
    return !CFP->getValueAPF().isNegative() ||
           (!SignBitOnly && CFP->getValueAPF().isZero());
  }

  // Handle vector of constants.
  if (auto *CV = dyn_cast<Constant>(V)) {
    if (CV->getType()->isVectorTy()) {
      unsigned NumElts = CV->getType()->getVectorNumElements();
      for (unsigned i = 0; i != NumElts; ++i) {
        auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
        if (!CFP)
          return false;
        if (CFP->getValueAPF().isNegative() &&
            (SignBitOnly || !CFP->getValueAPF().isZero()))
          return false;
      }

      // All non-negative ConstantFPs.
      return true;
    }
  }

  if (Depth == MaxDepth)
    return false; // Limit search depth.

  const Operator *I = dyn_cast<Operator>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  default:
    break;
  // Unsigned integers are always nonnegative.
  case Instruction::UIToFP:
    return true;
  case Instruction::FMul:
    // x*x is always non-negative or a NaN.
    if (I->getOperand(0) == I->getOperand(1) &&
        (!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs()))
      return true;

    LLVM_FALLTHROUGH;
  case Instruction::FAdd:
  case Instruction::FDiv:
  case Instruction::FRem:
    return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
                                           Depth + 1) &&
           cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
                                           Depth + 1);
  case Instruction::Select:
    return cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
                                           Depth + 1) &&
           cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly,
                                           Depth + 1);
  case Instruction::FPExt:
  case Instruction::FPTrunc:
    // Widening/narrowing never change sign.
    return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
                                           Depth + 1);
  case Instruction::ExtractElement:
    // Look through extract element. At the moment we keep this simple and skip
    // tracking the specific element. But at least we might find information
    // valid for all elements of the vector.
    return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
                                           Depth + 1);
  case Instruction::Call:
    const auto *CI = cast<CallInst>(I);
    Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
    switch (IID) {
    default:
      break;
    case Intrinsic::maxnum:
-      return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
-                                             Depth + 1) ||
-             cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
-                                             Depth + 1);
+      return (isKnownNeverNaN(I->getOperand(0)) &&
+              cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI,
+                                              SignBitOnly, Depth + 1)) ||
+             (isKnownNeverNaN(I->getOperand(1)) &&
+              cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI,
+                                              SignBitOnly, Depth + 1));
+
    case Intrinsic::minnum:
      return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
                                             Depth + 1) &&
             cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
                                             Depth + 1);
    case Intrinsic::exp:
    case Intrinsic::exp2:
    case Intrinsic::fabs:
      return true;

    case Intrinsic::sqrt:
      // sqrt(x) is always >= -0 or NaN. Moreover, sqrt(x) == -0 iff x == -0.
      if (!SignBitOnly)
        return true;
      return CI->hasNoNaNs() && (CI->hasNoSignedZeros() ||
                                 CannotBeNegativeZero(CI->getOperand(0), TLI));

    case Intrinsic::powi:
      if (ConstantInt *Exponent = dyn_cast<ConstantInt>(I->getOperand(1))) {
        // powi(x,n) is non-negative if n is even.
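// Standalone illustration, using only standard C++ (not part of this patch):
// why the maxnum case above now also requires the non-negative operand to be
// known non-NaN. libm's fmax follows the same "return the other operand on
// NaN" rule as llvm.maxnum, so a NaN operand -- which is never ordered less
// than zero -- can still let a negative value through.
#include <cassert>
#include <cmath>

int main() {
  double nan = std::nan("");
  // NaN is not ordered less than zero (every ordered comparison is false)...
  assert(!(nan < 0.0));
  // ...yet maxnum(NaN, -1.0) selects the non-NaN operand, a negative number.
  assert(std::fmax(nan, -1.0) == -1.0);
  // With a non-NaN, non-negative operand the non-negativity argument is sound:
  assert(std::fmax(0.5, -1.0) >= 0.0);
  assert(std::fmax(0.5, nan) >= 0.0);
  return 0;
}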
if (Exponent->getBitWidth() <= 64 && Exponent->getSExtValue() % 2u == 0) return true; } // TODO: This is not correct. Given that exp is an integer, here are the // ways that pow can return a negative value: // // pow(x, exp) --> negative if exp is odd and x is negative. // pow(-0, exp) --> -inf if exp is negative odd. // pow(-0, exp) --> -0 if exp is positive odd. // pow(-inf, exp) --> -0 if exp is negative odd. // pow(-inf, exp) --> -inf if exp is positive odd. // // Therefore, if !SignBitOnly, we can return true if x >= +0 or x is NaN, // but we must return false if x == -0. Unfortunately we do not currently // have a way of expressing this constraint. See details in // https://llvm.org/bugs/show_bug.cgi?id=31702. return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, Depth + 1); case Intrinsic::fma: case Intrinsic::fmuladd: // x*x+y is non-negative if y is non-negative. return I->getOperand(0) == I->getOperand(1) && (!SignBitOnly || cast(I)->hasNoNaNs()) && cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, Depth + 1); } break; } return false; } bool llvm::CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI) { return cannotBeOrderedLessThanZeroImpl(V, TLI, false, 0); } bool llvm::SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI) { return cannotBeOrderedLessThanZeroImpl(V, TLI, true, 0); } bool llvm::isKnownNeverNaN(const Value *V) { assert(V->getType()->isFPOrFPVectorTy() && "Querying for NaN on non-FP type"); // If we're told that NaNs won't happen, assume they won't. if (auto *FPMathOp = dyn_cast(V)) if (FPMathOp->hasNoNaNs()) return true; // TODO: Handle instructions and potentially recurse like other 'isKnown' // functions. For example, the result of sitofp is never NaN. // Handle scalar constants. if (auto *CFP = dyn_cast(V)) return !CFP->isNaN(); // Bail out for constant expressions, but try to handle vector constants. if (!V->getType()->isVectorTy() || !isa(V)) return false; // For vectors, verify that each element is not NaN. unsigned NumElts = V->getType()->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = cast(V)->getAggregateElement(i); if (!Elt) return false; if (isa(Elt)) continue; auto *CElt = dyn_cast(Elt); if (!CElt || CElt->isNaN()) return false; } // All elements were confirmed not-NaN or undefined. return true; } /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. This is /// true for all i8 values obviously, but is also true for i32 0, i32 -1, /// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated /// byte store (e.g. i16 0x1234), return null. Value *llvm::isBytewiseValue(Value *V) { // All byte-wide stores are splatable, even of arbitrary variables. if (V->getType()->isIntegerTy(8)) return V; // Handle 'null' ConstantArrayZero etc. if (Constant *C = dyn_cast(V)) if (C->isNullValue()) return Constant::getNullValue(Type::getInt8Ty(V->getContext())); // Constant float and double values can be handled as integer values if the // corresponding integer value is "byteable". An important case is 0.0. if (ConstantFP *CFP = dyn_cast(V)) { if (CFP->getType()->isFloatTy()) V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext())); if (CFP->getType()->isDoubleTy()) V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext())); // Don't handle long double formats, which have strange constraints. } // We can handle constant integers that are multiple of 8 bits. 
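// Standalone illustration, using only standard C++ (not part of this patch):
// the sign facts used for the powi and fma/fmuladd cases -- x*x is never
// negative (it may be NaN or +inf), an even exponent keeps powers non-negative,
// and x*x + y stays non-negative whenever y >= 0 and no NaNs are involved.
#include <cassert>
#include <cmath>

int main() {
  const double xs[] = {-1e300, -2.5, -0.0, 0.0, 1.0, 3.5, 1e300};
  const double ys[] = {0.0, 0.5, 2.0, 1e300};
  for (double x : xs) {
    assert(!(x * x < 0.0));            // x*x is non-negative (or NaN/inf)
    assert(std::pow(x, 4.0) >= 0.0);   // even exponent, as in powi(x, n) with n even
    for (double y : ys)
      assert(x * x + y >= 0.0);        // the fma/fmuladd pattern with y >= 0
  }
  return 0;
}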
if (ConstantInt *CI = dyn_cast(V)) { if (CI->getBitWidth() % 8 == 0) { assert(CI->getBitWidth() > 8 && "8 bits should be handled above!"); if (!CI->getValue().isSplat(8)) return nullptr; return ConstantInt::get(V->getContext(), CI->getValue().trunc(8)); } } // A ConstantDataArray/Vector is splatable if all its members are equal and // also splatable. if (ConstantDataSequential *CA = dyn_cast(V)) { Value *Elt = CA->getElementAsConstant(0); Value *Val = isBytewiseValue(Elt); if (!Val) return nullptr; for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I) if (CA->getElementAsConstant(I) != Elt) return nullptr; return Val; } // Conceptually, we could handle things like: // %a = zext i8 %X to i16 // %b = shl i16 %a, 8 // %c = or i16 %a, %b // but until there is an example that actually needs this, it doesn't seem // worth worrying about. return nullptr; } // This is the recursive version of BuildSubAggregate. It takes a few different // arguments. Idxs is the index within the nested struct From that we are // looking at now (which is of type IndexedType). IdxSkip is the number of // indices from Idxs that should be left out when inserting into the resulting // struct. To is the result struct built so far, new insertvalue instructions // build on that. static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType, SmallVectorImpl &Idxs, unsigned IdxSkip, Instruction *InsertBefore) { StructType *STy = dyn_cast(IndexedType); if (STy) { // Save the original To argument so we can modify it Value *OrigTo = To; // General case, the type indexed by Idxs is a struct for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { // Process each struct element recursively Idxs.push_back(i); Value *PrevTo = To; To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip, InsertBefore); Idxs.pop_back(); if (!To) { // Couldn't find any inserted value for this index? Cleanup while (PrevTo != OrigTo) { InsertValueInst* Del = cast(PrevTo); PrevTo = Del->getAggregateOperand(); Del->eraseFromParent(); } // Stop processing elements break; } } // If we successfully found a value for each of our subaggregates if (To) return To; } // Base case, the type indexed by SourceIdxs is not a struct, or not all of // the struct's elements had a value that was inserted directly. In the latter // case, perhaps we can't determine each of the subelements individually, but // we might be able to find the complete struct somewhere. // Find the value that is at that particular spot Value *V = FindInsertedValue(From, Idxs); if (!V) return nullptr; // Insert the value in the new (sub) aggregate return InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip), "tmp", InsertBefore); } // This helper takes a nested struct and extracts a part of it (which is again a // struct) into a new value. For example, given the struct: // { a, { b, { c, d }, e } } // and the indices "1, 1" this returns // { c, d }. // // It does this by inserting an insertvalue for each element in the resulting // struct, as opposed to just inserting a single struct. This will only work if // each of the elements of the substruct are known (ie, inserted into From by an // insertvalue instruction somewhere). 
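// Standalone illustration, using only standard C++ (not part of this patch):
// the "repeated byte" test behind isBytewiseValue, i.e. APInt::isSplat(8) for
// integers whose width is a multiple of 8 bits. isRepeatedByte is a
// hypothetical helper.
#include <cassert>
#include <cstdint>

// True if every one of the low `bytes` bytes of v equals the lowest byte.
static bool isRepeatedByte(uint64_t v, unsigned bytes) {
  uint8_t b = static_cast<uint8_t>(v);
  for (unsigned i = 0; i < bytes; ++i)
    if (static_cast<uint8_t>(v >> (8 * i)) != b)
      return false;
  return true;
}

int main() {
  assert(isRepeatedByte(0x00000000u, 4));   // i32 0      -> byte 0x00
  assert(isRepeatedByte(0xFFFFFFFFu, 4));   // i32 -1     -> byte 0xFF
  assert(isRepeatedByte(0xF0F0u, 2));       // i16 0xF0F0 -> byte 0xF0
  assert(!isRepeatedByte(0x1234u, 2));      // i16 0x1234 -> not splatable
  return 0;
}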
// // All inserted insertvalue instructions are inserted before InsertBefore static Value *BuildSubAggregate(Value *From, ArrayRef idx_range, Instruction *InsertBefore) { assert(InsertBefore && "Must have someplace to insert!"); Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(), idx_range); Value *To = UndefValue::get(IndexedType); SmallVector Idxs(idx_range.begin(), idx_range.end()); unsigned IdxSkip = Idxs.size(); return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore); } /// Given an aggregate and a sequence of indices, see if the scalar value /// indexed is already around as a register, for example if it was inserted /// directly into the aggregate. /// /// If InsertBefore is not null, this function will duplicate (modified) /// insertvalues when a part of a nested struct is extracted. Value *llvm::FindInsertedValue(Value *V, ArrayRef idx_range, Instruction *InsertBefore) { // Nothing to index? Just return V then (this is useful at the end of our // recursion). if (idx_range.empty()) return V; // We have indices, so V should have an indexable type. assert((V->getType()->isStructTy() || V->getType()->isArrayTy()) && "Not looking at a struct or array?"); assert(ExtractValueInst::getIndexedType(V->getType(), idx_range) && "Invalid indices for type?"); if (Constant *C = dyn_cast(V)) { C = C->getAggregateElement(idx_range[0]); if (!C) return nullptr; return FindInsertedValue(C, idx_range.slice(1), InsertBefore); } if (InsertValueInst *I = dyn_cast(V)) { // Loop the indices for the insertvalue instruction in parallel with the // requested indices const unsigned *req_idx = idx_range.begin(); for (const unsigned *i = I->idx_begin(), *e = I->idx_end(); i != e; ++i, ++req_idx) { if (req_idx == idx_range.end()) { // We can't handle this without inserting insertvalues if (!InsertBefore) return nullptr; // The requested index identifies a part of a nested aggregate. Handle // this specially. For example, // %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0 // %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1 // %C = extractvalue {i32, { i32, i32 } } %B, 1 // This can be changed into // %A = insertvalue {i32, i32 } undef, i32 10, 0 // %C = insertvalue {i32, i32 } %A, i32 11, 1 // which allows the unused 0,0 element from the nested struct to be // removed. return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx), InsertBefore); } // This insert value inserts something else than what we are looking for. // See if the (aggregate) value inserted into has the value we are // looking for, then. if (*req_idx != *i) return FindInsertedValue(I->getAggregateOperand(), idx_range, InsertBefore); } // If we end up here, the indices of the insertvalue match with those // requested (though possibly only partially). Now we recursively look at // the inserted value, passing any remaining indices. return FindInsertedValue(I->getInsertedValueOperand(), makeArrayRef(req_idx, idx_range.end()), InsertBefore); } if (ExtractValueInst *I = dyn_cast(V)) { // If we're extracting a value from an aggregate that was extracted from // something else, we can extract from that something else directly instead. // However, we will need to chain I's indices with the requested indices. 
// Calculate the number of indices required unsigned size = I->getNumIndices() + idx_range.size(); // Allocate some space to put the new indices in SmallVector Idxs; Idxs.reserve(size); // Add indices from the extract value instruction Idxs.append(I->idx_begin(), I->idx_end()); // Add requested indices Idxs.append(idx_range.begin(), idx_range.end()); assert(Idxs.size() == size && "Number of indices added not correct?"); return FindInsertedValue(I->getAggregateOperand(), Idxs, InsertBefore); } // Otherwise, we don't know (such as, extracting from a function return value // or load instruction) return nullptr; } /// Analyze the specified pointer to see if it can be expressed as a base /// pointer plus a constant offset. Return the base and offset to the caller. Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL) { unsigned BitWidth = DL.getIndexTypeSizeInBits(Ptr->getType()); APInt ByteOffset(BitWidth, 0); // We walk up the defs but use a visited set to handle unreachable code. In // that case, we stop after accumulating the cycle once (not that it // matters). SmallPtrSet Visited; while (Visited.insert(Ptr).second) { if (Ptr->getType()->isVectorTy()) break; if (GEPOperator *GEP = dyn_cast(Ptr)) { // If one of the values we have visited is an addrspacecast, then // the pointer type of this GEP may be different from the type // of the Ptr parameter which was passed to this function. This // means when we construct GEPOffset, we need to use the size // of GEP's pointer type rather than the size of the original // pointer type. APInt GEPOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; ByteOffset += GEPOffset.getSExtValue(); Ptr = GEP->getPointerOperand(); } else if (Operator::getOpcode(Ptr) == Instruction::BitCast || Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) { Ptr = cast(Ptr)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(Ptr)) { if (GA->isInterposable()) break; Ptr = GA->getAliasee(); } else { break; } } Offset = ByteOffset.getSExtValue(); return Ptr; } bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP, unsigned CharSize) { // Make sure the GEP has exactly three arguments. if (GEP->getNumOperands() != 3) return false; // Make sure the index-ee is a pointer to array of \p CharSize integers. // CharSize. ArrayType *AT = dyn_cast(GEP->getSourceElementType()); if (!AT || !AT->getElementType()->isIntegerTy(CharSize)) return false; // Check to make sure that the first operand of the GEP is an integer and // has value 0 so that we are sure we're indexing into the initializer. const ConstantInt *FirstIdx = dyn_cast(GEP->getOperand(1)); if (!FirstIdx || !FirstIdx->isZero()) return false; return true; } bool llvm::getConstantDataArrayInfo(const Value *V, ConstantDataArraySlice &Slice, unsigned ElementSize, uint64_t Offset) { assert(V); // Look through bitcast instructions and geps. V = V->stripPointerCasts(); // If the value is a GEP instruction or constant expression, treat it as an // offset. if (const GEPOperator *GEP = dyn_cast(V)) { // The GEP operator should be based on a pointer to string constant, and is // indexing into the string constant. if (!isGEPBasedOnPointerToString(GEP, ElementSize)) return false; // If the second index isn't a ConstantInt, then this is a variable index // into the array. If this occurs, we can't say anything meaningful about // the string. 
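// Standalone illustration, using only standard C++ (not part of this patch):
// the "base pointer plus constant byte offset" view that
// GetPointerBaseWithConstantOffset computes for GEP chains, spelled out with
// plain address arithmetic on a hypothetical struct.
#include <cassert>
#include <cstddef>
#include <cstdint>

struct S {
  int32_t header;
  int32_t data[8];
};

int main() {
  S s{};
  // &s.data[3] is the base pointer &s plus a compile-time constant offset.
  char *base = reinterpret_cast<char *>(&s);
  char *elem = reinterpret_cast<char *>(&s.data[3]);
  std::ptrdiff_t offset = elem - base;
  assert(offset ==
         static_cast<std::ptrdiff_t>(offsetof(S, data) + 3 * sizeof(int32_t)));
  return 0;
}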
uint64_t StartIdx = 0; if (const ConstantInt *CI = dyn_cast(GEP->getOperand(2))) StartIdx = CI->getZExtValue(); else return false; return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize, StartIdx + Offset); } // The GEP instruction, constant or instruction, must reference a global // variable that is a constant and is initialized. The referenced constant // initializer is the array that we'll use for optimization. const GlobalVariable *GV = dyn_cast(V); if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) return false; const ConstantDataArray *Array; ArrayType *ArrayTy; if (GV->getInitializer()->isNullValue()) { Type *GVTy = GV->getValueType(); if ( (ArrayTy = dyn_cast(GVTy)) ) { // A zeroinitializer for the array; there is no ConstantDataArray. Array = nullptr; } else { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy); uint64_t Length = SizeInBytes / (ElementSize / 8); if (Length <= Offset) return false; Slice.Array = nullptr; Slice.Offset = 0; Slice.Length = Length - Offset; return true; } } else { // This must be a ConstantDataArray. Array = dyn_cast(GV->getInitializer()); if (!Array) return false; ArrayTy = Array->getType(); } if (!ArrayTy->getElementType()->isIntegerTy(ElementSize)) return false; uint64_t NumElts = ArrayTy->getArrayNumElements(); if (Offset > NumElts) return false; Slice.Array = Array; Slice.Offset = Offset; Slice.Length = NumElts - Offset; return true; } /// This function computes the length of a null-terminated C string pointed to /// by V. If successful, it returns true and returns the string in Str. /// If unsuccessful, it returns false. bool llvm::getConstantStringInfo(const Value *V, StringRef &Str, uint64_t Offset, bool TrimAtNul) { ConstantDataArraySlice Slice; if (!getConstantDataArrayInfo(V, Slice, 8, Offset)) return false; if (Slice.Array == nullptr) { if (TrimAtNul) { Str = StringRef(); return true; } if (Slice.Length == 1) { Str = StringRef("", 1); return true; } // We cannot instantiate a StringRef as we do not have an appropriate string // of 0s at hand. return false; } // Start out with the entire array in the StringRef. Str = Slice.Array->getAsString(); // Skip over 'offset' bytes. Str = Str.substr(Slice.Offset); if (TrimAtNul) { // Trim off the \0 and anything after it. If the array is not nul // terminated, we just return the whole end of string. The client may know // some other way that the string is length-bound. Str = Str.substr(0, Str.find('\0')); } return true; } // These next two are very similar to the above, but also look through PHI // nodes. // TODO: See if we can integrate these two together. /// If we can compute the length of the string pointed to by /// the specified pointer, return 'len+1'. If we can't, return 0. static uint64_t GetStringLengthH(const Value *V, SmallPtrSetImpl &PHIs, unsigned CharSize) { // Look through noop bitcast instructions. V = V->stripPointerCasts(); // If this is a PHI node, there are two cases: either we have already seen it // or we haven't. if (const PHINode *PN = dyn_cast(V)) { if (!PHIs.insert(PN).second) return ~0ULL; // already in the set. // If it was new, see if all the input strings are the same length. uint64_t LenSoFar = ~0ULL; for (Value *IncValue : PN->incoming_values()) { uint64_t Len = GetStringLengthH(IncValue, PHIs, CharSize); if (Len == 0) return 0; // Unknown length -> unknown. if (Len == ~0ULL) continue; if (Len != LenSoFar && LenSoFar != ~0ULL) return 0; // Disagree -> unknown. 
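// Standalone illustration, using only standard C++ (not part of this patch):
// what getConstantStringInfo produces for a constant array -- skip `Offset`
// bytes, then (with TrimAtNul) drop the first NUL and everything after it.
#include <cassert>
#include <string>

int main() {
  // A constant initializer holding "hi\0!!" (an explicit NUL in the middle).
  std::string init("hi\0!!", 5);
  std::size_t offset = 0;
  std::string str = init.substr(offset);   // skip over 'offset' bytes
  str = str.substr(0, str.find('\0'));     // TrimAtNul: trim at the first NUL
  assert(str == "hi");
  return 0;
}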
LenSoFar = Len; } // Success, all agree. return LenSoFar; } // strlen(select(c,x,y)) -> strlen(x) ^ strlen(y) if (const SelectInst *SI = dyn_cast(V)) { uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs, CharSize); if (Len1 == 0) return 0; uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs, CharSize); if (Len2 == 0) return 0; if (Len1 == ~0ULL) return Len2; if (Len2 == ~0ULL) return Len1; if (Len1 != Len2) return 0; return Len1; } // Otherwise, see if we can read the string. ConstantDataArraySlice Slice; if (!getConstantDataArrayInfo(V, Slice, CharSize)) return 0; if (Slice.Array == nullptr) return 1; // Search for nul characters unsigned NullIndex = 0; for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) { if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0) break; } return NullIndex + 1; } /// If we can compute the length of the string pointed to by /// the specified pointer, return 'len+1'. If we can't, return 0. uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) { if (!V->getType()->isPointerTy()) return 0; SmallPtrSet PHIs; uint64_t Len = GetStringLengthH(V, PHIs, CharSize); // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return // an empty string as a length. return Len == ~0ULL ? 1 : Len; } const Value *llvm::getArgumentAliasingToReturnedPointer(ImmutableCallSite CS) { assert(CS && "getArgumentAliasingToReturnedPointer only works on nonnull CallSite"); if (const Value *RV = CS.getReturnedArgOperand()) return RV; // This can be used only as a aliasing property. if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CS)) return CS.getArgOperand(0); return nullptr; } bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( ImmutableCallSite CS) { return CS.getIntrinsicID() == Intrinsic::launder_invariant_group || CS.getIntrinsicID() == Intrinsic::strip_invariant_group; } /// \p PN defines a loop-variant pointer to an object. Check if the /// previous iteration of the loop was referring to the same object as \p PN. static bool isSameUnderlyingObjectInLoop(const PHINode *PN, const LoopInfo *LI) { // Find the loop-defined value. Loop *L = LI->getLoopFor(PN->getParent()); if (PN->getNumIncomingValues() != 2) return true; // Find the value from previous iteration. auto *PrevValue = dyn_cast(PN->getIncomingValue(0)); if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L) PrevValue = dyn_cast(PN->getIncomingValue(1)); if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L) return true; // If a new pointer is loaded in the loop, the pointer references a different // object in every iteration. E.g.: // for (i) // int *p = a[i]; // ... if (auto *Load = dyn_cast(PrevValue)) if (!L->isLoopInvariant(Load->getPointerOperand())) return false; return true; } Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup) { if (!V->getType()->isPointerTy()) return V; for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) { if (GEPOperator *GEP = dyn_cast(V)) { V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast || Operator::getOpcode(V) == Instruction::AddrSpaceCast) { V = cast(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->isInterposable()) return V; V = GA->getAliasee(); } else if (isa(V)) { // An alloca can't be further simplified. 
return V; } else { if (auto CS = CallSite(V)) { // CaptureTracking can know about special capturing properties of some // intrinsics like launder.invariant.group, that can't be expressed with // the attributes, but have properties like returning aliasing pointer. // Because some analysis may assume that nocaptured pointer is not // returned from some special intrinsic (because function would have to // be marked with returns attribute), it is crucial to use this function // because it should be in sync with CaptureTracking. Not using it may // cause weird miscompilations where 2 aliasing pointers are assumed to // noalias. if (auto *RP = getArgumentAliasingToReturnedPointer(CS)) { V = RP; continue; } } // See if InstructionSimplify knows any relevant tricks. if (Instruction *I = dyn_cast(V)) // TODO: Acquire a DominatorTree and AssumptionCache and use them. if (Value *Simplified = SimplifyInstruction(I, {DL, I})) { V = Simplified; continue; } return V; } assert(V->getType()->isPointerTy() && "Unexpected operand type!"); } return V; } void llvm::GetUnderlyingObjects(Value *V, SmallVectorImpl &Objects, const DataLayout &DL, LoopInfo *LI, unsigned MaxLookup) { SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(V); do { Value *P = Worklist.pop_back_val(); P = GetUnderlyingObject(P, DL, MaxLookup); if (!Visited.insert(P).second) continue; if (SelectInst *SI = dyn_cast(P)) { Worklist.push_back(SI->getTrueValue()); Worklist.push_back(SI->getFalseValue()); continue; } if (PHINode *PN = dyn_cast(P)) { // If this PHI changes the underlying object in every iteration of the // loop, don't look through it. Consider: // int **A; // for (i) { // Prev = Curr; // Prev = PHI (Prev_0, Curr) // Curr = A[i]; // *Prev, *Curr; // // Prev is tracking Curr one iteration behind so they refer to different // underlying objects. if (!LI || !LI->isLoopHeader(PN->getParent()) || isSameUnderlyingObjectInLoop(PN, LI)) for (Value *IncValue : PN->incoming_values()) Worklist.push_back(IncValue); continue; } Objects.push_back(P); } while (!Worklist.empty()); } /// This is the function that does the work of looking through basic /// ptrtoint+arithmetic+inttoptr sequences. static const Value *getUnderlyingObjectFromInt(const Value *V) { do { if (const Operator *U = dyn_cast(V)) { // If we find a ptrtoint, we can transfer control back to the // regular getUnderlyingObjectFromInt. if (U->getOpcode() == Instruction::PtrToInt) return U->getOperand(0); // If we find an add of a constant, a multiplied value, or a phi, it's // likely that the other operand will lead us to the base // object. We don't have to worry about the case where the // object address is somehow being computed by the multiply, // because our callers only care when the result is an // identifiable object. if (U->getOpcode() != Instruction::Add || (!isa(U->getOperand(1)) && Operator::getOpcode(U->getOperand(1)) != Instruction::Mul && !isa(U->getOperand(1)))) return V; V = U->getOperand(0); } else { return V; } assert(V->getType()->isIntegerTy() && "Unexpected operand type!"); } while (true); } /// This is a wrapper around GetUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. /// It returns false if unidentified object is found in GetUnderlyingObjects. 
bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, SmallVectorImpl &Objects, const DataLayout &DL) { SmallPtrSet Visited; SmallVector Working(1, V); do { V = Working.pop_back_val(); SmallVector Objs; GetUnderlyingObjects(const_cast(V), Objs, DL); for (Value *V : Objs) { if (!Visited.insert(V).second) continue; if (Operator::getOpcode(V) == Instruction::IntToPtr) { const Value *O = getUnderlyingObjectFromInt(cast(V)->getOperand(0)); if (O->getType()->isPointerTy()) { Working.push_back(O); continue; } } // If GetUnderlyingObjects fails to find an identifiable object, // getUnderlyingObjectsForCodeGen also fails for safety. if (!isIdentifiedObject(V)) { Objects.clear(); return false; } Objects.push_back(const_cast(V)); } } while (!Working.empty()); return true; } /// Return true if the only users of this pointer are lifetime markers. bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { for (const User *U : V->users()) { const IntrinsicInst *II = dyn_cast(U); if (!II) return false; if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } return true; } bool llvm::isSafeToSpeculativelyExecute(const Value *V, const Instruction *CtxI, const DominatorTree *DT) { const Operator *Inst = dyn_cast(V); if (!Inst) return false; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) if (Constant *C = dyn_cast(Inst->getOperand(i))) if (C->canTrap()) return false; switch (Inst->getOpcode()) { default: return true; case Instruction::UDiv: case Instruction::URem: { // x / y is undefined if y == 0. const APInt *V; if (match(Inst->getOperand(1), m_APInt(V))) return *V != 0; return false; } case Instruction::SDiv: case Instruction::SRem: { // x / y is undefined if y == 0 or x == INT_MIN and y == -1 const APInt *Numerator, *Denominator; if (!match(Inst->getOperand(1), m_APInt(Denominator))) return false; // We cannot hoist this division if the denominator is 0. if (*Denominator == 0) return false; // It's safe to hoist if the denominator is not 0 or -1. if (*Denominator != -1) return true; // At this point we know that the denominator is -1. It is safe to hoist as // long we know that the numerator is not INT_MIN. if (match(Inst->getOperand(0), m_APInt(Numerator))) return !Numerator->isMinSignedValue(); // The numerator *might* be MinSignedValue. return false; } case Instruction::Load: { const LoadInst *LI = cast(Inst); if (!LI->isUnordered() || // Speculative load may create a race that did not exist in the source. LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) || // Speculative load may load data from dirty regions. LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || LI->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); return isDereferenceableAndAlignedPointer(LI->getPointerOperand(), LI->getAlignment(), DL, CtxI, DT); } case Instruction::Call: { auto *CI = cast(Inst); const Function *Callee = CI->getCalledFunction(); // The called function could have undefined behavior or side-effects, even // if marked readnone nounwind. 
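// Standalone illustration, using only standard C++ (not part of this patch):
// the SDiv/SRem speculation condition above -- the division is safe to hoist
// exactly when the denominator is known non-zero and either it is not -1 or
// the numerator is known not to be INT_MIN. safeToSpeculateSDiv is a
// hypothetical helper stating that condition on concrete values.
#include <cassert>
#include <cstdint>
#include <limits>

static bool safeToSpeculateSDiv(int32_t num, int32_t den) {
  if (den == 0)
    return false;                                     // x / 0 is undefined
  if (den == -1 && num == std::numeric_limits<int32_t>::min())
    return false;                                     // INT_MIN / -1 overflows
  return true;
}

int main() {
  assert(!safeToSpeculateSDiv(7, 0));
  assert(!safeToSpeculateSDiv(std::numeric_limits<int32_t>::min(), -1));
  assert(safeToSpeculateSDiv(std::numeric_limits<int32_t>::min(), -2));
  assert(safeToSpeculateSDiv(-7, -1));
  return 0;
}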
return Callee && Callee->isSpeculatable(); } case Instruction::VAArg: case Instruction::Alloca: case Instruction::Invoke: case Instruction::PHI: case Instruction::Store: case Instruction::Ret: case Instruction::Br: case Instruction::IndirectBr: case Instruction::Switch: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: case Instruction::AtomicCmpXchg: case Instruction::LandingPad: case Instruction::Resume: case Instruction::CatchSwitch: case Instruction::CatchPad: case Instruction::CatchRet: case Instruction::CleanupPad: case Instruction::CleanupRet: return false; // Misc instructions which have effects } } bool llvm::mayBeMemoryDependent(const Instruction &I) { return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I); } OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { // Multiplying n * m significant bits yields a result of n + m significant // bits. If the total number of significant bits does not exceed the // result bit width (minus 1), there is no overflow. // This means if we have enough leading zero bits in the operands // we can guarantee that the result does not overflow. // Ref: "Hacker's Delight" by Henry Warren unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); KnownBits LHSKnown(BitWidth); KnownBits RHSKnown(BitWidth); computeKnownBits(LHS, LHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); // Note that underestimating the number of zero bits gives a more // conservative answer. unsigned ZeroBits = LHSKnown.countMinLeadingZeros() + RHSKnown.countMinLeadingZeros(); // First handle the easy case: if we have enough zero bits there's // definitely no overflow. if (ZeroBits >= BitWidth) return OverflowResult::NeverOverflows; // Get the largest possible values for each operand. APInt LHSMax = ~LHSKnown.Zero; APInt RHSMax = ~RHSKnown.Zero; // We know the multiply operation doesn't overflow if the maximum values for // each operand will not overflow after we multiply them together. bool MaxOverflow; (void)LHSMax.umul_ov(RHSMax, MaxOverflow); if (!MaxOverflow) return OverflowResult::NeverOverflows; // We know it always overflows if multiplying the smallest possible values for // the operands also results in overflow. bool MinOverflow; (void)LHSKnown.One.umul_ov(RHSKnown.One, MinOverflow); if (MinOverflow) return OverflowResult::AlwaysOverflows; return OverflowResult::MayOverflow; } OverflowResult llvm::computeOverflowForSignedMul(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { // Multiplying n * m significant bits yields a result of n + m significant // bits. If the total number of significant bits does not exceed the // result bit width (minus 1), there is no overflow. // This means if we have enough leading sign bits in the operands // we can guarantee that the result does not overflow. // Ref: "Hacker's Delight" by Henry Warren unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); // Note that underestimating the number of sign bits gives a more // conservative answer. unsigned SignBits = ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) + ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT); // First handle the easy case: if we have enough sign bits there's // definitely no overflow. 
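// Standalone illustration, using only standard C++ (not part of this patch):
// the leading-zeros argument from "Hacker's Delight" used by
// computeOverflowForUnsignedMul -- if the operands together have at least
// BitWidth leading zero bits, the product cannot wrap. Checked exhaustively at
// 8 bits; clz8 is a hypothetical stand-in for countMinLeadingZeros.
#include <cassert>
#include <cstdint>

static unsigned clz8(uint8_t v) {
  unsigned n = 0;
  for (int bit = 7; bit >= 0 && !((v >> bit) & 1); --bit)
    ++n;
  return n;                 // 8 for v == 0
}

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      if (clz8(a) + clz8(b) >= 8)
        assert(a * b < 256);           // the i8 multiply cannot overflow
  return 0;
}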
if (SignBits > BitWidth + 1) return OverflowResult::NeverOverflows; // There are two ambiguous cases where there can be no overflow: // SignBits == BitWidth + 1 and // SignBits == BitWidth // The second case is difficult to check, therefore we only handle the // first case. if (SignBits == BitWidth + 1) { // It overflows only when both arguments are negative and the true // product is exactly the minimum negative number. // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000 // For simplicity we just check if at least one side is not negative. KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) return OverflowResult::NeverOverflows; } return OverflowResult::MayOverflow; } OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) { KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); if (LHSKnown.isNegative() && RHSKnown.isNegative()) { // The sign bit is set in both cases: this MUST overflow. // Create a simple add instruction, and insert it into the struct. return OverflowResult::AlwaysOverflows; } if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) { // The sign bit is clear in both cases: this CANNOT overflow. // Create a simple add instruction, and insert it into the struct. return OverflowResult::NeverOverflows; } } return OverflowResult::MayOverflow; } /// Return true if we can prove that adding the two values of the /// knownbits will not overflow. /// Otherwise return false. static bool checkRippleForSignedAdd(const KnownBits &LHSKnown, const KnownBits &RHSKnown) { // Addition of two 2's complement numbers having opposite signs will never // overflow. if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) || (LHSKnown.isNonNegative() && RHSKnown.isNegative())) return true; // If either of the values is known to be non-negative, adding them can only // overflow if the second is also non-negative, so we can assume that. // Two non-negative numbers will only overflow if there is a carry to the // sign bit, so we can check if even when the values are as big as possible // there is no overflow to the sign bit. if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) { APInt MaxLHS = ~LHSKnown.Zero; MaxLHS.clearSignBit(); APInt MaxRHS = ~RHSKnown.Zero; MaxRHS.clearSignBit(); APInt Result = std::move(MaxLHS) + std::move(MaxRHS); return Result.isSignBitClear(); } // If either of the values is known to be negative, adding them can only // overflow if the second is also negative, so we can assume that. // Two negative number will only overflow if there is no carry to the sign // bit, so we can check if even when the values are as small as possible // there is overflow to the sign bit. if (LHSKnown.isNegative() || RHSKnown.isNegative()) { APInt MinLHS = LHSKnown.One; MinLHS.clearSignBit(); APInt MinRHS = RHSKnown.One; MinRHS.clearSignBit(); APInt Result = std::move(MinLHS) + std::move(MinRHS); return Result.isSignBitSet(); } // If we reached here it means that we know nothing about the sign bits. // In this case we can't know if there will be an overflow, since by // changing the sign bits any two values can be made to overflow. 
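// Standalone illustration, using only standard C++ (not part of this patch):
// the computeOverflowForUnsignedAdd rule -- when the top bit of both operands
// is known, the carry out of an unsigned add is fully determined. Checked
// exhaustively at 8 bits.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      bool overflows = (a + b) > 0xFF;
      if ((a & 0x80) && (b & 0x80))
        assert(overflows);        // both sign bits set: the add MUST wrap
      if (!(a & 0x80) && !(b & 0x80))
        assert(!overflows);       // both sign bits clear: the add CANNOT wrap
    }
  return 0;
}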
return false; } static OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, const AddOperator *Add, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { if (Add && Add->hasNoSignedWrap()) { return OverflowResult::NeverOverflows; } // If LHS and RHS each have at least two sign bits, the addition will look // like // // XX..... + // YY..... // // If the carry into the most significant position is 0, X and Y can't both // be 1 and therefore the carry out of the addition is also 0. // // If the carry into the most significant position is 1, X and Y can't both // be 0 and therefore the carry out of the addition is also 1. // // Since the carry into the most significant position is always equal to // the carry out of the addition, there is no signed overflow. if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 && ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1) return OverflowResult::NeverOverflows; KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); if (checkRippleForSignedAdd(LHSKnown, RHSKnown)) return OverflowResult::NeverOverflows; // The remaining code needs Add to be available. Early returns if not so. if (!Add) return OverflowResult::MayOverflow; // If the sign of Add is the same as at least one of the operands, this add // CANNOT overflow. This is particularly useful when the sum is // @llvm.assume'ed non-negative rather than proved so from analyzing its // operands. bool LHSOrRHSKnownNonNegative = (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()); bool LHSOrRHSKnownNegative = (LHSKnown.isNegative() || RHSKnown.isNegative()); if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) { KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT); if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) || (AddKnown.isNegative() && LHSOrRHSKnownNegative)) { return OverflowResult::NeverOverflows; } } return OverflowResult::MayOverflow; } OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { // If the LHS is negative and the RHS is non-negative, no unsigned wrap. KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); if (LHSKnown.isNegative() && RHSKnown.isNonNegative()) return OverflowResult::NeverOverflows; return OverflowResult::MayOverflow; } OverflowResult llvm::computeOverflowForSignedSub(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { // If LHS and RHS each have at least two sign bits, the subtraction // cannot overflow. if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 && ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1) return OverflowResult::NeverOverflows; KnownBits LHSKnown = computeKnownBits(LHS, DL, 0, AC, CxtI, DT); KnownBits RHSKnown = computeKnownBits(RHS, DL, 0, AC, CxtI, DT); // Subtraction of two 2's complement numbers having identical signs will // never overflow. 
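// Standalone illustration, using only standard C++ (not part of this patch):
// the sign-bit argument in computeOverflowForSignedAdd -- if both operands
// have at least two sign bits, the signed add cannot overflow. Checked
// exhaustively at 8 bits; numSignBits is a hypothetical helper.
#include <cassert>
#include <cstdint>

static unsigned numSignBits(int8_t v) {
  uint8_t u = static_cast<uint8_t>(v), sign = u >> 7;
  unsigned n = 0;
  for (int bit = 7; bit >= 0 && ((u >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

int main() {
  for (int a = -128; a <= 127; ++a)
    for (int b = -128; b <= 127; ++b)
      if (numSignBits(static_cast<int8_t>(a)) > 1 &&
          numSignBits(static_cast<int8_t>(b)) > 1) {
        int sum = a + b;                      // exact, computed in a wider type
        assert(sum >= -128 && sum <= 127);    // fits i8: no signed overflow
      }
  return 0;
}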
if ((LHSKnown.isNegative() && RHSKnown.isNegative()) || (LHSKnown.isNonNegative() && RHSKnown.isNonNegative())) return OverflowResult::NeverOverflows; // TODO: implement logic similar to checkRippleForAdd return OverflowResult::MayOverflow; } bool llvm::isOverflowIntrinsicNoWrap(const IntrinsicInst *II, const DominatorTree &DT) { #ifndef NDEBUG auto IID = II->getIntrinsicID(); assert((IID == Intrinsic::sadd_with_overflow || IID == Intrinsic::uadd_with_overflow || IID == Intrinsic::ssub_with_overflow || IID == Intrinsic::usub_with_overflow || IID == Intrinsic::smul_with_overflow || IID == Intrinsic::umul_with_overflow) && "Not an overflow intrinsic!"); #endif SmallVector GuardingBranches; SmallVector Results; for (const User *U : II->users()) { if (const auto *EVI = dyn_cast(U)) { assert(EVI->getNumIndices() == 1 && "Obvious from CI's type"); if (EVI->getIndices()[0] == 0) Results.push_back(EVI); else { assert(EVI->getIndices()[0] == 1 && "Obvious from CI's type"); for (const auto *U : EVI->users()) if (const auto *B = dyn_cast(U)) { assert(B->isConditional() && "How else is it using an i1?"); GuardingBranches.push_back(B); } } } else { // We are using the aggregate directly in a way we don't want to analyze // here (storing it to a global, say). return false; } } auto AllUsesGuardedByBranch = [&](const BranchInst *BI) { BasicBlockEdge NoWrapEdge(BI->getParent(), BI->getSuccessor(1)); if (!NoWrapEdge.isSingleEdge()) return false; // Check if all users of the add are provably no-wrap. for (const auto *Result : Results) { // If the extractvalue itself is not executed on overflow, the we don't // need to check each use separately, since domination is transitive. if (DT.dominates(NoWrapEdge, Result->getParent())) continue; for (auto &RU : Result->uses()) if (!DT.dominates(NoWrapEdge, RU)) return false; } return true; }; return llvm::any_of(GuardingBranches, AllUsesGuardedByBranch); } OverflowResult llvm::computeOverflowForSignedAdd(const AddOperator *Add, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::computeOverflowForSignedAdd(Add->getOperand(0), Add->getOperand(1), Add, DL, AC, CxtI, DT); } OverflowResult llvm::computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::computeOverflowForSignedAdd(LHS, RHS, nullptr, DL, AC, CxtI, DT); } bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) { // A memory operation returns normally if it isn't volatile. A volatile // operation is allowed to trap. // // An atomic operation isn't guaranteed to return in a reasonable amount of // time because it's possible for another thread to interfere with it for an // arbitrary length of time, but programs aren't allowed to rely on that. if (const LoadInst *LI = dyn_cast(I)) return !LI->isVolatile(); if (const StoreInst *SI = dyn_cast(I)) return !SI->isVolatile(); if (const AtomicCmpXchgInst *CXI = dyn_cast(I)) return !CXI->isVolatile(); if (const AtomicRMWInst *RMWI = dyn_cast(I)) return !RMWI->isVolatile(); if (const MemIntrinsic *MII = dyn_cast(I)) return !MII->isVolatile(); // If there is no successor, then execution can't transfer to it. 
if (const auto *CRI = dyn_cast(I)) return !CRI->unwindsToCaller(); if (const auto *CatchSwitch = dyn_cast(I)) return !CatchSwitch->unwindsToCaller(); if (isa(I)) return false; if (isa(I)) return false; if (isa(I)) return false; // Calls can throw, or contain an infinite loop, or kill the process. if (auto CS = ImmutableCallSite(I)) { // Call sites that throw have implicit non-local control flow. if (!CS.doesNotThrow()) return false; // Non-throwing call sites can loop infinitely, call exit/pthread_exit // etc. and thus not return. However, LLVM already assumes that // // - Thread exiting actions are modeled as writes to memory invisible to // the program. // // - Loops that don't have side effects (side effects are volatile/atomic // stores and IO) always terminate (see http://llvm.org/PR965). // Furthermore IO itself is also modeled as writes to memory invisible to // the program. // // We rely on those assumptions here, and use the memory effects of the call // target as a proxy for checking that it always returns. // FIXME: This isn't aggressive enough; a call which only writes to a global // is guaranteed to return. return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory() || match(I, m_Intrinsic()) || match(I, m_Intrinsic()); } // Other instructions return normally. return true; } bool llvm::isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB) { // TODO: This is slightly consdervative for invoke instruction since exiting // via an exception *is* normal control for them. for (auto I = BB->begin(), E = BB->end(); I != E; ++I) if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) return false; return true; } bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I, const Loop *L) { // The loop header is guaranteed to be executed for every iteration. // // FIXME: Relax this constraint to cover all basic blocks that are // guaranteed to be executed at every iteration. if (I->getParent() != L->getHeader()) return false; for (const Instruction &LI : *L->getHeader()) { if (&LI == I) return true; if (!isGuaranteedToTransferExecutionToSuccessor(&LI)) return false; } llvm_unreachable("Instruction not contained in its own parent basic block."); } bool llvm::propagatesFullPoison(const Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Xor: case Instruction::Trunc: case Instruction::BitCast: case Instruction::AddrSpaceCast: case Instruction::Mul: case Instruction::Shl: case Instruction::GetElementPtr: // These operations all propagate poison unconditionally. Note that poison // is not any particular value, so xor or subtraction of poison with // itself still yields poison, not zero. return true; case Instruction::AShr: case Instruction::SExt: // For these operations, one bit of the input is replicated across // multiple output bits. A replicated poison bit is still poison. return true; case Instruction::ICmp: // Comparing poison with any value yields poison. This is why, for // instance, x s< (x +nsw 1) can be folded to true. 
return true; default: return false; } } const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) { switch (I->getOpcode()) { case Instruction::Store: return cast(I)->getPointerOperand(); case Instruction::Load: return cast(I)->getPointerOperand(); case Instruction::AtomicCmpXchg: return cast(I)->getPointerOperand(); case Instruction::AtomicRMW: return cast(I)->getPointerOperand(); case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: return I->getOperand(1); default: return nullptr; } } bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) { // We currently only look for uses of poison values within the same basic // block, as that makes it easier to guarantee that the uses will be // executed given that PoisonI is executed. // // FIXME: Expand this to consider uses beyond the same basic block. To do // this, look out for the distinction between post-dominance and strong // post-dominance. const BasicBlock *BB = PoisonI->getParent(); // Set of instructions that we have proved will yield poison if PoisonI // does. SmallSet YieldsPoison; SmallSet Visited; YieldsPoison.insert(PoisonI); Visited.insert(PoisonI->getParent()); BasicBlock::const_iterator Begin = PoisonI->getIterator(), End = BB->end(); unsigned Iter = 0; while (Iter++ < MaxDepth) { for (auto &I : make_range(Begin, End)) { if (&I != PoisonI) { const Value *NotPoison = getGuaranteedNonFullPoisonOp(&I); if (NotPoison != nullptr && YieldsPoison.count(NotPoison)) return true; if (!isGuaranteedToTransferExecutionToSuccessor(&I)) return false; } // Mark poison that propagates from I through uses of I. if (YieldsPoison.count(&I)) { for (const User *User : I.users()) { const Instruction *UserI = cast(User); if (propagatesFullPoison(UserI)) YieldsPoison.insert(User); } } } if (auto *NextBB = BB->getSingleSuccessor()) { if (Visited.insert(NextBB).second) { BB = NextBB; Begin = BB->getFirstNonPHI()->getIterator(); End = BB->end(); continue; } } break; } return false; } static bool isKnownNonNaN(const Value *V, FastMathFlags FMF) { if (FMF.noNaNs()) return true; if (auto *C = dyn_cast(V)) return !C->isNaN(); return false; } static bool isKnownNonZero(const Value *V) { if (auto *C = dyn_cast(V)) return !C->isZero(); return false; } /// Match clamp pattern for float types without care about NaNs or signed zeros. /// Given non-min/max outer cmp/select from the clamp pattern this /// function recognizes if it can be substitued by a "canonical" min/max /// pattern. static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, Value *&LHS, Value *&RHS) { // Try to match // X < C1 ? C1 : Min(X, C2) --> Max(C1, Min(X, C2)) // X > C1 ? C1 : Max(X, C2) --> Min(C1, Max(X, C2)) // and return description of the outer Max/Min. // First, check if select has inverse order: if (CmpRHS == FalseVal) { std::swap(TrueVal, FalseVal); Pred = CmpInst::getInversePredicate(Pred); } // Assume success now. If there's no match, callers should not use these anyway. 
LHS = TrueVal; RHS = FalseVal; const APFloat *FC1; if (CmpRHS != TrueVal || !match(CmpRHS, m_APFloat(FC1)) || !FC1->isFinite()) return {SPF_UNKNOWN, SPNB_NA, false}; const APFloat *FC2; switch (Pred) { case CmpInst::FCMP_OLT: case CmpInst::FCMP_OLE: case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: if (match(FalseVal, m_CombineOr(m_OrdFMin(m_Specific(CmpLHS), m_APFloat(FC2)), m_UnordFMin(m_Specific(CmpLHS), m_APFloat(FC2)))) && FC1->compare(*FC2) == APFloat::cmpResult::cmpLessThan) return {SPF_FMAXNUM, SPNB_RETURNS_ANY, false}; break; case CmpInst::FCMP_OGT: case CmpInst::FCMP_OGE: case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE: if (match(FalseVal, m_CombineOr(m_OrdFMax(m_Specific(CmpLHS), m_APFloat(FC2)), m_UnordFMax(m_Specific(CmpLHS), m_APFloat(FC2)))) && FC1->compare(*FC2) == APFloat::cmpResult::cmpGreaterThan) return {SPF_FMINNUM, SPNB_RETURNS_ANY, false}; break; default: break; } return {SPF_UNKNOWN, SPNB_NA, false}; } /// Recognize variations of: /// CLAMP(v,l,h) ==> ((v) < (l) ? (l) : ((v) > (h) ? (h) : (v))) static SelectPatternResult matchClamp(CmpInst::Predicate Pred, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal) { // Swap the select operands and predicate to match the patterns below. if (CmpRHS != TrueVal) { Pred = ICmpInst::getSwappedPredicate(Pred); std::swap(TrueVal, FalseVal); } const APInt *C1; if (CmpRHS == TrueVal && match(CmpRHS, m_APInt(C1))) { const APInt *C2; // (X SMAX(SMIN(X, C2), C1) if (match(FalseVal, m_SMin(m_Specific(CmpLHS), m_APInt(C2))) && C1->slt(*C2) && Pred == CmpInst::ICMP_SLT) return {SPF_SMAX, SPNB_NA, false}; // (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1) if (match(FalseVal, m_SMax(m_Specific(CmpLHS), m_APInt(C2))) && C1->sgt(*C2) && Pred == CmpInst::ICMP_SGT) return {SPF_SMIN, SPNB_NA, false}; // (X UMAX(UMIN(X, C2), C1) if (match(FalseVal, m_UMin(m_Specific(CmpLHS), m_APInt(C2))) && C1->ult(*C2) && Pred == CmpInst::ICMP_ULT) return {SPF_UMAX, SPNB_NA, false}; // (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1) if (match(FalseVal, m_UMax(m_Specific(CmpLHS), m_APInt(C2))) && C1->ugt(*C2) && Pred == CmpInst::ICMP_UGT) return {SPF_UMIN, SPNB_NA, false}; } return {SPF_UNKNOWN, SPNB_NA, false}; } /// Recognize variations of: /// a < c ? min(a,b) : min(b,c) ==> min(min(a,b),min(b,c)) static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred, Value *CmpLHS, Value *CmpRHS, Value *TVal, Value *FVal, unsigned Depth) { // TODO: Allow FP min/max with nnan/nsz. assert(CmpInst::isIntPredicate(Pred) && "Expected integer comparison"); Value *A, *B; SelectPatternResult L = matchSelectPattern(TVal, A, B, nullptr, Depth + 1); if (!SelectPatternResult::isMinOrMax(L.Flavor)) return {SPF_UNKNOWN, SPNB_NA, false}; Value *C, *D; SelectPatternResult R = matchSelectPattern(FVal, C, D, nullptr, Depth + 1); if (L.Flavor != R.Flavor) return {SPF_UNKNOWN, SPNB_NA, false}; // We have something like: x Pred y ? min(a, b) : min(c, d). // Try to match the compare to the min/max operations of the select operands. // First, make sure we have the right compare predicate. 
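The signed clamp form recognized by matchClamp, (X s< C1) ? C1 : smin(X, C2) with C1 s< C2, really is smax(smin(X, C2), C1); a standalone exhaustive check over int8_t (plain C++, illustrative only):

#include <algorithm>
#include <cassert>

int main() {
  // (X <s C1) ? C1 : smin(X, C2)  ==  smax(smin(X, C2), C1)  whenever C1 <s C2,
  // which is the SPF_SMAX case returned by matchClamp.
  for (int C1 = -128; C1 <= 127; ++C1)
    for (int C2 = C1 + 1; C2 <= 127; ++C2)
      for (int X = -128; X <= 127; ++X) {
        int Select = (X < C1) ? C1 : std::min(X, C2);
        int Clamp = std::max(std::min(X, C2), C1);
        assert(Select == Clamp);
      }
  return 0;
}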
switch (L.Flavor) { case SPF_SMIN: if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) { Pred = ICmpInst::getSwappedPredicate(Pred); std::swap(CmpLHS, CmpRHS); } if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) break; return {SPF_UNKNOWN, SPNB_NA, false}; case SPF_SMAX: if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) { Pred = ICmpInst::getSwappedPredicate(Pred); std::swap(CmpLHS, CmpRHS); } if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) break; return {SPF_UNKNOWN, SPNB_NA, false}; case SPF_UMIN: if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) { Pred = ICmpInst::getSwappedPredicate(Pred); std::swap(CmpLHS, CmpRHS); } if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) break; return {SPF_UNKNOWN, SPNB_NA, false}; case SPF_UMAX: if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { Pred = ICmpInst::getSwappedPredicate(Pred); std::swap(CmpLHS, CmpRHS); } if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) break; return {SPF_UNKNOWN, SPNB_NA, false}; default: return {SPF_UNKNOWN, SPNB_NA, false}; } // If there is a common operand in the already matched min/max and the other // min/max operands match the compare operands (either directly or inverted), // then this is min/max of the same flavor. // a pred c ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b)) // ~c pred ~a ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b)) if (D == B) { if ((CmpLHS == A && CmpRHS == C) || (match(C, m_Not(m_Specific(CmpLHS))) && match(A, m_Not(m_Specific(CmpRHS))))) return {L.Flavor, SPNB_NA, false}; } // a pred d ? m(a, b) : m(b, d) --> m(m(a, b), m(b, d)) // ~d pred ~a ? m(a, b) : m(b, d) --> m(m(a, b), m(b, d)) if (C == B) { if ((CmpLHS == A && CmpRHS == D) || (match(D, m_Not(m_Specific(CmpLHS))) && match(A, m_Not(m_Specific(CmpRHS))))) return {L.Flavor, SPNB_NA, false}; } // b pred c ? m(a, b) : m(c, a) --> m(m(a, b), m(c, a)) // ~c pred ~b ? m(a, b) : m(c, a) --> m(m(a, b), m(c, a)) if (D == A) { if ((CmpLHS == B && CmpRHS == C) || (match(C, m_Not(m_Specific(CmpLHS))) && match(B, m_Not(m_Specific(CmpRHS))))) return {L.Flavor, SPNB_NA, false}; } // b pred d ? m(a, b) : m(a, d) --> m(m(a, b), m(a, d)) // ~d pred ~b ? m(a, b) : m(a, d) --> m(m(a, b), m(a, d)) if (C == A) { if ((CmpLHS == B && CmpRHS == D) || (match(D, m_Not(m_Specific(CmpLHS))) && match(B, m_Not(m_Specific(CmpRHS))))) return {L.Flavor, SPNB_NA, false}; } return {SPF_UNKNOWN, SPNB_NA, false}; } /// Match non-obvious integer minimum and maximum sequences. static SelectPatternResult matchMinMax(CmpInst::Predicate Pred, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, Value *&LHS, Value *&RHS, unsigned Depth) { // Assume success. If there's no match, callers should not use these anyway. LHS = TrueVal; RHS = FalseVal; SelectPatternResult SPR = matchClamp(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal); if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN) return SPR; SPR = matchMinMaxOfMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, Depth); if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN) return SPR; if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT) return {SPF_UNKNOWN, SPNB_NA, false}; // Z = X -nsw Y // (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0) // (X (Z SMAX(Z, 0) if (match(TrueVal, m_Zero()) && match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS)))) return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false}; // Z = X -nsw Y // (X >s Y) ? Z : 0 ==> (Z >s 0) ? 
Z : 0 ==> SMAX(Z, 0) // (X (Z SMIN(Z, 0) if (match(FalseVal, m_Zero()) && match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS)))) return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false}; const APInt *C1; if (!match(CmpRHS, m_APInt(C1))) return {SPF_UNKNOWN, SPNB_NA, false}; // An unsigned min/max can be written with a signed compare. const APInt *C2; if ((CmpLHS == TrueVal && match(FalseVal, m_APInt(C2))) || (CmpLHS == FalseVal && match(TrueVal, m_APInt(C2)))) { // Is the sign bit set? // (X (X >u MAXVAL) ? X : MAXVAL ==> UMAX // (X (X >u MAXVAL) ? MAXVAL : X ==> UMIN if (Pred == CmpInst::ICMP_SLT && C1->isNullValue() && C2->isMaxSignedValue()) return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false}; // Is the sign bit clear? // (X >s -1) ? MINVAL : X ==> (X UMAX // (X >s -1) ? X : MINVAL ==> (X UMIN if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() && C2->isMinSignedValue()) return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false}; } // Look through 'not' ops to find disguised signed min/max. // (X >s C) ? ~X : ~C ==> (~X SMIN(~X, ~C) // (X (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C) if (match(TrueVal, m_Not(m_Specific(CmpLHS))) && match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2) return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false}; // (X >s C) ? ~C : ~X ==> (~X SMAX(~C, ~X) // (X (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X) if (match(FalseVal, m_Not(m_Specific(CmpLHS))) && match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2) return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false}; return {SPF_UNKNOWN, SPNB_NA, false}; } bool llvm::isKnownNegation(const Value *X, const Value *Y, bool NeedNSW) { assert(X && Y && "Invalid operand"); // X = sub (0, Y) || X = sub nsw (0, Y) if ((!NeedNSW && match(X, m_Sub(m_ZeroInt(), m_Specific(Y)))) || (NeedNSW && match(X, m_NSWSub(m_ZeroInt(), m_Specific(Y))))) return true; // Y = sub (0, X) || Y = sub nsw (0, X) if ((!NeedNSW && match(Y, m_Sub(m_ZeroInt(), m_Specific(X)))) || (NeedNSW && match(Y, m_NSWSub(m_ZeroInt(), m_Specific(X))))) return true; // X = sub (A, B), Y = sub (B, A) || X = sub nsw (A, B), Y = sub nsw (B, A) Value *A, *B; return (!NeedNSW && (match(X, m_Sub(m_Value(A), m_Value(B))) && match(Y, m_Sub(m_Specific(B), m_Specific(A))))) || (NeedNSW && (match(X, m_NSWSub(m_Value(A), m_Value(B))) && match(Y, m_NSWSub(m_Specific(B), m_Specific(A))))); } static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred, FastMathFlags FMF, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, Value *&LHS, Value *&RHS, unsigned Depth) { LHS = CmpLHS; RHS = CmpRHS; // Signed zero may return inconsistent results between implementations. // (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0 // minNum(0.0, -0.0) // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1) // Therefore, we behave conservatively and only proceed if at least one of the // operands is known to not be zero or if we don't care about signed zero. switch (Pred) { default: break; // FIXME: Include OGT/OLT/UGT/ULT. case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE: case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE: if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) && !isKnownNonZero(CmpRHS)) return {SPF_UNKNOWN, SPNB_NA, false}; } SelectPatternNaNBehavior NaNBehavior = SPNB_NA; bool Ordered = false; // When given one NaN and one non-NaN input: // - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input. // - A simple C99 (a < b ? 
a : b) construction will return 'b' (as the // ordered comparison fails), which could be NaN or non-NaN. // so here we discover exactly what NaN behavior is required/accepted. if (CmpInst::isFPPredicate(Pred)) { bool LHSSafe = isKnownNonNaN(CmpLHS, FMF); bool RHSSafe = isKnownNonNaN(CmpRHS, FMF); if (LHSSafe && RHSSafe) { // Both operands are known non-NaN. NaNBehavior = SPNB_RETURNS_ANY; } else if (CmpInst::isOrdered(Pred)) { // An ordered comparison will return false when given a NaN, so it // returns the RHS. Ordered = true; if (LHSSafe) // LHS is non-NaN, so if RHS is NaN then NaN will be returned. NaNBehavior = SPNB_RETURNS_NAN; else if (RHSSafe) NaNBehavior = SPNB_RETURNS_OTHER; else // Completely unsafe. return {SPF_UNKNOWN, SPNB_NA, false}; } else { Ordered = false; // An unordered comparison will return true when given a NaN, so it // returns the LHS. if (LHSSafe) // LHS is non-NaN, so if RHS is NaN then non-NaN will be returned. NaNBehavior = SPNB_RETURNS_OTHER; else if (RHSSafe) NaNBehavior = SPNB_RETURNS_NAN; else // Completely unsafe. return {SPF_UNKNOWN, SPNB_NA, false}; } } if (TrueVal == CmpRHS && FalseVal == CmpLHS) { std::swap(CmpLHS, CmpRHS); Pred = CmpInst::getSwappedPredicate(Pred); if (NaNBehavior == SPNB_RETURNS_NAN) NaNBehavior = SPNB_RETURNS_OTHER; else if (NaNBehavior == SPNB_RETURNS_OTHER) NaNBehavior = SPNB_RETURNS_NAN; Ordered = !Ordered; } // ([if]cmp X, Y) ? X : Y if (TrueVal == CmpLHS && FalseVal == CmpRHS) { switch (Pred) { default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false}; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false}; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false}; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false}; case FCmpInst::FCMP_UGT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_OGT: case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered}; case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_ULE: case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered}; } } if (isKnownNegation(TrueVal, FalseVal)) { // Sign-extending LHS does not change its sign, so TrueVal/FalseVal can // match against either LHS or sext(LHS). auto MaybeSExtCmpLHS = m_CombineOr(m_Specific(CmpLHS), m_SExt(m_Specific(CmpLHS))); auto ZeroOrAllOnes = m_CombineOr(m_ZeroInt(), m_AllOnes()); auto ZeroOrOne = m_CombineOr(m_ZeroInt(), m_One()); if (match(TrueVal, MaybeSExtCmpLHS)) { // Set the return values. If the compare uses the negated value (-X >s 0), // swap the return values because the negated value is always 'RHS'. LHS = TrueVal; RHS = FalseVal; if (match(CmpLHS, m_Neg(m_Specific(FalseVal)))) std::swap(LHS, RHS); // (X >s 0) ? X : -X or (X >s -1) ? X : -X --> ABS(X) // (-X >s 0) ? -X : X or (-X >s -1) ? -X : X --> ABS(X) if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, ZeroOrAllOnes)) return {SPF_ABS, SPNB_NA, false}; // (X NABS(X) // (-X NABS(X) if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, ZeroOrOne)) return {SPF_NABS, SPNB_NA, false}; } else if (match(FalseVal, MaybeSExtCmpLHS)) { // Set the return values. If the compare uses the negated value (-X >s 0), // swap the return values because the negated value is always 'RHS'. LHS = FalseVal; RHS = TrueVal; if (match(CmpLHS, m_Neg(m_Specific(TrueVal)))) std::swap(LHS, RHS); // (X >s 0) ? -X : X or (X >s -1) ? -X : X --> NABS(X) // (-X >s 0) ? X : -X or (-X >s -1) ? 
X : -X --> NABS(X) if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, ZeroOrAllOnes)) return {SPF_NABS, SPNB_NA, false}; // (X ABS(X) // (-X ABS(X) if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, ZeroOrOne)) return {SPF_ABS, SPNB_NA, false}; } } if (CmpInst::isIntPredicate(Pred)) return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS, Depth); // According to (IEEE 754-2008 5.3.1), minNum(0.0, -0.0) and similar // may return either -0.0 or 0.0, so fcmp/select pair has stricter // semantics than minNum. Be conservative in such case. if (NaNBehavior != SPNB_RETURNS_ANY || (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) && !isKnownNonZero(CmpRHS))) return {SPF_UNKNOWN, SPNB_NA, false}; return matchFastFloatClamp(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS); } /// Helps to match a select pattern in case of a type mismatch. /// /// The function processes the case when type of true and false values of a /// select instruction differs from type of the cmp instruction operands because /// of a cast instruction. The function checks if it is legal to move the cast /// operation after "select". If yes, it returns the new second value of /// "select" (with the assumption that cast is moved): /// 1. As operand of cast instruction when both values of "select" are same cast /// instructions. /// 2. As restored constant (by applying reverse cast operation) when the first /// value of the "select" is a cast operation and the second value is a /// constant. /// NOTE: We return only the new second value because the first value could be /// accessed as operand of cast instruction. static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2, Instruction::CastOps *CastOp) { auto *Cast1 = dyn_cast(V1); if (!Cast1) return nullptr; *CastOp = Cast1->getOpcode(); Type *SrcTy = Cast1->getSrcTy(); if (auto *Cast2 = dyn_cast(V2)) { // If V1 and V2 are both the same cast from the same type, look through V1. if (*CastOp == Cast2->getOpcode() && SrcTy == Cast2->getSrcTy()) return Cast2->getOperand(0); return nullptr; } auto *C = dyn_cast(V2); if (!C) return nullptr; Constant *CastedTo = nullptr; switch (*CastOp) { case Instruction::ZExt: if (CmpI->isUnsigned()) CastedTo = ConstantExpr::getTrunc(C, SrcTy); break; case Instruction::SExt: if (CmpI->isSigned()) CastedTo = ConstantExpr::getTrunc(C, SrcTy, true); break; case Instruction::Trunc: Constant *CmpConst; if (match(CmpI->getOperand(1), m_Constant(CmpConst)) && CmpConst->getType() == SrcTy) { // Here we have the following case: // // %cond = cmp iN %x, CmpConst // %tr = trunc iN %x to iK // %narrowsel = select i1 %cond, iK %t, iK C // // We can always move trunc after select operation: // // %cond = cmp iN %x, CmpConst // %widesel = select i1 %cond, iN %x, iN CmpConst // %tr = trunc iN %widesel to iK // // Note that C could be extended in any way because we don't care about // upper bits after truncation. It can't be abs pattern, because it would // look like: // // select i1 %cond, x, -x. // // So only min/max pattern could be matched. Such match requires widened C // == CmpConst. That is why set widened C = CmpConst, condition trunc // CmpConst == C is checked below. 
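The "move the trunc after the select" reasoning above, including why the widened constant must equal CmpConst, can be illustrated with a plain-C++ analogue using hypothetical constants: the narrow select matches trunc(smin(x, CmpConst)) exactly when the narrow constant is trunc(CmpConst).

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // %cond = icmp slt i16 %x, C16
  // %tr   = trunc i16 %x to i8
  // %sel  = select i1 %cond, i8 %tr, i8 C8
  // behaves as trunc(smin(%x, C16)) precisely when C8 == trunc(C16), which is
  // the widened-constant condition lookThroughCast checks.
  const int16_t C16 = 300;         // hypothetical compare constant
  const int8_t C8 = (int8_t)C16;   // trunc(C16) == 44
  for (int X = -32768; X <= 32767; ++X) {
    int8_t Sel = ((int16_t)X < C16) ? (int8_t)(int16_t)X : C8;
    int8_t TruncOfWideMin = (int8_t)std::min((int16_t)X, C16);
    assert(Sel == TruncOfWideMin);
  }
  return 0;
}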
CastedTo = CmpConst; } else { CastedTo = ConstantExpr::getIntegerCast(C, SrcTy, CmpI->isSigned()); } break; case Instruction::FPTrunc: CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true); break; case Instruction::FPExt: CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true); break; case Instruction::FPToUI: CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true); break; case Instruction::FPToSI: CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true); break; case Instruction::UIToFP: CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true); break; case Instruction::SIToFP: CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true); break; default: break; } if (!CastedTo) return nullptr; // Make sure the cast doesn't lose any information. Constant *CastedBack = ConstantExpr::getCast(*CastOp, CastedTo, C->getType(), true); if (CastedBack != C) return nullptr; return CastedTo; } SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp, unsigned Depth) { if (Depth >= MaxDepth) return {SPF_UNKNOWN, SPNB_NA, false}; SelectInst *SI = dyn_cast(V); if (!SI) return {SPF_UNKNOWN, SPNB_NA, false}; CmpInst *CmpI = dyn_cast(SI->getCondition()); if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false}; CmpInst::Predicate Pred = CmpI->getPredicate(); Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); FastMathFlags FMF; if (isa(CmpI)) FMF = CmpI->getFastMathFlags(); // Bail out early. if (CmpI->isEquality()) return {SPF_UNKNOWN, SPNB_NA, false}; // Deal with type mismatches. if (CastOp && CmpLHS->getType() != TrueVal->getType()) { if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) { // If this is a potential fmin/fmax with a cast to integer, then ignore // -0.0 because there is no corresponding integer value. if (*CastOp == Instruction::FPToSI || *CastOp == Instruction::FPToUI) FMF.setNoSignedZeros(); return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, cast(TrueVal)->getOperand(0), C, LHS, RHS, Depth); } if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) { // If this is a potential fmin/fmax with a cast to integer, then ignore // -0.0 because there is no corresponding integer value. if (*CastOp == Instruction::FPToSI || *CastOp == Instruction::FPToUI) FMF.setNoSignedZeros(); return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, C, cast(FalseVal)->getOperand(0), LHS, RHS, Depth); } } return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS, Depth); } CmpInst::Predicate llvm::getMinMaxPred(SelectPatternFlavor SPF, bool Ordered) { if (SPF == SPF_SMIN) return ICmpInst::ICMP_SLT; if (SPF == SPF_UMIN) return ICmpInst::ICMP_ULT; if (SPF == SPF_SMAX) return ICmpInst::ICMP_SGT; if (SPF == SPF_UMAX) return ICmpInst::ICMP_UGT; if (SPF == SPF_FMINNUM) return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; if (SPF == SPF_FMAXNUM) return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; llvm_unreachable("unhandled!"); } SelectPatternFlavor llvm::getInverseMinMaxFlavor(SelectPatternFlavor SPF) { if (SPF == SPF_SMIN) return SPF_SMAX; if (SPF == SPF_UMIN) return SPF_UMAX; if (SPF == SPF_SMAX) return SPF_SMIN; if (SPF == SPF_UMAX) return SPF_UMIN; llvm_unreachable("unhandled!"); } CmpInst::Predicate llvm::getInverseMinMaxPred(SelectPatternFlavor SPF) { return getMinMaxPred(getInverseMinMaxFlavor(SPF)); } /// Return true if "icmp Pred LHS RHS" is always true. 
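The final "cast doesn't lose any information" guard above is a cast-and-cast-back comparison; a small standalone analogue for the FPToSI case, with sample constants chosen around the 24-bit float mantissa limit:

#include <cassert>

int main() {
  // For FPToSI, lookThroughCast rebuilds the FP constant with the reverse cast
  // (SIToFP) and re-applies the original cast; the rewrite is only used when
  // the round trip reproduces the integer constant exactly.
  auto SurvivesRoundTrip = [](int C) {
    float CastedTo = (float)C;      // reverse cast, to the FP source type
    int CastedBack = (int)CastedTo; // original cast
    return CastedBack == C;
  };
  assert(SurvivesRoundTrip(7));
  assert(SurvivesRoundTrip(1 << 24));        // 16777216 is exact in float
  assert(!SurvivesRoundTrip((1 << 24) + 1)); // 16777217 rounds, so bail out
  return 0;
}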
static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS, const Value *RHS, const DataLayout &DL, unsigned Depth) { assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!"); if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS) return true; switch (Pred) { default: return false; case CmpInst::ICMP_SLE: { const APInt *C; // LHS s<= LHS +_{nsw} C if C >= 0 if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C)))) return !C->isNegative(); return false; } case CmpInst::ICMP_ULE: { const APInt *C; // LHS u<= LHS +_{nuw} C for any C if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C)))) return true; // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB) auto MatchNUWAddsToSameValue = [&](const Value *A, const Value *B, const Value *&X, const APInt *&CA, const APInt *&CB) { if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) && match(B, m_NUWAdd(m_Specific(X), m_APInt(CB)))) return true; // If X & C == 0 then (X | C) == X +_{nuw} C if (match(A, m_Or(m_Value(X), m_APInt(CA))) && match(B, m_Or(m_Specific(X), m_APInt(CB)))) { KnownBits Known(CA->getBitWidth()); computeKnownBits(X, Known, DL, Depth + 1, /*AC*/ nullptr, /*CxtI*/ nullptr, /*DT*/ nullptr); if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero)) return true; } return false; }; const Value *X; const APInt *CLHS, *CRHS; if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS)) return CLHS->ule(*CRHS); return false; } } } /// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred /// ALHS ARHS" is true. Otherwise, return None. static Optional isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS, const Value *ARHS, const Value *BLHS, const Value *BRHS, const DataLayout &DL, unsigned Depth) { switch (Pred) { default: return None; case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth) && isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth)) return true; return None; case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth) && isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth)) return true; return None; } } /// Return true if the operands of the two compares match. IsSwappedOps is true /// when the operands match, but are swapped. static bool isMatchingOps(const Value *ALHS, const Value *ARHS, const Value *BLHS, const Value *BRHS, bool &IsSwappedOps) { bool IsMatchingOps = (ALHS == BLHS && ARHS == BRHS); IsSwappedOps = (ALHS == BRHS && ARHS == BLHS); return IsMatchingOps || IsSwappedOps; } /// Return true if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS BRHS" is /// true. Return false if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS /// BRHS" is false. Otherwise, return None if we can't infer anything. static Optional isImpliedCondMatchingOperands(CmpInst::Predicate APred, const Value *ALHS, const Value *ARHS, CmpInst::Predicate BPred, const Value *BLHS, const Value *BRHS, bool IsSwappedOps) { // Canonicalize the operands so they're matching. if (IsSwappedOps) { std::swap(BLHS, BRHS); BPred = ICmpInst::getSwappedPredicate(BPred); } if (CmpInst::isImpliedTrueByMatchingCmp(APred, BPred)) return true; if (CmpInst::isImpliedFalseByMatchingCmp(APred, BPred)) return false; return None; } /// Return true if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS C2" is /// true. Return false if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS /// C2" is false. Otherwise, return None if we can't infer anything. 
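The unsigned-ordering fact exploited by isTruePredicate above, that (X | C) behaves like X +nuw C when C lands entirely in bits known to be zero in X, so two such 'or's can be compared via their constants, checks out exhaustively at 8 bits (standalone sketch):

#include <cassert>

int main() {
  for (unsigned X = 0; X <= 255; ++X)
    for (unsigned CA = 0; CA <= 255; ++CA)
      for (unsigned CB = 0; CB <= 255; ++CB) {
        // Both constants must sit in known-zero bits of X.
        if ((X & CA) != 0 || (X & CB) != 0)
          continue;
        // Then the or is an add with no unsigned wrap...
        assert((X | CA) == X + CA);
        assert((X | CB) == X + CB);
        // ...so CA u<= CB implies (X | CA) u<= (X | CB).
        if (CA <= CB)
          assert((X | CA) <= (X | CB));
      }
  return 0;
}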
static Optional isImpliedCondMatchingImmOperands(CmpInst::Predicate APred, const Value *ALHS, const ConstantInt *C1, CmpInst::Predicate BPred, const Value *BLHS, const ConstantInt *C2) { assert(ALHS == BLHS && "LHS operands must match."); ConstantRange DomCR = ConstantRange::makeExactICmpRegion(APred, C1->getValue()); ConstantRange CR = ConstantRange::makeAllowedICmpRegion(BPred, C2->getValue()); ConstantRange Intersection = DomCR.intersectWith(CR); ConstantRange Difference = DomCR.difference(CR); if (Intersection.isEmptySet()) return false; if (Difference.isEmptySet()) return true; return None; } /// Return true if LHS implies RHS is true. Return false if LHS implies RHS is /// false. Otherwise, return None if we can't infer anything. static Optional isImpliedCondICmps(const ICmpInst *LHS, const ICmpInst *RHS, const DataLayout &DL, bool LHSIsTrue, unsigned Depth) { Value *ALHS = LHS->getOperand(0); Value *ARHS = LHS->getOperand(1); // The rest of the logic assumes the LHS condition is true. If that's not the // case, invert the predicate to make it so. ICmpInst::Predicate APred = LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate(); Value *BLHS = RHS->getOperand(0); Value *BRHS = RHS->getOperand(1); ICmpInst::Predicate BPred = RHS->getPredicate(); // Can we infer anything when the two compares have matching operands? bool IsSwappedOps; if (isMatchingOps(ALHS, ARHS, BLHS, BRHS, IsSwappedOps)) { if (Optional Implication = isImpliedCondMatchingOperands( APred, ALHS, ARHS, BPred, BLHS, BRHS, IsSwappedOps)) return Implication; // No amount of additional analysis will infer the second condition, so // early exit. return None; } // Can we infer anything when the LHS operands match and the RHS operands are // constants (not necessarily matching)? if (ALHS == BLHS && isa(ARHS) && isa(BRHS)) { if (Optional Implication = isImpliedCondMatchingImmOperands( APred, ALHS, cast(ARHS), BPred, BLHS, cast(BRHS))) return Implication; // No amount of additional analysis will infer the second condition, so // early exit. return None; } if (APred == BPred) return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth); return None; } /// Return true if LHS implies RHS is true. Return false if LHS implies RHS is /// false. Otherwise, return None if we can't infer anything. We expect the /// RHS to be an icmp and the LHS to be an 'and' or an 'or' instruction. static Optional isImpliedCondAndOr(const BinaryOperator *LHS, const ICmpInst *RHS, const DataLayout &DL, bool LHSIsTrue, unsigned Depth) { // The LHS must be an 'or' or an 'and' instruction. assert((LHS->getOpcode() == Instruction::And || LHS->getOpcode() == Instruction::Or) && "Expected LHS to be 'and' or 'or'."); assert(Depth <= MaxDepth && "Hit recursion limit"); // If the result of an 'or' is false, then we know both legs of the 'or' are // false. Similarly, if the result of an 'and' is true, then we know both // legs of the 'and' are true. Value *ALHS, *ARHS; if ((!LHSIsTrue && match(LHS, m_Or(m_Value(ALHS), m_Value(ARHS)))) || (LHSIsTrue && match(LHS, m_And(m_Value(ALHS), m_Value(ARHS))))) { // FIXME: Make this non-recursion. if (Optional Implication = isImpliedCondition(ALHS, RHS, DL, LHSIsTrue, Depth + 1)) return Implication; if (Optional Implication = isImpliedCondition(ARHS, RHS, DL, LHSIsTrue, Depth + 1)) return Implication; return None; } return None; } Optional llvm::isImpliedCondition(const Value *LHS, const Value *RHS, const DataLayout &DL, bool LHSIsTrue, unsigned Depth) { // Bail out when we hit the limit. 
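When the two compares share a LHS and have constant RHSs, the ConstantRange intersection/difference test above amounts to a set argument: every value satisfying the dominating compare also satisfies (or also violates) the queried one. A minimal standalone version for i8, with sample constants (ConstantRange itself additionally handles wrapped ranges and every predicate):

#include <cassert>

int main() {
  // Dominating condition: X s< 10.  Query: X s< 20.
  // The region satisfying the first minus the region allowed by the second is
  // empty, so the implication holds (the helper returns true).
  for (int X = -128; X <= 127; ++X)
    if (X < 10)
      assert(X < 20);

  // Dominating condition: X s< 10.  Query: X s> 50.
  // The intersection of the two regions is empty, so the first being true
  // makes the second false (the helper returns false).
  for (int X = -128; X <= 127; ++X)
    if (X < 10)
      assert(!(X > 50));
  return 0;
}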
if (Depth == MaxDepth) return None; // A mismatch occurs when we compare a scalar cmp to a vector cmp, for // example. if (LHS->getType() != RHS->getType()) return None; Type *OpTy = LHS->getType(); assert(OpTy->isIntOrIntVectorTy(1) && "Expected integer type only!"); // LHS ==> RHS by definition if (LHS == RHS) return LHSIsTrue; // FIXME: Extending the code below to handle vectors. if (OpTy->isVectorTy()) return None; assert(OpTy->isIntegerTy(1) && "implied by above"); // Both LHS and RHS are icmps. const ICmpInst *LHSCmp = dyn_cast(LHS); const ICmpInst *RHSCmp = dyn_cast(RHS); if (LHSCmp && RHSCmp) return isImpliedCondICmps(LHSCmp, RHSCmp, DL, LHSIsTrue, Depth); // The LHS should be an 'or' or an 'and' instruction. We expect the RHS to be // an icmp. FIXME: Add support for and/or on the RHS. const BinaryOperator *LHSBO = dyn_cast(LHS); if (LHSBO && RHSCmp) { if ((LHSBO->getOpcode() == Instruction::And || LHSBO->getOpcode() == Instruction::Or)) return isImpliedCondAndOr(LHSBO, RHSCmp, DL, LHSIsTrue, Depth); } return None; } Index: vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (revision 337631) @@ -1,4793 +1,4789 @@ //===- LegalizeDAG.cpp - Implement SelectionDAG::Legalize -----------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the SelectionDAG::Legalize method. // //===----------------------------------------------------------------------===// #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "legalizedag" namespace { /// Keeps track of state when getting the sign of a floating-point value as an /// integer. 
struct FloatSignAsInt { EVT FloatVT; SDValue Chain; SDValue FloatPtr; SDValue IntPtr; MachinePointerInfo IntPointerInfo; MachinePointerInfo FloatPointerInfo; SDValue IntValue; APInt SignMask; uint8_t SignBit; }; //===----------------------------------------------------------------------===// /// This takes an arbitrary SelectionDAG as input and /// hacks on it until the target machine can handle it. This involves /// eliminating value sizes the machine cannot handle (promoting small sizes to /// large sizes or splitting up large values into small values) as well as /// eliminating operations the machine cannot handle. /// /// This code also does a small amount of optimization and recognition of idioms /// as part of its processing. For example, if a target does not support a /// 'setcc' instruction efficiently, but does support 'brcc' instruction, this /// will attempt merge setcc and brc instructions into brcc's. class SelectionDAGLegalize { const TargetMachine &TM; const TargetLowering &TLI; SelectionDAG &DAG; /// The set of nodes which have already been legalized. We hold a /// reference to it in order to update as necessary on node deletion. SmallPtrSetImpl &LegalizedNodes; /// A set of all the nodes updated during legalization. SmallSetVector *UpdatedNodes; EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } // Libcall insertion helpers. public: SelectionDAGLegalize(SelectionDAG &DAG, SmallPtrSetImpl &LegalizedNodes, SmallSetVector *UpdatedNodes = nullptr) : TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG), LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {} /// Legalizes the given operation. void LegalizeOp(SDNode *Node); private: SDValue OptimizeFloatStore(StoreSDNode *ST); void LegalizeLoadOps(SDNode *Node); void LegalizeStoreOps(SDNode *Node); /// Some targets cannot handle a variable /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform /// the insert there, and then read the result back. SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, const SDLoc &dl); SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, const SDLoc &dl); /// Return a vector shuffle operation which /// performs the same shuffe in terms of order or result bytes, but on a type /// whose vector element type is narrower than the original shuffle type. /// e.g. 
<0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3> SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef Mask) const; bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, bool &NeedInvert, const SDLoc &dl); SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops, unsigned NumOps, bool isSigned, const SDLoc &dl); std::pair ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128); SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8, RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl &Results); void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl &Results); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue ExpandBUILD_VECTOR(SDNode *Node); SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node); void ExpandDYNAMIC_STACKALLOC(SDNode *Node, SmallVectorImpl &Results); void getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL, SDValue Value) const; SDValue modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL, SDValue NewIntValue) const; SDValue ExpandFCOPYSIGN(SDNode *Node) const; SDValue ExpandFABS(SDNode *Node) const; SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, const SDLoc &dl); SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl); SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl); SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl); SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl); SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl); SDValue ExpandExtractFromVectorThroughStack(SDValue Op); SDValue ExpandInsertToVectorThroughStack(SDValue Op); SDValue ExpandVectorBuildThroughStack(SDNode* Node); SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP); SDValue ExpandConstant(ConstantSDNode *CP); // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall bool ExpandNode(SDNode *Node); void ConvertNodeToLibcall(SDNode *Node); void PromoteNode(SDNode *Node); public: // Node replacement helpers void ReplacedNode(SDNode *N) { LegalizedNodes.erase(N); if (UpdatedNodes) UpdatedNodes->insert(N); } void ReplaceNode(SDNode *Old, SDNode *New) { LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG); dbgs() << " with: "; New->dump(&DAG)); assert(Old->getNumValues() == New->getNumValues() && "Replacing one node with another that produces a different number " "of values!"); DAG.ReplaceAllUsesWith(Old, New); if (UpdatedNodes) UpdatedNodes->insert(New); ReplacedNode(Old); } void ReplaceNode(SDValue Old, SDValue New) { LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG); dbgs() << " with: "; New->dump(&DAG)); DAG.ReplaceAllUsesWith(Old, New); if (UpdatedNodes) UpdatedNodes->insert(New.getNode()); ReplacedNode(Old.getNode()); } void ReplaceNode(SDNode *Old, const SDValue *New) { LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG)); DAG.ReplaceAllUsesWith(Old, New); for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) { LLVM_DEBUG(dbgs() << (i == 0 ? 
" with: " : " and: "); New[i]->dump(&DAG)); if (UpdatedNodes) UpdatedNodes->insert(New[i].getNode()); } ReplacedNode(Old); } }; } // end anonymous namespace /// Return a vector shuffle operation which /// performs the same shuffe in terms of order or result bytes, but on a type /// whose vector element type is narrower than the original shuffle type. /// e.g. <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3> SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType( EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef Mask) const { unsigned NumMaskElts = VT.getVectorNumElements(); unsigned NumDestElts = NVT.getVectorNumElements(); unsigned NumEltsGrowth = NumDestElts / NumMaskElts; assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!"); if (NumEltsGrowth == 1) return DAG.getVectorShuffle(NVT, dl, N1, N2, Mask); SmallVector NewMask; for (unsigned i = 0; i != NumMaskElts; ++i) { int Idx = Mask[i]; for (unsigned j = 0; j != NumEltsGrowth; ++j) { if (Idx < 0) NewMask.push_back(-1); else NewMask.push_back(Idx * NumEltsGrowth + j); } } assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?"); assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?"); return DAG.getVectorShuffle(NVT, dl, N1, N2, NewMask); } /// Expands the ConstantFP node to an integer constant or /// a load from the constant pool. SDValue SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { bool Extend = false; SDLoc dl(CFP); // If a FP immediate is precise when represented as a float and if the // target can do an extending load from float to double, we put it into // the constant pool as a float, even if it's is statically typed as a // double. This shrinks FP constants and canonicalizes them for targets where // an FP extending load is the same cost as a normal load (such as on the x87 // fp stack or PPC FP unit). EVT VT = CFP->getValueType(0); ConstantFP *LLVMC = const_cast(CFP->getConstantFPValue()); if (!UseCP) { assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion"); return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(), dl, (VT == MVT::f64) ? MVT::i64 : MVT::i32); } APFloat APF = CFP->getValueAPF(); EVT OrigVT = VT; EVT SVT = VT; // We don't want to shrink SNaNs. Converting the SNaN back to its real type // can cause it to be changed into a QNaN on some platforms (e.g. on SystemZ). if (!APF.isSignaling()) { while (SVT != MVT::f32 && SVT != MVT::f16) { SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1); if (ConstantFPSDNode::isValueValidForType(SVT, APF) && // Only do this if the target has a native EXTLOAD instruction from // smaller type. TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) && TLI.ShouldShrinkFPConstant(OrigVT)) { Type *SType = SVT.getTypeForEVT(*DAG.getContext()); LLVMC = cast(ConstantExpr::getFPTrunc(LLVMC, SType)); VT = SVT; Extend = true; } } } SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CPIdx)->getAlignment(); if (Extend) { SDValue Result = DAG.getExtLoad( ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT, Alignment); return Result; } SDValue Result = DAG.getLoad( OrigVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); return Result; } /// Expands the Constant node to a load from the constant pool. 
SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) { SDLoc dl(CP); EVT VT = CP->getValueType(0); SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(), TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Result = DAG.getLoad( VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); return Result; } /// Some target cannot handle a variable insertion index for the /// INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform /// the insert there, and then read the result back. SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, const SDLoc &dl) { SDValue Tmp1 = Vec; SDValue Tmp2 = Val; SDValue Tmp3 = Idx; // If the target doesn't support this, we have to spill the input vector // to a temporary stack slot, update the element, then reload it. This is // badness. We could also load the value into a vector register (either // with a "move to register" or "extload into register" instruction, then // permute it into place, if the idx is a constant and if the idx is // supported by the target. EVT VT = Tmp1.getValueType(); EVT EltVT = VT.getVectorElementType(); SDValue StackPtr = DAG.CreateStackTemporary(VT); int SPFI = cast(StackPtr.getNode())->getIndex(); // Store the vector. SDValue Ch = DAG.getStore( DAG.getEntryNode(), dl, Tmp1, StackPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI)); SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Tmp3); // Store the scalar value. Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT); // Load the updated vector. return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), SPFI)); } SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, const SDLoc &dl) { if (ConstantSDNode *InsertPos = dyn_cast(Idx)) { // SCALAR_TO_VECTOR requires that the type of the value being inserted // match the element type of the vector being created, except for // integers in which case the inserted value can be over width. EVT EltVT = Vec.getValueType().getVectorElementType(); if (Val.getValueType() == EltVT || (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) { SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Vec.getValueType(), Val); unsigned NumElts = Vec.getValueType().getVectorNumElements(); // We generate a shuffle of InVec and ScVec, so the shuffle mask // should be 0,1,2,3,4,5... with the appropriate element replaced with // elt 0 of the RHS. SmallVector ShufOps; for (unsigned i = 0; i != NumElts; ++i) ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts); return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps); } } return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl); } SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { LLVM_DEBUG(dbgs() << "Optimizing float store operations\n"); // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' // FIXME: We shouldn't do this for TargetConstantFP's. // FIXME: move this to the DAG Combiner! Note that we can't regress due // to phase ordering between legalized code and the dag combiner. This // probably means that we need to integrate dag combiner and legalizer // together. // We generally can't do this one for long doubles. 
SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDLoc dl(ST); if (ConstantFPSDNode *CFP = dyn_cast(ST->getValue())) { if (CFP->getValueType(0) == MVT::f32 && TLI.isTypeLegal(MVT::i32)) { SDValue Con = DAG.getConstant(CFP->getValueAPF(). bitcastToAPInt().zextOrTrunc(32), SDLoc(CFP), MVT::i32); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), Alignment, MMOFlags, AAInfo); } if (CFP->getValueType(0) == MVT::f64) { // If this target supports 64-bit registers, do a single 64-bit store. if (TLI.isTypeLegal(MVT::i64)) { SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). zextOrTrunc(64), SDLoc(CFP), MVT::i64); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), Alignment, MMOFlags, AAInfo); } if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { // Otherwise, if the target supports 32-bit registers, use 2 32-bit // stores. If the target supports neither 32- nor 64-bits, this // xform is certainly not worth it. const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt(); SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32); SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), Alignment, MMOFlags, AAInfo); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(4, dl, Ptr.getValueType())); Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), MinAlign(Alignment, 4U), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } } } return SDValue(nullptr, 0); } void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { StoreSDNode *ST = cast(Node); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); SDLoc dl(Node); unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); if (!ST->isTruncatingStore()) { LLVM_DEBUG(dbgs() << "Legalizing store operation\n"); if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { ReplaceNode(ST, OptStore); return; } SDValue Value = ST->getValue(); MVT VT = Value.getSimpleValueType(); switch (TLI.getOperationAction(ISD::STORE, VT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { // If this is an unaligned store and the target doesn't support it, // expand it. 
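The rewrite performed by OptimizeFloatStore above ('store float 1.0' becomes an i32 store of the bit pattern, or two i32 stores for an f64 constant when i64 is not legal) only re-expresses the value's bits; a standalone sketch of both forms in plain C++:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // 'store float 1.0' can become 'store i32 0x3F800000': same bits, integer type.
  float F = 1.0f;
  uint32_t FBits;
  std::memcpy(&FBits, &F, sizeof FBits);
  assert(FBits == 0x3F800000u);

  // Without a legal i64, an f64 constant becomes two i32 halves (the DAG code
  // stores Lo at offset 0 and Hi at offset 4, swapping them on big-endian).
  double D = 1.0;
  uint64_t DBits;
  std::memcpy(&DBits, &D, sizeof DBits);
  uint32_t Lo = (uint32_t)DBits;
  uint32_t Hi = (uint32_t)(DBits >> 32);
  assert(((uint64_t)Hi << 32 | Lo) == DBits); // the two halves carry all 64 bits
  return 0;
}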
EVT MemVT = ST->getMemoryVT(); unsigned AS = ST->getAddressSpace(); unsigned Align = ST->getAlignment(); const DataLayout &DL = DAG.getDataLayout(); if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) { LLVM_DEBUG(dbgs() << "Expanding unsupported unaligned store\n"); SDValue Result = TLI.expandUnalignedStore(ST, DAG); ReplaceNode(SDValue(ST, 0), Result); } else LLVM_DEBUG(dbgs() << "Legal store\n"); break; } case TargetLowering::Custom: { LLVM_DEBUG(dbgs() << "Trying custom lowering\n"); SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); if (Res && Res != SDValue(Node, 0)) ReplaceNode(SDValue(Node, 0), Res); return; } case TargetLowering::Promote: { MVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT); assert(NVT.getSizeInBits() == VT.getSizeInBits() && "Can only promote stores to same size type"); Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value); SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), Alignment, MMOFlags, AAInfo); ReplaceNode(SDValue(Node, 0), Result); break; } } return; } LLVM_DEBUG(dbgs() << "Legalizing truncating store operations\n"); SDValue Value = ST->getValue(); EVT StVT = ST->getMemoryVT(); unsigned StWidth = StVT.getSizeInBits(); auto &DL = DAG.getDataLayout(); if (StWidth != StVT.getStoreSizeInBits()) { // Promote to a byte-sized store with upper bits zero if not // storing an integral number of bytes. For example, promote // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) EVT NVT = EVT::getIntegerVT(*DAG.getContext(), StVT.getStoreSizeInBits()); Value = DAG.getZeroExtendInReg(Value, dl, StVT); SDValue Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo); ReplaceNode(SDValue(Node, 0), Result); } else if (StWidth & (StWidth - 1)) { // If not storing a power-of-2 number of bits, expand as two stores. assert(!StVT.isVector() && "Unsupported truncstore!"); unsigned RoundWidth = 1 << Log2_32(StWidth); assert(RoundWidth < StWidth); unsigned ExtraWidth = StWidth - RoundWidth; assert(ExtraWidth < RoundWidth); assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && "Store size not an integral number of bytes!"); EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); SDValue Lo, Hi; unsigned IncrementSize; if (DL.isLittleEndian()) { // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) // Store the bottom RoundWidth bits. Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), RoundVT, Alignment, MMOFlags, AAInfo); // Store the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Hi = DAG.getNode( ISD::SRL, dl, Value.getValueType(), Value, DAG.getConstant(RoundWidth, dl, TLI.getShiftAmountTy(Value.getValueType(), DL))); Hi = DAG.getTruncStore( Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); } else { // Big endian - avoid unaligned stores. // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X // Store the top RoundWidth bits. Hi = DAG.getNode( ISD::SRL, dl, Value.getValueType(), Value, DAG.getConstant(ExtraWidth, dl, TLI.getShiftAmountTy(Value.getValueType(), DL))); Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT, Alignment, MMOFlags, AAInfo); // Store the remaining ExtraWidth bits. 
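A worked byte-level picture of the little-endian split above (TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)); a standalone sketch that assumes a little-endian host for the memcpy of the 16-bit half:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t X = 0xABCDEF;        // 24 meaningful bits
  unsigned char Mem[3] = {0, 0, 0};

  // Store the bottom RoundWidth (16) bits, then the remaining ExtraWidth (8)
  // bits of (X >> 16) at offset RoundWidth / 8 == 2.
  uint16_t Lo = (uint16_t)X;
  uint8_t Hi = (uint8_t)(X >> 16);
  std::memcpy(Mem + 0, &Lo, 2);  // little-endian host assumed
  std::memcpy(Mem + 2, &Hi, 1);

  assert(Mem[0] == 0xEF && Mem[1] == 0xCD && Mem[2] == 0xAB);
  return 0;
}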
IncrementSize = RoundWidth / 8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Lo = DAG.getTruncStore( Chain, dl, Value, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); } // The order of the stores doesn't matter. SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); ReplaceNode(SDValue(Node, 0), Result); } else { switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { EVT MemVT = ST->getMemoryVT(); unsigned AS = ST->getAddressSpace(); unsigned Align = ST->getAlignment(); // If this is an unaligned store and the target doesn't support it, // expand it. if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) { SDValue Result = TLI.expandUnalignedStore(ST, DAG); ReplaceNode(SDValue(ST, 0), Result); } break; } case TargetLowering::Custom: { SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); if (Res && Res != SDValue(Node, 0)) ReplaceNode(SDValue(Node, 0), Res); return; } case TargetLowering::Expand: assert(!StVT.isVector() && "Vector Stores are handled in LegalizeVectorOps"); SDValue Result; // TRUNCSTORE:i16 i32 -> STORE i16 if (TLI.isTypeLegal(StVT)) { Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value); Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), Alignment, MMOFlags, AAInfo); } else { // The in-memory type isn't legal. Truncate to the type it would promote // to, and then do a truncstore. Value = DAG.getNode(ISD::TRUNCATE, dl, TLI.getTypeToTransformTo(*DAG.getContext(), StVT), Value); Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), StVT, Alignment, MMOFlags, AAInfo); } ReplaceNode(SDValue(Node, 0), Result); break; } } } void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { LoadSDNode *LD = cast(Node); SDValue Chain = LD->getChain(); // The chain. SDValue Ptr = LD->getBasePtr(); // The base pointer. SDValue Value; // The value returned by the load op. SDLoc dl(Node); ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) { LLVM_DEBUG(dbgs() << "Legalizing non-extending load operation\n"); MVT VT = Node->getSimpleValueType(0); SDValue RVal = SDValue(Node, 0); SDValue RChain = SDValue(Node, 1); switch (TLI.getOperationAction(Node->getOpcode(), VT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); unsigned Align = LD->getAlignment(); const DataLayout &DL = DAG.getDataLayout(); // If this is an unaligned load and the target doesn't support it, // expand it. 
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) { std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG); } break; } case TargetLowering::Custom: if (SDValue Res = TLI.LowerOperation(RVal, DAG)) { RVal = Res; RChain = Res.getValue(1); } break; case TargetLowering::Promote: { MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); assert(NVT.getSizeInBits() == VT.getSizeInBits() && "Can only promote loads to same size type"); SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand()); RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res); RChain = Res.getValue(1); break; } } if (RChain.getNode() != Node) { assert(RVal.getNode() != Node && "Load must be completely replaced"); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain); if (UpdatedNodes) { UpdatedNodes->insert(RVal.getNode()); UpdatedNodes->insert(RChain.getNode()); } ReplacedNode(Node); } return; } LLVM_DEBUG(dbgs() << "Legalizing extending load operation\n"); EVT SrcVT = LD->getMemoryVT(); unsigned SrcWidth = SrcVT.getSizeInBits(); unsigned Alignment = LD->getAlignment(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); if (SrcWidth != SrcVT.getStoreSizeInBits() && // Some targets pretend to have an i1 loading operation, and actually // load an i8. This trick is correct for ZEXTLOAD because the top 7 // bits are guaranteed to be zero; it helps the optimizers understand // that these bits are zero. It is also useful for EXTLOAD, since it // tells the optimizers that those bits are undefined. It would be // nice to have an effective generic way of getting these benefits... // Until such a way is found, don't insist on promoting i1 here. (SrcVT != MVT::i1 || TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) == TargetLowering::Promote)) { // Promote to a byte-sized load if not loading an integral number of // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. unsigned NewWidth = SrcVT.getStoreSizeInBits(); EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); SDValue Ch; // The extra bits are guaranteed to be zero, since we stored them that // way. A zext load from NVT thus automatically gives zext from SrcVT. ISD::LoadExtType NewExtType = ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; SDValue Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo); Ch = Result.getValue(1); // The chain. if (ExtType == ISD::SEXTLOAD) // Having the top bits zero doesn't help when sign extending. Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Result.getValueType(), Result, DAG.getValueType(SrcVT)); else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) // All the top bits are guaranteed to be zero - inform the optimizers. Result = DAG.getNode(ISD::AssertZext, dl, Result.getValueType(), Result, DAG.getValueType(SrcVT)); Value = Result; Chain = Ch; } else if (SrcWidth & (SrcWidth - 1)) { // If not loading a power-of-2 number of bits, expand as two loads. 
assert(!SrcVT.isVector() && "Unsupported extload!"); unsigned RoundWidth = 1 << Log2_32(SrcWidth); assert(RoundWidth < SrcWidth); unsigned ExtraWidth = SrcWidth - RoundWidth; assert(ExtraWidth < RoundWidth); assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && "Load size not an integral number of bytes!"); EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); SDValue Lo, Hi, Ch; unsigned IncrementSize; auto &DL = DAG.getDataLayout(); if (DL.isLittleEndian()) { // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) // Load the bottom RoundWidth bits. Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), RoundVT, Alignment, MMOFlags, AAInfo); // Load the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of // the other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Move the top bits to the right place. Hi = DAG.getNode( ISD::SHL, dl, Hi.getValueType(), Hi, DAG.getConstant(RoundWidth, dl, TLI.getShiftAmountTy(Hi.getValueType(), DL))); // Join the hi and lo parts. Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); } else { // Big endian - avoid unaligned loads. // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 // Load the top RoundWidth bits. Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), RoundVT, Alignment, MMOFlags, AAInfo); // Load the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of // the other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Move the top bits to the right place. Hi = DAG.getNode( ISD::SHL, dl, Hi.getValueType(), Hi, DAG.getConstant(ExtraWidth, dl, TLI.getShiftAmountTy(Hi.getValueType(), DL))); // Join the hi and lo parts. Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); } Chain = Ch; } else { bool isCustom = false; switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0), SrcVT.getSimpleVT())) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Custom: isCustom = true; LLVM_FALLTHROUGH; case TargetLowering::Legal: Value = SDValue(Node, 0); Chain = SDValue(Node, 1); if (isCustom) { if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) { Value = Res; Chain = Res.getValue(1); } } else { // If this is an unaligned load and the target doesn't support it, // expand it. 
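The load-side expansion is the mirror image; again a hedged scalar analogue of the little-endian case, with illustrative names, not the DAG code itself:

#include <cstdint>
#include <cstring>

// Reassemble an i24 extending load as the expansion above does: a
// zero-extending i16 load of the low part, an i8 load at offset 2 for the
// high part, then OR in the high part shifted left by RoundWidth (16).
uint32_t loadI24LE(const uint8_t *P) {
  uint16_t Lo;
  uint8_t  Hi;
  std::memcpy(&Lo, P, sizeof(Lo));
  std::memcpy(&Hi, P + 2, sizeof(Hi));
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}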
EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); unsigned Align = LD->getAlignment(); const DataLayout &DL = DAG.getDataLayout(); if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) { std::tie(Value, Chain) = TLI.expandUnalignedLoad(LD, DAG); } } break; case TargetLowering::Expand: { EVT DestVT = Node->getValueType(0); if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) { // If the source type is not legal, see if there is a legal extload to // an intermediate type that we can then extend further. EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT()); if (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT? TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) { // If we are loading a legal type, this is a non-extload followed by a // full extend. ISD::LoadExtType MidExtType = (LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType; SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr, SrcVT, LD->getMemOperand()); unsigned ExtendOp = ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType); Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); Chain = Load.getValue(1); break; } // Handle the special case of fp16 extloads. EXTLOAD doesn't have the // normal undefined upper bits behavior to allow using an in-reg extend // with the illegal FP type, so load as an integer and do the // from-integer conversion. if (SrcVT.getScalarType() == MVT::f16) { EVT ISrcVT = SrcVT.changeTypeToInteger(); EVT IDestVT = DestVT.changeTypeToInteger(); EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT()); SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT, Chain, Ptr, ISrcVT, LD->getMemOperand()); Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result); Chain = Result.getValue(1); break; } } assert(!SrcVT.isVector() && "Vector Loads are handled in LegalizeVectorOps"); // FIXME: This does not work for vectors on most targets. Sign- // and zero-extend operations are currently folded into extending // loads, whether they are legal or not, and then we end up here // without any support for legalizing them. assert(ExtType != ISD::EXTLOAD && "EXTLOAD should always be supported!"); // Turn the unsupported load into an EXTLOAD followed by an // explicit zero/sign extend inreg. SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), Chain, Ptr, SrcVT, LD->getMemOperand()); SDValue ValRes; if (ExtType == ISD::SEXTLOAD) ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Result.getValueType(), Result, DAG.getValueType(SrcVT)); else ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); Value = ValRes; Chain = Result.getValue(1); break; } } } // Since loads produce two values, make sure to remember that we legalized // both of them. if (Chain.getNode() != Node) { assert(Value.getNode() != Node && "Load must be completely replaced"); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); if (UpdatedNodes) { UpdatedNodes->insert(Value.getNode()); UpdatedNodes->insert(Chain.getNode()); } ReplacedNode(Node); } } /// Return a legal replacement for the given operation, with all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { LLVM_DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG)); // Allow illegal target nodes and illegal registers. 
if (Node->getOpcode() == ISD::TargetConstant || Node->getOpcode() == ISD::Register) return; #ifndef NDEBUG for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == TargetLowering::TypeLegal || TLI.isTypeLegal(Node->getValueType(i))) && "Unexpected illegal type!"); for (const SDValue &Op : Node->op_values()) assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) == TargetLowering::TypeLegal || TLI.isTypeLegal(Op.getValueType()) || Op.getOpcode() == ISD::TargetConstant || Op.getOpcode() == ISD::Register) && "Unexpected illegal type!"); #endif // Figure out the correct action; the way to query this varies by opcode TargetLowering::LegalizeAction Action = TargetLowering::Legal; bool SimpleFinishLegalizing = true; switch (Node->getOpcode()) { case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; case ISD::GET_DYNAMIC_AREA_OFFSET: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::VAARG: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); if (Action != TargetLowering::Promote) Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; case ISD::FP_TO_FP16: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::EXTRACT_VECTOR_ELT: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; case ISD::FP_ROUND_INREG: case ISD::SIGN_EXTEND_INREG: { EVT InnerType = cast(Node->getOperand(1))->getVT(); Action = TLI.getOperationAction(Node->getOpcode(), InnerType); break; } case ISD::ATOMIC_STORE: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(2).getValueType()); break; case ISD::SELECT_CC: case ISD::SETCC: case ISD::BR_CC: { unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 : Node->getOpcode() == ISD::SETCC ? 2 : 1; unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0; MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType(); ISD::CondCode CCCode = cast(Node->getOperand(CCOperand))->get(); Action = TLI.getCondCodeAction(CCCode, OpVT); if (Action == TargetLowering::Legal) { if (Node->getOpcode() == ISD::SELECT_CC) Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); else Action = TLI.getOperationAction(Node->getOpcode(), OpVT); } break; } case ISD::LOAD: case ISD::STORE: // FIXME: Model these properly. LOAD and STORE are complicated, and // STORE expects the unlegalized operand in some cases. SimpleFinishLegalizing = false; break; case ISD::CALLSEQ_START: case ISD::CALLSEQ_END: // FIXME: This shouldn't be necessary. These nodes have special properties // dealing with the recursive nature of legalization. Removing this // special case should be done as part of making LegalizeDAG non-recursive. SimpleFinishLegalizing = false; break; case ISD::EXTRACT_ELEMENT: case ISD::FLT_ROUNDS_: case ISD::MERGE_VALUES: case ISD::EH_RETURN: case ISD::FRAME_TO_ARGS_OFFSET: case ISD::EH_DWARF_CFA: case ISD::EH_SJLJ_SETJMP: case ISD::EH_SJLJ_LONGJMP: case ISD::EH_SJLJ_SETUP_DISPATCH: // These operations lie about being legal: when they claim to be legal, // they should actually be expanded. 
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); if (Action == TargetLowering::Legal) Action = TargetLowering::Expand; break; case ISD::INIT_TRAMPOLINE: case ISD::ADJUST_TRAMPOLINE: case ISD::FRAMEADDR: case ISD::RETURNADDR: case ISD::ADDROFRETURNADDR: // These operations lie about being legal: when they claim to be legal, // they should actually be custom-lowered. Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); if (Action == TargetLowering::Legal) Action = TargetLowering::Custom; break; case ISD::READCYCLECOUNTER: // READCYCLECOUNTER returns an i64, even if type legalization might have // expanded that to several smaller types. Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64); break; case ISD::READ_REGISTER: case ISD::WRITE_REGISTER: // Named register is legal in the DAG, but blocked by register name // selection if not implemented by target (to chose the correct register) // They'll be converted to Copy(To/From)Reg. Action = TargetLowering::Legal; break; case ISD::DEBUGTRAP: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); if (Action == TargetLowering::Expand) { // replace ISD::DEBUGTRAP with ISD::TRAP SDValue NewVal; NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(), Node->getOperand(0)); ReplaceNode(Node, NewVal.getNode()); LegalizeOp(NewVal.getNode()); return; } break; case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: case ISD::STRICT_FDIV: case ISD::STRICT_FSQRT: case ISD::STRICT_FMA: case ISD::STRICT_FPOW: case ISD::STRICT_FPOWI: case ISD::STRICT_FSIN: case ISD::STRICT_FCOS: case ISD::STRICT_FEXP: case ISD::STRICT_FEXP2: case ISD::STRICT_FLOG: case ISD::STRICT_FLOG10: case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: // These pseudo-ops get legalized as if they were their non-strict // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT // is also legal, but if ISD::FSQRT requires expansion then so does // ISD::STRICT_FSQRT. Action = TLI.getStrictFPOperationAction(Node->getOpcode(), Node->getValueType(0)); break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { Action = TargetLowering::Legal; } else { Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); } break; } if (SimpleFinishLegalizing) { SDNode *NewNode = Node; switch (Node->getOpcode()) { default: break; case ISD::SHL: case ISD::SRL: case ISD::SRA: case ISD::ROTL: case ISD::ROTR: { // Legalizing shifts/rotates requires adjusting the shift amount // to the appropriate width. SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); if (!Op1.getValueType().isVector()) { SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op1); // The getShiftAmountOperand() may create a new operand node or // return the existing one. If new operand is created we need // to update the parent node. // Do not try to legalize SAO here! It will be automatically legalized // in the next round. if (SAO != Op1) NewNode = DAG.UpdateNodeOperands(Node, Op0, SAO); } } break; case ISD::SRL_PARTS: case ISD::SRA_PARTS: case ISD::SHL_PARTS: { // Legalizing shifts/rotates requires adjusting the shift amount // to the appropriate width. SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); SDValue Op2 = Node->getOperand(2); if (!Op2.getValueType().isVector()) { SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op2); // The getShiftAmountOperand() may create a new operand node or // return the existing one. 
If new operand is created we need // to update the parent node. if (SAO != Op2) NewNode = DAG.UpdateNodeOperands(Node, Op0, Op1, SAO); } break; } } if (NewNode != Node) { ReplaceNode(Node, NewNode); Node = NewNode; } switch (Action) { case TargetLowering::Legal: LLVM_DEBUG(dbgs() << "Legal node: nothing to do\n"); return; case TargetLowering::Custom: LLVM_DEBUG(dbgs() << "Trying custom legalization\n"); // FIXME: The handling for custom lowering with multiple results is // a complete mess. if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) { if (!(Res.getNode() != Node || Res.getResNo() != 0)) return; if (Node->getNumValues() == 1) { LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n"); // We can just directly replace this node with the lowered value. ReplaceNode(SDValue(Node, 0), Res); return; } SmallVector ResultVals; for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) ResultVals.push_back(Res.getValue(i)); LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n"); ReplaceNode(Node, ResultVals.data()); return; } LLVM_DEBUG(dbgs() << "Could not custom legalize node\n"); LLVM_FALLTHROUGH; case TargetLowering::Expand: if (ExpandNode(Node)) return; LLVM_FALLTHROUGH; case TargetLowering::LibCall: ConvertNodeToLibcall(Node); return; case TargetLowering::Promote: PromoteNode(Node); return; } } switch (Node->getOpcode()) { default: #ifndef NDEBUG dbgs() << "NODE: "; Node->dump( &DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to legalize this operator!"); case ISD::CALLSEQ_START: case ISD::CALLSEQ_END: break; case ISD::LOAD: return LegalizeLoadOps(Node); case ISD::STORE: return LegalizeStoreOps(Node); } } SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); SDLoc dl(Op); // Before we generate a new store to a temporary stack slot, see if there is // already one that we can use. There often is because when we scalarize // vector operations (using SelectionDAG::UnrollVectorOp for example) a whole // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in // the vector. If all are expanded here, we don't want one store per vector // element. // Caches for hasPredecessorHelper SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(Idx.getNode()); SDValue StackPtr, Ch; for (SDNode::use_iterator UI = Vec.getNode()->use_begin(), UE = Vec.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (StoreSDNode *ST = dyn_cast(User)) { if (ST->isIndexed() || ST->isTruncatingStore() || ST->getValue() != Vec) continue; // Make sure that nothing else could have stored into the destination of // this store. if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode())) continue; // If the index is dependent on the store we will introduce a cycle when // creating the load (the load uses the index, and by replacing the chain // we will make the index dependent on the load). Also, the store might be // dependent on the extractelement and introduce a cycle when creating // the load. if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) || ST->hasPredecessor(Op.getNode())) continue; StackPtr = ST->getBasePtr(); Ch = SDValue(ST, 0); break; } } EVT VecVT = Vec.getValueType(); if (!Ch.getNode()) { // Store the value to a temporary stack slot, then LOAD the returned part. 
StackPtr = DAG.CreateStackTemporary(VecVT); Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); } StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); SDValue NewLoad; if (Op.getValueType().isVector()) NewLoad = DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo()); else NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(), VecVT.getVectorElementType()); // Replace the chain going out of the store, by the one out of the load. DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1)); // We introduced a cycle though, so update the loads operands, making sure // to use the original store's chain as an incoming chain. SmallVector NewLoadOperands(NewLoad->op_begin(), NewLoad->op_end()); NewLoadOperands[0] = Ch; NewLoad = SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0); return NewLoad; } SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { assert(Op.getValueType().isVector() && "Non-vector insert subvector!"); SDValue Vec = Op.getOperand(0); SDValue Part = Op.getOperand(1); SDValue Idx = Op.getOperand(2); SDLoc dl(Op); // Store the value to a temporary stack slot, then LOAD the returned part. EVT VecVT = Vec.getValueType(); SDValue StackPtr = DAG.CreateStackTemporary(VecVT); int FI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // First store the whole vector. SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); // Then store the inserted part. SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); // Store the subvector. Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo()); // Finally, load the updated vector. return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo); } SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { // We can't handle this case efficiently. Allocate a sufficiently // aligned object on the stack, store each element into it, then load // the result as a vector. // Create the stack frame object. EVT VT = Node->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(Node); SDValue FIPtr = DAG.CreateStackTemporary(VT); int FI = cast(FIPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store of each element to the stack slot. SmallVector Stores; unsigned TypeByteSize = EltVT.getSizeInBits() / 8; // Store (in the right endianness) the elements to memory. for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) { // Ignore undef elements. if (Node->getOperand(i).isUndef()) continue; unsigned Offset = TypeByteSize*i; SDValue Idx = DAG.getConstant(Offset, dl, FIPtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx); // If the destination vector element type is narrower than the source // element type, only store the bits necessary. if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) { Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(i), Idx, PtrInfo.getWithOffset(Offset), EltVT)); } else Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i), Idx, PtrInfo.getWithOffset(Offset))); } SDValue StoreChain; if (!Stores.empty()) // Not all undef elements? 
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); else StoreChain = DAG.getEntryNode(); // Result is a load from the stack slot. return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo); } /// Bitcast a floating-point value to an integer value. Only bitcast the part /// containing the sign bit if the target has no integer value capable of /// holding all bits of the floating-point value. void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL, SDValue Value) const { EVT FloatVT = Value.getValueType(); unsigned NumBits = FloatVT.getSizeInBits(); State.FloatVT = FloatVT; EVT IVT = EVT::getIntegerVT(*DAG.getContext(), NumBits); // Convert to an integer of the same size. if (TLI.isTypeLegal(IVT)) { State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value); State.SignMask = APInt::getSignMask(NumBits); State.SignBit = NumBits - 1; return; } auto &DataLayout = DAG.getDataLayout(); // Store the float to memory, then load the sign part out as an integer. MVT LoadTy = TLI.getRegisterType(*DAG.getContext(), MVT::i8); // First create a temporary that is aligned for both the load and store. SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy); int FI = cast(StackPtr.getNode())->getIndex(); // Then store the float to it. State.FloatPtr = StackPtr; MachineFunction &MF = DAG.getMachineFunction(); State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI); State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr, State.FloatPointerInfo); SDValue IntPtr; if (DataLayout.isBigEndian()) { assert(FloatVT.isByteSized() && "Unsupported floating point type!"); // Load out a legal integer with the same sign bit as the float. IntPtr = StackPtr; State.IntPointerInfo = State.FloatPointerInfo; } else { // Advance the pointer so that the loaded byte will contain the sign bit. unsigned ByteOffset = (FloatVT.getSizeInBits() / 8) - 1; IntPtr = DAG.getNode(ISD::ADD, DL, StackPtr.getValueType(), StackPtr, DAG.getConstant(ByteOffset, DL, StackPtr.getValueType())); State.IntPointerInfo = MachinePointerInfo::getFixedStack(MF, FI, ByteOffset); } State.IntPtr = IntPtr; State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain, IntPtr, State.IntPointerInfo, MVT::i8); State.SignMask = APInt::getOneBitSet(LoadTy.getSizeInBits(), 7); State.SignBit = 7; } /// Replace the integer value produced by getSignAsIntValue() with a new value /// and cast the result back to a floating-point type. SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL, SDValue NewIntValue) const { if (!State.Chain) return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue); // Override the part containing the sign bit in the value stored on the stack. SDValue Chain = DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr, State.IntPointerInfo, MVT::i8); return DAG.getLoad(State.FloatVT, DL, Chain, State.FloatPtr, State.FloatPointerInfo); } SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const { SDLoc DL(Node); SDValue Mag = Node->getOperand(0); SDValue Sign = Node->getOperand(1); // Get sign bit into an integer value. FloatSignAsInt SignAsInt; getSignAsIntValue(SignAsInt, DL, Sign); EVT IntVT = SignAsInt.IntValue.getValueType(); SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT); SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue, SignMask); // If FABS is legal transform FCOPYSIGN(x, y) => sign(x) ? 
//   -FABS(x) : FABS(X)
  EVT FloatVT = Mag.getValueType();
  if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) &&
      TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) {
    SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag);
    SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue);
    SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit,
                                DAG.getConstant(0, DL, IntVT), ISD::SETNE);
    return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue);
  }

  // Transform Mag value to integer, and clear the sign bit.
  FloatSignAsInt MagAsInt;
  getSignAsIntValue(MagAsInt, DL, Mag);
  EVT MagVT = MagAsInt.IntValue.getValueType();
  SDValue ClearSignMask = DAG.getConstant(~MagAsInt.SignMask, DL, MagVT);
  SDValue ClearedSign = DAG.getNode(ISD::AND, DL, MagVT, MagAsInt.IntValue,
                                    ClearSignMask);

  // Get the signbit at the right position for MagAsInt.
  int ShiftAmount = SignAsInt.SignBit - MagAsInt.SignBit;
+  EVT ShiftVT = IntVT;
+  if (SignBit.getValueSizeInBits() < ClearedSign.getValueSizeInBits()) {
+    SignBit = DAG.getNode(ISD::ZERO_EXTEND, DL, MagVT, SignBit);
+    ShiftVT = MagVT;
+  }
+  if (ShiftAmount > 0) {
+    SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, ShiftVT);
+    SignBit = DAG.getNode(ISD::SRL, DL, ShiftVT, SignBit, ShiftCnst);
+  } else if (ShiftAmount < 0) {
+    SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, ShiftVT);
+    SignBit = DAG.getNode(ISD::SHL, DL, ShiftVT, SignBit, ShiftCnst);
+  }
  if (SignBit.getValueSizeInBits() > ClearedSign.getValueSizeInBits()) {
-    if (ShiftAmount > 0) {
-      SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, IntVT);
-      SignBit = DAG.getNode(ISD::SRL, DL, IntVT, SignBit, ShiftCnst);
-    } else if (ShiftAmount < 0) {
-      SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, IntVT);
-      SignBit = DAG.getNode(ISD::SHL, DL, IntVT, SignBit, ShiftCnst);
-    }
    SignBit = DAG.getNode(ISD::TRUNCATE, DL, MagVT, SignBit);
-  } else if (SignBit.getValueSizeInBits() < ClearedSign.getValueSizeInBits()) {
-    SignBit = DAG.getNode(ISD::ZERO_EXTEND, DL, MagVT, SignBit);
-    if (ShiftAmount > 0) {
-      SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, MagVT);
-      SignBit = DAG.getNode(ISD::SRL, DL, MagVT, SignBit, ShiftCnst);
-    } else if (ShiftAmount < 0) {
-      SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, MagVT);
-      SignBit = DAG.getNode(ISD::SHL, DL, MagVT, SignBit, ShiftCnst);
-    }
  }

  // Store the part with the modified sign and convert back to float.
  SDValue CopiedSign = DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit);
  return modifySignAsInt(MagAsInt, DL, CopiedSign);
}

SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const {
  SDLoc DL(Node);
  SDValue Value = Node->getOperand(0);

  // Transform FABS(x) => FCOPYSIGN(x, 0.0) if FCOPYSIGN is legal.
  EVT FloatVT = Value.getValueType();
  if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT)) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, FloatVT);
    return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Value, Zero);
  }

  // Transform value to integer, clear the sign bit and transform back.
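To see why ExpandFCOPYSIGN needs the mixed-width handling that this hunk rearranges, consider copying the sign of a double onto a float: the sign bit has to travel from bit 63 down to bit 31. A minimal scalar sketch, assuming IEEE-754 layout; it is an illustration of the bit manipulation, not the DAG expansion itself:

#include <cstdint>
#include <cstring>

// copysign(float Mag, double Sign) by hand: ShiftAmount = 63 - 31 = 32,
// so the sign bit is shifted down in the wide integer type first and only
// then narrowed to the magnitude's width before being ORed back in.
float copySignFromDouble(float Mag, double Sign) {
  uint64_t SignBits;
  uint32_t MagBits;
  std::memcpy(&SignBits, &Sign, sizeof(SignBits));
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  uint64_t SignBit = SignBits & 0x8000000000000000ULL; // isolate the sign bit
  SignBit >>= 32;                                      // SRL by ShiftAmount
  uint32_t Copied = (MagBits & 0x7fffffffU)            // cleared sign
                    | static_cast<uint32_t>(SignBit);  // truncate + OR
  float Result;
  std::memcpy(&Result, &Copied, sizeof(Result));
  return Result;
}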
FloatSignAsInt ValueAsInt; getSignAsIntValue(ValueAsInt, DL, Value); EVT IntVT = ValueAsInt.IntValue.getValueType(); SDValue ClearSignMask = DAG.getConstant(~ValueAsInt.SignMask, DL, IntVT); SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, ValueAsInt.IntValue, ClearSignMask); return modifySignAsInt(ValueAsInt, DL, ClearedSign); } void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, SmallVectorImpl &Results) { unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); SDLoc dl(Node); EVT VT = Node->getValueType(0); SDValue Tmp1 = SDValue(Node, 0); SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); SDValue Chain = Tmp1.getOperand(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); unsigned StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); Results.push_back(Tmp1); Results.push_back(Tmp2); } /// Legalize a SETCC with given LHS and RHS and condition code CC on the current /// target. /// /// If the SETCC has been legalized using AND / OR, then the legalized node /// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert /// will be set to false. /// /// If the SETCC has been legalized by using getSetCCSwappedOperands(), /// then the values of LHS and RHS will be swapped, CC will be set to the /// new condition, and NeedInvert will be set to false. /// /// If the SETCC has been legalized using the inverse condcode, then LHS and /// RHS will be unchanged, CC will set to the inverted condcode, and NeedInvert /// will be set to true. The caller must invert the result of the SETCC with /// SelectionDAG::getLogicalNOT() or take equivalent action to swap the effect /// of a true/false result. /// /// \returns true if the SetCC has been legalized, false if it hasn't. bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, bool &NeedInvert, const SDLoc &dl) { MVT OpVT = LHS.getSimpleValueType(); ISD::CondCode CCCode = cast(CC)->get(); NeedInvert = false; bool NeedSwap = false; switch (TLI.getCondCodeAction(CCCode, OpVT)) { default: llvm_unreachable("Unknown condition code action!"); case TargetLowering::Legal: // Nothing to do. break; case TargetLowering::Expand: { ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode); if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { std::swap(LHS, RHS); CC = DAG.getCondCode(InvCC); return true; } // Swapping operands didn't work. Try inverting the condition. InvCC = getSetCCInverse(CCCode, OpVT.isInteger()); if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { // If inverting the condition is not enough, try swapping operands // on top of it. 
InvCC = ISD::getSetCCSwappedOperands(InvCC); NeedSwap = true; } if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { CC = DAG.getCondCode(InvCC); NeedInvert = true; if (NeedSwap) std::swap(LHS, RHS); return true; } ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID; unsigned Opc = 0; switch (CCCode) { default: llvm_unreachable("Don't know how to expand this condition!"); case ISD::SETO: assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) && "If SETO is expanded, SETOEQ must be legal!"); CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break; case ISD::SETUO: assert(TLI.isCondCodeLegal(ISD::SETUNE, OpVT) && "If SETUO is expanded, SETUNE must be legal!"); CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR; break; case ISD::SETOEQ: case ISD::SETOGT: case ISD::SETOGE: case ISD::SETOLT: case ISD::SETOLE: case ISD::SETONE: case ISD::SETUEQ: case ISD::SETUNE: case ISD::SETUGT: case ISD::SETUGE: case ISD::SETULT: case ISD::SETULE: // If we are floating point, assign and break, otherwise fall through. if (!OpVT.isInteger()) { // We can use the 4th bit to tell if we are the unordered // or ordered version of the opcode. CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO; Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND; CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10); break; } // Fallthrough if we are unsigned integer. LLVM_FALLTHROUGH; case ISD::SETLE: case ISD::SETGT: case ISD::SETGE: case ISD::SETLT: case ISD::SETNE: case ISD::SETEQ: // If all combinations of inverting the condition and swapping operands // didn't work then we have no means to expand the condition. llvm_unreachable("Don't know how to expand this condition!"); } SDValue SetCC1, SetCC2; if (CCCode != ISD::SETO && CCCode != ISD::SETUO) { // If we aren't the ordered or unorder operation, // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS). SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1); SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2); } else { // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS) SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1); SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2); } LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2); RHS = SDValue(); CC = SDValue(); return true; } } return false; } /// Emit a store/load combination to the stack. This stores /// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does /// a load from the stack slot to DestVT, extending it if needed. /// The resultant code need not be legal. SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl) { // Create the stack frame object. unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment( SrcOp.getValueType().getTypeForEVT(*DAG.getContext())); SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign); FrameIndexSDNode *StackPtrFI = cast(FIPtr); int SPFI = StackPtrFI->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); unsigned SrcSize = SrcOp.getValueSizeInBits(); unsigned SlotSize = SlotVT.getSizeInBits(); unsigned DestSize = DestVT.getSizeInBits(); Type *DestType = DestVT.getTypeForEVT(*DAG.getContext()); unsigned DestAlign = DAG.getDataLayout().getPrefTypeAlignment(DestType); // Emit a store to the stack slot. Use a truncstore if the input value is // later than DestVT. 
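A concrete instance of the condcode expansion in LegalizeSetCCCondCode above: a target without SETOLT gets two legal comparisons glued together with AND. This is a scalar sketch only; C++'s < is itself an ordered comparison, so the split below mirrors the structure of the two SETCC nodes rather than being necessary in source code:

#include <cmath>

// SETOLT expands to (LHS SETLT RHS) AND (LHS SETO RHS):
// "a is less than b, and neither operand is NaN".
bool setOLT(double A, double B) {
  bool Ordered = !std::isnan(A) && !std::isnan(B); // the SETO half
  bool Less    = A < B;                            // the SETLT half
  return Ordered && Less;                          // Opc == ISD::AND
}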
SDValue Store; if (SrcSize > SlotSize) Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo, SlotVT, SrcAlign); else { assert(SrcSize == SlotSize && "Invalid store"); Store = DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo, SrcAlign); } // Result is a load from the stack slot. if (SlotSize == DestSize) return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign); assert(SlotSize < DestSize && "Unknown extension!"); return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT, DestAlign); } SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) { SDLoc dl(Node); // Create a vector sized/aligned stack slot, store the value to element #0, // then load the whole vector back out. SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0)); FrameIndexSDNode *StackPtrFI = cast(StackPtr); int SPFI = StackPtrFI->getIndex(); SDValue Ch = DAG.getTruncStore( DAG.getEntryNode(), dl, Node->getOperand(0), StackPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), Node->getValueType(0).getVectorElementType()); return DAG.getLoad( Node->getValueType(0), dl, Ch, StackPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI)); } static bool ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG, const TargetLowering &TLI, SDValue &Res) { unsigned NumElems = Node->getNumOperands(); SDLoc dl(Node); EVT VT = Node->getValueType(0); // Try to group the scalars into pairs, shuffle the pairs together, then // shuffle the pairs of pairs together, etc. until the vector has // been built. This will work only if all of the necessary shuffle masks // are legal. // We do this in two phases; first to check the legality of the shuffles, // and next, assuming that all shuffles are legal, to create the new nodes. for (int Phase = 0; Phase < 2; ++Phase) { SmallVector>, 16> IntermedVals, NewIntermedVals; for (unsigned i = 0; i < NumElems; ++i) { SDValue V = Node->getOperand(i); if (V.isUndef()) continue; SDValue Vec; if (Phase) Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, V); IntermedVals.push_back(std::make_pair(Vec, SmallVector(1, i))); } while (IntermedVals.size() > 2) { NewIntermedVals.clear(); for (unsigned i = 0, e = (IntermedVals.size() & ~1u); i < e; i += 2) { // This vector and the next vector are shuffled together (simply to // append the one to the other). SmallVector ShuffleVec(NumElems, -1); SmallVector FinalIndices; FinalIndices.reserve(IntermedVals[i].second.size() + IntermedVals[i+1].second.size()); int k = 0; for (unsigned j = 0, f = IntermedVals[i].second.size(); j != f; ++j, ++k) { ShuffleVec[k] = j; FinalIndices.push_back(IntermedVals[i].second[j]); } for (unsigned j = 0, f = IntermedVals[i+1].second.size(); j != f; ++j, ++k) { ShuffleVec[k] = NumElems + j; FinalIndices.push_back(IntermedVals[i+1].second[j]); } SDValue Shuffle; if (Phase) Shuffle = DAG.getVectorShuffle(VT, dl, IntermedVals[i].first, IntermedVals[i+1].first, ShuffleVec); else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT)) return false; NewIntermedVals.push_back( std::make_pair(Shuffle, std::move(FinalIndices))); } // If we had an odd number of defined values, then append the last // element to the array of new vectors. 
if ((IntermedVals.size() & 1) != 0) NewIntermedVals.push_back(IntermedVals.back()); IntermedVals.swap(NewIntermedVals); } assert(IntermedVals.size() <= 2 && IntermedVals.size() > 0 && "Invalid number of intermediate vectors"); SDValue Vec1 = IntermedVals[0].first; SDValue Vec2; if (IntermedVals.size() > 1) Vec2 = IntermedVals[1].first; else if (Phase) Vec2 = DAG.getUNDEF(VT); SmallVector ShuffleVec(NumElems, -1); for (unsigned i = 0, e = IntermedVals[0].second.size(); i != e; ++i) ShuffleVec[IntermedVals[0].second[i]] = i; for (unsigned i = 0, e = IntermedVals[1].second.size(); i != e; ++i) ShuffleVec[IntermedVals[1].second[i]] = NumElems + i; if (Phase) Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec); else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT)) return false; } return true; } /// Expand a BUILD_VECTOR node on targets that don't /// support the operation, but do support the resultant vector type. SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { unsigned NumElems = Node->getNumOperands(); SDValue Value1, Value2; SDLoc dl(Node); EVT VT = Node->getValueType(0); EVT OpVT = Node->getOperand(0).getValueType(); EVT EltVT = VT.getVectorElementType(); // If the only non-undef value is the low element, turn this into a // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X. bool isOnlyLowElement = true; bool MoreThanTwoValues = false; bool isConstant = true; for (unsigned i = 0; i < NumElems; ++i) { SDValue V = Node->getOperand(i); if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; if (!Value1.getNode()) { Value1 = V; } else if (!Value2.getNode()) { if (V != Value1) Value2 = V; } else if (V != Value1 && V != Value2) { MoreThanTwoValues = true; } } if (!Value1.getNode()) return DAG.getUNDEF(VT); if (isOnlyLowElement) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0)); // If all elements are constants, create a load from the constant pool. if (isConstant) { SmallVector CV; for (unsigned i = 0, e = NumElems; i != e; ++i) { if (ConstantFPSDNode *V = dyn_cast(Node->getOperand(i))) { CV.push_back(const_cast(V->getConstantFPValue())); } else if (ConstantSDNode *V = dyn_cast(Node->getOperand(i))) { if (OpVT==EltVT) CV.push_back(const_cast(V->getConstantIntValue())); else { // If OpVT and EltVT don't match, EltVT is not legal and the // element values have been promoted/truncated earlier. Undo this; // we don't want a v16i8 to become a v16i32 for example. const ConstantInt *CI = V->getConstantIntValue(); CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()), CI->getZExtValue())); } } else { assert(Node->getOperand(i).isUndef()); Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext()); CV.push_back(UndefValue::get(OpNTy)); } } Constant *CP = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CPIdx)->getAlignment(); return DAG.getLoad( VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); } SmallSet DefinedValues; for (unsigned i = 0; i < NumElems; ++i) { if (Node->getOperand(i).isUndef()) continue; DefinedValues.insert(Node->getOperand(i)); } if (TLI.shouldExpandBuildVectorWithShuffles(VT, DefinedValues.size())) { if (!MoreThanTwoValues) { SmallVector ShuffleVec(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { SDValue V = Node->getOperand(i); if (V.isUndef()) continue; ShuffleVec[i] = V == Value1 ? 
0 : NumElems; } if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) { // Get the splatted value into the low element of a vector register. SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1); SDValue Vec2; if (Value2.getNode()) Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2); else Vec2 = DAG.getUNDEF(VT); // Return shuffle(LowValVec, undef, <0,0,0,0>) return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec); } } else { SDValue Res; if (ExpandBVWithShuffles(Node, DAG, TLI, Res)) return Res; } } // Otherwise, we can't handle this case efficiently. return ExpandVectorBuildThroughStack(Node); } // Expand a node into a call to a libcall. If the result value // does not fit into a register, return the lo part and set the hi part to the // by-reg argument. If it does fit into a single register, return the result // and leave the Hi part unset. SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); Entry.IsZExt = !TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); EVT RetVT = Node->getValueType(0); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); // By default, the input chain to this libcall is the entry node of the // function. If the libcall is going to be emitted as a tail call then // TLI.isUsedByReturnOnly will change it to the right chain if the return // node which is being folded has a non-entry input chain. SDValue InChain = DAG.getEntryNode(); // isTailCall may be true since the callee does not reference caller stack // frame. Check if it's in the right position and that the return types match. SDValue TCChain = InChain; const Function &F = DAG.getMachineFunction().getFunction(); bool isTailCall = TLI.isInTailCallPosition(DAG, Node, TCChain) && (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy()); if (isTailCall) InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned); CLI.setDebugLoc(SDLoc(Node)) .setChain(InChain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setTailCall(isTailCall) .setSExtResult(signExtend) .setZExtResult(!signExtend) .setIsPostTypeLegalization(true); std::pair CallInfo = TLI.LowerCallTo(CLI); if (!CallInfo.second.getNode()) { LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump()); // It's a tailcall, return the chain (which is the DAG root). return DAG.getRoot(); } LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump()); return CallInfo.first; } /// Generate a libcall taking the given operands as arguments /// and returning a result of type RetVT. 
SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops, unsigned NumOps, bool isSigned, const SDLoc &dl) { TargetLowering::ArgListTy Args; Args.reserve(NumOps); TargetLowering::ArgListEntry Entry; for (unsigned i = 0; i != NumOps; ++i) { Entry.Node = Ops[i]; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setSExtResult(isSigned) .setZExtResult(!isSigned) .setIsPostTypeLegalization(true); std::pair CallInfo = TLI.LowerCallTo(CLI); return CallInfo.first; } // Expand a node into a call to a libcall. Similar to // ExpandLibCall except that the first operand is the in-chain. std::pair SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { SDValue InChain = Node->getOperand(0); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) { EVT ArgVT = Node->getOperand(i).getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(SDLoc(Node)) .setChain(InChain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setSExtResult(isSigned) .setZExtResult(!isSigned); std::pair CallInfo = TLI.LowerCallTo(CLI); return CallInfo; } SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { if (Node->isStrictFPOpcode()) Node = DAG.mutateStrictFPToFP(Node); RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::f32: LC = Call_F32; break; case MVT::f64: LC = Call_F64; break; case MVT::f80: LC = Call_F80; break; case MVT::f128: LC = Call_F128; break; case MVT::ppcf128: LC = Call_PPCF128; break; } return ExpandLibCall(LC, Node, false); } SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, RTLIB::Libcall Call_I8, RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128) { RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = Call_I8; break; case MVT::i16: LC = Call_I16; break; case MVT::i32: LC = Call_I32; break; case MVT::i64: LC = Call_I64; break; case MVT::i128: LC = Call_I128; break; } return ExpandLibCall(LC, Node, isSigned); } /// Issue libcalls to __{u}divmod to compute div / rem pairs. 
void SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl &Results) { unsigned Opcode = Node->getOpcode(); bool isSigned = Opcode == ISD::SDIVREM; RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; } // The input chain to this libcall is the entry node of the function. // Legalizing the call will automatically add the previous call to the // dependence. SDValue InChain = DAG.getEntryNode(); EVT RetVT = Node->getValueType(0); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (const SDValue &Op : Node->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); } // Also pass the return address of the remainder. SDValue FIPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = FIPtr; Entry.Ty = RetTy->getPointerTo(); Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(InChain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setSExtResult(isSigned) .setZExtResult(!isSigned); std::pair CallInfo = TLI.LowerCallTo(CLI); // Remainder is loaded back from the stack frame. SDValue Rem = DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo()); Results.push_back(CallInfo.first); Results.push_back(Rem); } /// Return true if sincos libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::f32: LC = RTLIB::SINCOS_F32; break; case MVT::f64: LC = RTLIB::SINCOS_F64; break; case MVT::f80: LC = RTLIB::SINCOS_F80; break; case MVT::f128: LC = RTLIB::SINCOS_F128; break; case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; } return TLI.getLibcallName(LC) != nullptr; } /// Only issue sincos libcall if both sin and cos are needed. static bool useSinCos(SDNode *Node) { unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN ? ISD::FCOS : ISD::FSIN; SDValue Op0 = Node->getOperand(0); for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), UE = Op0.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User == Node) continue; // The other user might have been turned into sincos already. if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS) return true; } return false; } /// Issue libcalls to sincos to compute sin / cos pairs. 
void SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl &Results) { RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::f32: LC = RTLIB::SINCOS_F32; break; case MVT::f64: LC = RTLIB::SINCOS_F64; break; case MVT::f80: LC = RTLIB::SINCOS_F80; break; case MVT::f128: LC = RTLIB::SINCOS_F128; break; case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; } // The input chain to this libcall is the entry node of the function. // Legalizing the call will automatically add the previous call to the // dependence. SDValue InChain = DAG.getEntryNode(); EVT RetVT = Node->getValueType(0); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; // Pass the argument. Entry.Node = Node->getOperand(0); Entry.Ty = RetTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); // Pass the return address of sin. SDValue SinPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = SinPtr; Entry.Ty = RetTy->getPointerTo(); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); // Also pass the return address of the cos. SDValue CosPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = CosPtr; Entry.Ty = RetTy->getPointerTo(); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); std::pair CallInfo = TLI.LowerCallTo(CLI); Results.push_back( DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo())); Results.push_back( DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo())); } /// This function is responsible for legalizing a /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are /// legal for the target. SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, const SDLoc &dl) { // TODO: Should any fast-math-flags be set for the created nodes? LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n"); if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) { LLVM_DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double " "expansion\n"); // Get the stack frame index of a 8 byte buffer. 
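ExpandSinCosLibCall above passes two freshly created stack slots as out-pointers and loads both results back. A minimal caller-side sketch of that contract, assuming a GNU-style sincos entry point is what RTLIB::SINCOS_F64 resolves to on the target; the declaration and its availability are an assumption, not something this patch establishes:

extern "C" void sincos(double X, double *Sin, double *Cos); // assumed libm extension

// Mirror of the expansion: one call, two out-parameters standing in for the
// SinPtr / CosPtr stack temporaries, results read back afterwards.
void sinAndCos(double X, double &S, double &C) {
  double SinSlot, CosSlot;
  sincos(X, &SinSlot, &CosSlot);
  S = SinSlot;
  C = CosSlot;
}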
SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64); // word offset constant for Hi/Lo address computation SDValue WordOff = DAG.getConstant(sizeof(int), dl, StackSlot.getValueType()); // set up Hi and Lo (into buffer) address based on endian SDValue Hi = StackSlot; SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(), StackSlot, WordOff); if (DAG.getDataLayout().isLittleEndian()) std::swap(Hi, Lo); // if signed map to unsigned space SDValue Op0Mapped; if (isSigned) { // constant used to invert sign bit (signed to unsigned mapping) SDValue SignBit = DAG.getConstant(0x80000000u, dl, MVT::i32); Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit); } else { Op0Mapped = Op0; } // store the lo of the constructed double - based on integer input SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op0Mapped, Lo, MachinePointerInfo()); // initial hi portion of constructed double SDValue InitialHi = DAG.getConstant(0x43300000u, dl, MVT::i32); // store the hi of the constructed double - biased exponent SDValue Store2 = DAG.getStore(Store1, dl, InitialHi, Hi, MachinePointerInfo()); // load the constructed double SDValue Load = DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo()); // FP constant to bias correct the final result SDValue Bias = DAG.getConstantFP(isSigned ? BitsToDouble(0x4330000080000000ULL) : BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // subtract the bias SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); // final result SDValue Result; // handle final rounding if (DestVT == MVT::f64) { // do nothing Result = Sub; } else if (DestVT.bitsLT(MVT::f64)) { Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0, dl)); } else if (DestVT.bitsGT(MVT::f64)) { Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); } return Result; } assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); // Code below here assumes !isSigned without checking again. // Implementation of unsigned i64 to f64 following the algorithm in // __floatundidf in compiler_rt. This implementation has the advantage // of performing rounding correctly, both in the default rounding mode // and in all alternate rounding modes. // TODO: Generalize this for use with other types. if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) { LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n"); SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64); SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl, MVT::f64); SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64); SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32); SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, DAG.getConstant(32, dl, MVT::i64)); SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52); SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84); SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr); SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr); SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt, TwoP84PlusTwoP52); return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub); } // TODO: Generalize this for use with other types. if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) { LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n"); // For unsigned conversions, convert them to signed conversions using the // algorithm from the x86_64 __floatundidf in compiler_rt. 
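The 32-bit INT_TO_FP expansion above builds a double whose high word is 0x43300000 and whose low word is the integer, then subtracts a bias. A scalar model of the same constants, assembled as one 64-bit pattern instead of two 32-bit stores; the function names are illustrative:

#include <cstdint>
#include <cstring>

// Unsigned case: the bit pattern 0x43300000'xxxxxxxx represents 2^52 + X,
// so subtracting 2^52 (BitsToDouble(0x4330000000000000)) yields X exactly.
double u32ToF64(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1p52;
}

// Signed case: the input is first mapped into unsigned space by flipping the
// sign bit, and the bias 0x4330000080000000 (2^52 + 2^31) is subtracted.
double i32ToF64(int32_t X) {
  uint32_t Mapped = static_cast<uint32_t>(X) ^ 0x80000000U;
  uint64_t Bits = 0x4330000000000000ULL | Mapped;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - (0x1p52 + 0x1p31);
}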
if (!isSigned) { SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0); SDValue ShiftConst = DAG.getConstant( 1, dl, TLI.getShiftAmountTy(Op0.getValueType(), DAG.getDataLayout())); SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst); SDValue AndConst = DAG.getConstant(1, dl, MVT::i64); SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr); SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or); SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt); // TODO: This really should be implemented using a branch rather than a // select. We happen to get lucky and machinesink does the right // thing most of the time. This would be a good candidate for a //pseudo-op, or, even better, for whole-function isel. SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast); } // Otherwise, implement the fully general conversion. SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, DAG.getConstant(UINT64_C(0xfffffffffffff800), dl, MVT::i64)); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, DAG.getConstant(UINT64_C(0x800), dl, MVT::i64)); SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, DAG.getConstant(UINT64_C(0x7ff), dl, MVT::i64)); SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), And2, DAG.getConstant(UINT64_C(0), dl, MVT::i64), ISD::SETNE); SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0); SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0, DAG.getConstant(UINT64_C(0x0020000000000000), dl, MVT::i64), ISD::SETUGE); SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0); EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout()); SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2, DAG.getConstant(32, dl, SHVT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh); SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc); SDValue TwoP32 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), dl, MVT::f64); SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt); SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2); SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo); SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2); return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd, DAG.getIntPtrConstant(0, dl)); } SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Op0.getValueType()), Op0, DAG.getConstant(0, dl, Op0.getValueType()), ISD::SETLT); SDValue Zero = DAG.getIntPtrConstant(0, dl), Four = DAG.getIntPtrConstant(4, dl); SDValue CstOffset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero); // If the sign bit of the integer is set, the large number will be treated // as a negative number. To counteract this, the dynamic code adds an // offset depending on the data type. 
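  // Illustrative note (not from the original source): for an unsigned
  // operand, the SINT_TO_FP above reinterprets the bits as signed, so a value
  // with the sign bit set comes out as (X - 2^N). The constant pool built
  // below packs 0.0f and 2^N (as an f32 bit pattern) into one 64-bit slot;
  // the select picks the 4-byte offset of 2^N exactly when the sign bit was
  // set, and the final FADD adds the correction back. For example an i32
  // operand 0xFFFFFFFF first converts to -1.0 and is then corrected by
  // adding 2^32.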
uint64_t FF; switch (Op0.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unsupported integer type!"); case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float) case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float) case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float) case MVT::i64: FF = 0x5F800000ULL; break; // 2^64 (as a float) } if (DAG.getDataLayout().isLittleEndian()) FF <<= 32; Constant *FudgeFactor = ConstantInt::get( Type::getInt64Ty(*DAG.getContext()), FF); SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CPIdx)->getAlignment(); CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset); Alignment = std::min(Alignment, 4u); SDValue FudgeInReg; if (DestVT == MVT::f32) FudgeInReg = DAG.getLoad( MVT::f32, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); else { SDValue Load = DAG.getExtLoad( ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, Alignment); HandleSDNode Handle(Load); LegalizeOp(Load.getNode()); FudgeInReg = Handle.getValue(); } return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg); } /// This function is responsible for legalizing a /// *INT_TO_FP operation of the specified operand when the target requests that /// we promote it. At this point, we know that the result and operand types are /// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP /// operation that takes a larger input. SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl) { // First step, figure out the appropriate *INT_TO_FP operation to use. EVT NewInTy = LegalOp.getValueType(); unsigned OpToUse = 0; // Scan for the appropriate larger type to use. while (true) { NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT().SimpleTy+1); assert(NewInTy.isInteger() && "Ran out of possibilities!"); // If the target supports SINT_TO_FP of this type, use it. if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, NewInTy)) { OpToUse = ISD::SINT_TO_FP; break; } if (isSigned) continue; // If the target supports UINT_TO_FP of this type, use it. if (TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, NewInTy)) { OpToUse = ISD::UINT_TO_FP; break; } // Otherwise, try a larger type. } // Okay, we found the operation and type to use. Zero extend our input to the // desired type then run the operation on it. return DAG.getNode(OpToUse, dl, DestVT, DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NewInTy, LegalOp)); } /// This function is responsible for legalizing a /// FP_TO_*INT operation of the specified operand when the target requests that /// we promote it. At this point, we know that the result and operand types are /// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT /// operation that returns a larger result. SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl) { // First step, figure out the appropriate FP_TO*INT operation to use. EVT NewOutTy = DestVT; unsigned OpToUse = 0; // Scan for the appropriate larger type to use. 
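  // Illustrative note (not from the original source): e.g. an FP_TO_UINT
  // producing i16 can be promoted to an FP_TO_SINT producing i32, since every
  // u16 value (0..65535) is representable as a non-negative i32; the wider
  // result is truncated back to i16 afterwards.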
while (true) { NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1); assert(NewOutTy.isInteger() && "Ran out of possibilities!"); // A larger signed type can hold all unsigned values of the requested type, // so using FP_TO_SINT is valid if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) { OpToUse = ISD::FP_TO_SINT; break; } // However, if the value may be < 0.0, we *must* use some FP_TO_SINT. if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) { OpToUse = ISD::FP_TO_UINT; break; } // Otherwise, try a larger type. } // Okay, we found the operation and type to use. SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp); // Truncate the result of the extended FP_TO_*INT operation to the desired // size. return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation); } /// Legalize a BITREVERSE scalar/vector operation as a series of mask + shifts. SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) { EVT VT = Op.getValueType(); EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); unsigned Sz = VT.getScalarSizeInBits(); SDValue Tmp, Tmp2, Tmp3; // If we can, perform BSWAP first and then the mask+swap the i4, then i2 // and finally the i1 pairs. // TODO: We can easily support i4/i2 legal types if any target ever does. if (Sz >= 8 && isPowerOf2_32(Sz)) { // Create the masks - repeating the pattern every byte. APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0); APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0); for (unsigned J = 0; J != Sz; J += 8) { MaskHi4 = MaskHi4 | (0xF0ull << J); MaskLo4 = MaskLo4 | (0x0Full << J); MaskHi2 = MaskHi2 | (0xCCull << J); MaskLo2 = MaskLo2 | (0x33ull << J); MaskHi1 = MaskHi1 | (0xAAull << J); MaskLo1 = MaskLo1 | (0x55ull << J); } // BSWAP if the type is wider than a single byte. Tmp = (Sz > 8 ? 
DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op); // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4) Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT)); Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, VT)); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2) Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT)); Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, VT)); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1) Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT)); Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, VT)); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); return Tmp; } Tmp = DAG.getConstant(0, dl, VT); for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) { if (I < J) Tmp2 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT)); else Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT)); APInt Shift(Sz, 1); Shift <<= J; Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2); } return Tmp; } /// Open code the operations for BSWAP of the specified operation. SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) { EVT VT = Op.getValueType(); EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8; switch (VT.getSimpleVT().getScalarType().SimpleTy) { default: llvm_unreachable("Unhandled Expand type in BSWAP!"); case MVT::i16: Tmp2 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); return DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); case MVT::i32: Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(0xFF0000, dl, VT)); Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, dl, VT)); Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3); Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1); return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2); case MVT::i64: Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT)); Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, dl, SHVT)); Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT)); Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT)); Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7, DAG.getConstant(255ULL<<48, dl, 
VT)); Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6, DAG.getConstant(255ULL<<40, dl, VT)); Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5, DAG.getConstant(255ULL<<32, dl, VT)); Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4, DAG.getConstant(255ULL<<24, dl, VT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(255ULL<<16, dl, VT)); Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(255ULL<<8 , dl, VT)); Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7); Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5); Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3); Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1); Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp6); Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2); return DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp4); } } /// Expand the specified bitcount instruction into operations. SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl) { switch (Opc) { default: llvm_unreachable("Cannot expand this yet!"); case ISD::CTPOP: { EVT VT = Op.getValueType(); EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); unsigned Len = VT.getSizeInBits(); assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 && "CTPOP not implemented for this type."); // This is the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel SDValue Mask55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT); SDValue Mask33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT); SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT); SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); // v = v - ((v >> 1) & 0x55555555...) Op = DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(1, dl, ShVT)), Mask55)); // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) Op = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::AND, dl, VT, Op, Mask33), DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(2, dl, ShVT)), Mask33)); // v = (v + (v >> 4)) & 0x0F0F0F0F... Op = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(4, dl, ShVT))), Mask0F); // v = (v * 0x01010101...) >> (Len - 8) Op = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01), DAG.getConstant(Len - 8, dl, ShVT)); return Op; } case ISD::CTLZ_ZERO_UNDEF: // This trivially expands to CTLZ. return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op); case ISD::CTLZ: { EVT VT = Op.getValueType(); unsigned Len = VT.getSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, DAG.getConstant(Len, dl, VT), CTLZ); } // for now, we do this: // x = x | (x >> 1); // x = x | (x >> 2); // ... 
// x = x | (x >>16); // x = x | (x >>32); // for 64-bit input // return popcount(~x); // // Ref: "Hacker's Delight" by Henry Warren EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) { SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT); Op = DAG.getNode(ISD::OR, dl, VT, Op, DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3)); } Op = DAG.getNOT(dl, Op, VT); return DAG.getNode(ISD::CTPOP, dl, VT, Op); } case ISD::CTTZ_ZERO_UNDEF: // This trivially expands to CTTZ. return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op); case ISD::CTTZ: { EVT VT = Op.getValueType(); unsigned Len = VT.getSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, DAG.getConstant(Len, dl, VT), CTTZ); } // for now, we use: { return popcount(~x & (x - 1)); } // unless the target has ctlz but not ctpop, in which case we use: // { return 32 - nlz(~x & (x-1)); } // Ref: "Hacker's Delight" by Henry Warren SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT), DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT))); // If ISD::CTLZ is legal and CTPOP isn't, then do that instead. if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) && TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(VT.getSizeInBits(), dl, VT), DAG.getNode(ISD::CTLZ, dl, VT, Tmp3)); return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3); } } } bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { LLVM_DEBUG(dbgs() << "Trying to expand node\n"); SmallVector Results; SDLoc dl(Node); SDValue Tmp1, Tmp2, Tmp3, Tmp4; bool NeedInvert; switch (Node->getOpcode()) { case ISD::CTPOP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl); Results.push_back(Tmp1); break; case ISD::BITREVERSE: Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl)); break; case ISD::BSWAP: Results.push_back(ExpandBSWAP(Node->getOperand(0), dl)); break; case ISD::FRAMEADDR: case ISD::RETURNADDR: case ISD::FRAME_TO_ARGS_OFFSET: Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); break; case ISD::EH_DWARF_CFA: { SDValue CfaArg = DAG.getSExtOrTrunc(Node->getOperand(0), dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Offset = DAG.getNode(ISD::ADD, dl, CfaArg.getValueType(), DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl, CfaArg.getValueType()), CfaArg); SDValue FA = DAG.getNode( ISD::FRAMEADDR, dl, TLI.getPointerTy(DAG.getDataLayout()), DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()))); Results.push_back(DAG.getNode(ISD::ADD, dl, FA.getValueType(), FA, Offset)); break; } case ISD::FLT_ROUNDS_: Results.push_back(DAG.getConstant(1, dl, Node->getValueType(0))); break; case ISD::EH_RETURN: case ISD::EH_LABEL: case ISD::PREFETCH: case ISD::VAEND: case ISD::EH_SJLJ_LONGJMP: // If the target didn't expand these, there's nothing to do, so just // preserve the chain and be done. Results.push_back(Node->getOperand(0)); break; case ISD::READCYCLECOUNTER: // If the target didn't expand this, just return 'zero' and preserve the // chain. 
Results.append(Node->getNumValues() - 1, DAG.getConstant(0, dl, Node->getValueType(0))); Results.push_back(Node->getOperand(0)); break; case ISD::EH_SJLJ_SETJMP: // If the target didn't expand this, just return 'zero' and preserve the // chain. Results.push_back(DAG.getConstant(0, dl, MVT::i32)); Results.push_back(Node->getOperand(0)); break; case ISD::ATOMIC_LOAD: { // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP. SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0)); SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other); SDValue Swap = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP, dl, cast(Node)->getMemoryVT(), VTs, Node->getOperand(0), Node->getOperand(1), Zero, Zero, cast(Node)->getMemOperand()); Results.push_back(Swap.getValue(0)); Results.push_back(Swap.getValue(1)); break; } case ISD::ATOMIC_STORE: { // There is no libcall for atomic store; fake it with ATOMIC_SWAP. SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), cast(Node)->getMemOperand()); Results.push_back(Swap.getValue(1)); break; } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and // splits out the success value as a comparison. Expanding the resulting // ATOMIC_CMP_SWAP will produce a libcall. SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP, dl, cast(Node)->getMemoryVT(), VTs, Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), Node->getOperand(3), cast(Node)->getMemOperand()); SDValue ExtRes = Res; SDValue LHS = Res; SDValue RHS = Node->getOperand(1); EVT AtomicType = cast(Node)->getMemoryVT(); EVT OuterType = Node->getValueType(0); switch (TLI.getExtendForAtomicOps()) { case ISD::SIGN_EXTEND: LHS = DAG.getNode(ISD::AssertSext, dl, OuterType, Res, DAG.getValueType(AtomicType)); RHS = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OuterType, Node->getOperand(2), DAG.getValueType(AtomicType)); ExtRes = LHS; break; case ISD::ZERO_EXTEND: LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res, DAG.getValueType(AtomicType)); RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType); ExtRes = LHS; break; case ISD::ANY_EXTEND: LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType); RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType); break; default: llvm_unreachable("Invalid atomic op extension"); } SDValue Success = DAG.getSetCC(dl, Node->getValueType(1), LHS, RHS, ISD::SETEQ); Results.push_back(ExtRes.getValue(0)); Results.push_back(Success); Results.push_back(Res.getValue(1)); break; } case ISD::DYNAMIC_STACKALLOC: ExpandDYNAMIC_STACKALLOC(Node, Results); break; case ISD::MERGE_VALUES: for (unsigned i = 0; i < Node->getNumValues(); i++) Results.push_back(Node->getOperand(i)); break; case ISD::UNDEF: { EVT VT = Node->getValueType(0); if (VT.isInteger()) Results.push_back(DAG.getConstant(0, dl, VT)); else { assert(VT.isFloatingPoint() && "Unknown value type!"); Results.push_back(DAG.getConstantFP(0, dl, VT)); } break; } case ISD::FP_ROUND: case ISD::BITCAST: Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0), Node->getValueType(0), dl); Results.push_back(Tmp1); break; case ISD::FP_EXTEND: Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getOperand(0).getValueType(), Node->getValueType(0), dl); Results.push_back(Tmp1); break; case ISD::SIGN_EXTEND_INREG: { EVT ExtraVT = cast(Node->getOperand(1))->getVT(); EVT VT = 
Node->getValueType(0); // An in-register sign-extend of a boolean is a negation: // 'true' (1) sign-extended is -1. // 'false' (0) sign-extended is 0. // However, we must mask the high bits of the source operand because the // SIGN_EXTEND_INREG does not guarantee that the high bits are already zero. // TODO: Do this for vectors too? if (ExtraVT.getSizeInBits() == 1) { SDValue One = DAG.getConstant(1, dl, VT); SDValue And = DAG.getNode(ISD::AND, dl, VT, Node->getOperand(0), One); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, Zero, And); Results.push_back(Neg); break; } // NOTE: we could fall back on load/store here too for targets without // SRA. However, it is doubtful that any exist. EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); unsigned BitsDiff = VT.getScalarSizeInBits() - ExtraVT.getScalarSizeInBits(); SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy); Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0), Node->getOperand(0), ShiftCst); Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst); Results.push_back(Tmp1); break; } case ISD::FP_ROUND_INREG: { // The only way we can lower this is to turn it into a TRUNCSTORE, // EXTLOAD pair, targeting a temporary location (a stack slot). // NOTE: there is a choice here between constantly creating new stack // slots and always reusing the same one. We currently always create // new ones, as reuse may inhibit scheduling. EVT ExtraVT = cast(Node->getOperand(1))->getVT(); Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT, Node->getValueType(0), dl); Results.push_back(Tmp1); break; } case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP, Node->getOperand(0), Node->getValueType(0), dl); Results.push_back(Tmp1); break; case ISD::FP_TO_SINT: if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) Results.push_back(Tmp1); break; case ISD::FP_TO_UINT: { SDValue True, False; EVT VT = Node->getOperand(0).getValueType(); EVT NVT = Node->getValueType(0); APFloat apf(DAG.EVTToAPFloatSemantics(VT), APInt::getNullValue(VT.getSizeInBits())); APInt x = APInt::getSignMask(NVT.getSizeInBits()); (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven); Tmp1 = DAG.getConstantFP(apf, dl, VT); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT), Node->getOperand(0), Tmp1, ISD::SETLT); True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0)); // TODO: Should any fast-math-flags be set for the FSUB? False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, DAG.getNode(ISD::FSUB, dl, VT, Node->getOperand(0), Tmp1)); False = DAG.getNode(ISD::XOR, dl, NVT, False, DAG.getConstant(x, dl, NVT)); Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False); Results.push_back(Tmp1); break; } case ISD::VAARG: Results.push_back(DAG.expandVAArg(Node)); Results.push_back(Results[0].getValue(1)); break; case ISD::VACOPY: Results.push_back(DAG.expandVACopy(Node)); break; case ISD::EXTRACT_VECTOR_ELT: if (Node->getOperand(0).getValueType().getVectorNumElements() == 1) // This must be an access of the only element. Return it. 
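      // Illustrative note (not from the original source): a one-element
      // vector such as v1i64 has exactly the bits of its scalar element, so
      // the extract degenerates to a BITCAST instead of the stack round-trip
      // used in the general case below.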
Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Node->getOperand(0)); else Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0)); Results.push_back(Tmp1); break; case ISD::EXTRACT_SUBVECTOR: Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0))); break; case ISD::INSERT_SUBVECTOR: Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0))); break; case ISD::CONCAT_VECTORS: Results.push_back(ExpandVectorBuildThroughStack(Node)); break; case ISD::SCALAR_TO_VECTOR: Results.push_back(ExpandSCALAR_TO_VECTOR(Node)); break; case ISD::INSERT_VECTOR_ELT: Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), dl)); break; case ISD::VECTOR_SHUFFLE: { SmallVector NewMask; ArrayRef Mask = cast(Node)->getMask(); EVT VT = Node->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); if (!TLI.isTypeLegal(EltVT)) { EVT NewEltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT); // BUILD_VECTOR operands are allowed to be wider than the element type. // But if NewEltVT is smaller that EltVT the BUILD_VECTOR does not accept // it. if (NewEltVT.bitsLT(EltVT)) { // Convert shuffle node. // If original node was v4i64 and the new EltVT is i32, // cast operands to v8i32 and re-build the mask. // Calculate new VT, the size of the new VT should be equal to original. EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, VT.getSizeInBits() / NewEltVT.getSizeInBits()); assert(NewVT.bitsEq(VT)); // cast operands to new VT Op0 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op1); // Convert the shuffle mask unsigned int factor = NewVT.getVectorNumElements()/VT.getVectorNumElements(); // EltVT gets smaller assert(factor > 0); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { if (Mask[i] < 0) { for (unsigned fi = 0; fi < factor; ++fi) NewMask.push_back(Mask[i]); } else { for (unsigned fi = 0; fi < factor; ++fi) NewMask.push_back(Mask[i]*factor+fi); } } Mask = NewMask; VT = NewVT; } EltVT = NewEltVT; } unsigned NumElems = VT.getVectorNumElements(); SmallVector Ops; for (unsigned i = 0; i != NumElems; ++i) { if (Mask[i] < 0) { Ops.push_back(DAG.getUNDEF(EltVT)); continue; } unsigned Idx = Mask[i]; if (Idx < NumElems) Ops.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); else Ops.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1, DAG.getConstant(Idx - NumElems, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); } Tmp1 = DAG.getBuildVector(VT, dl, Ops); // We may have changed the BUILD_VECTOR type. Cast it back to the Node type. Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1); Results.push_back(Tmp1); break; } case ISD::EXTRACT_ELEMENT: { EVT OpTy = Node->getOperand(0).getValueType(); if (cast(Node->getOperand(1))->getZExtValue()) { // 1 -> Hi Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0), DAG.getConstant(OpTy.getSizeInBits() / 2, dl, TLI.getShiftAmountTy( Node->getOperand(0).getValueType(), DAG.getDataLayout()))); Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1); } else { // 0 -> Lo Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Node->getOperand(0)); } Results.push_back(Tmp1); break; } case ISD::STACKSAVE: // Expand to CopyFromReg if the target set // StackPointerRegisterToSaveRestore. 
if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) { Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP, Node->getValueType(0))); Results.push_back(Results[0].getValue(1)); } else { Results.push_back(DAG.getUNDEF(Node->getValueType(0))); Results.push_back(Node->getOperand(0)); } break; case ISD::STACKRESTORE: // Expand to CopyToReg if the target set // StackPointerRegisterToSaveRestore. if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) { Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP, Node->getOperand(1))); } else { Results.push_back(Node->getOperand(0)); } break; case ISD::GET_DYNAMIC_AREA_OFFSET: Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); Results.push_back(Results[0].getValue(0)); break; case ISD::FCOPYSIGN: Results.push_back(ExpandFCOPYSIGN(Node)); break; case ISD::FNEG: // Expand Y = FNEG(X) -> Y = SUB -0.0, X Tmp1 = DAG.getConstantFP(-0.0, dl, Node->getValueType(0)); // TODO: If FNEG has fast-math-flags, propagate them to the FSUB. Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1, Node->getOperand(0)); Results.push_back(Tmp1); break; case ISD::FABS: Results.push_back(ExpandFABS(Node)); break; case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: { // Expand Y = MAX(A, B) -> Y = (A > B) ? A : B ISD::CondCode Pred; switch (Node->getOpcode()) { default: llvm_unreachable("How did we get here?"); case ISD::SMAX: Pred = ISD::SETGT; break; case ISD::SMIN: Pred = ISD::SETLT; break; case ISD::UMAX: Pred = ISD::SETUGT; break; case ISD::UMIN: Pred = ISD::SETULT; break; } Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp1, Tmp2, Pred); Results.push_back(Tmp1); break; } case ISD::FSIN: case ISD::FCOS: { EVT VT = Node->getValueType(0); // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / // fcos which share the same operand and both are used. if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || isSinCosLibcallAvailable(Node, TLI)) && useSinCos(Node)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); if (Node->getOpcode() == ISD::FCOS) Tmp1 = Tmp1.getValue(1); Results.push_back(Tmp1); } break; } case ISD::FMAD: llvm_unreachable("Illegal fmad should never be formed"); case ISD::FP16_TO_FP: if (Node->getValueType(0) != MVT::f32) { // We can extend to types bigger than f32 in two steps without changing // the result. Since "f16 -> f32" is much more commonly available, give // CodeGen the option of emitting that before resorting to a libcall. SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0)); Results.push_back( DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res)); } break; case ISD::FP_TO_FP16: LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); if ((SVT == MVT::f64 || SVT == MVT::f80) && TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) { // Under fastmath, we can expand this node into a fround followed by // a float-half conversion. SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op, DAG.getIntPtrConstant(0, dl)); Results.push_back( DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal)); } } break; case ISD::ConstantFP: { ConstantFPSDNode *CFP = cast(Node); // Check to see if this FP immediate is already legal. // If this is a legal constant, turn it into a TargetConstantFP node. 
if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0))) Results.push_back(ExpandConstantFP(CFP, true)); break; } case ISD::Constant: { ConstantSDNode *CP = cast(Node); Results.push_back(ExpandConstant(CP)); break; } case ISD::FSUB: { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) { const SDNodeFlags Flags = Node->getFlags(); Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags); Results.push_back(Tmp1); } break; } case ISD::SUB: { EVT VT = Node->getValueType(0); assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) && TLI.isOperationLegalOrCustom(ISD::XOR, VT) && "Don't know how to expand this subtraction!"); Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1), DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT)); Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT)); Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1)); break; } case ISD::UREM: case ISD::SREM: { EVT VT = Node->getValueType(0); bool isSigned = Node->getOpcode() == ISD::SREM; unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV; unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; Tmp2 = Node->getOperand(0); Tmp3 = Node->getOperand(1); if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); Results.push_back(Tmp1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3); Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1); Results.push_back(Tmp1); } break; } case ISD::UDIV: case ISD::SDIV: { bool isSigned = Node->getOpcode() == ISD::SDIV; unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0), Node->getOperand(1)); Results.push_back(Tmp1); } break; } case ISD::MULHU: case ISD::MULHS: { unsigned ExpandOpcode = Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI : ISD::SMUL_LOHI; EVT VT = Node->getValueType(0); SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0), Node->getOperand(1)); Results.push_back(Tmp1.getValue(1)); break; } case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); MVT VT = LHS.getSimpleValueType(); unsigned MULHOpcode = Node->getOpcode() == ISD::UMUL_LOHI ? 
ISD::MULHU : ISD::MULHS; if (TLI.isOperationLegalOrCustom(MULHOpcode, VT)) { Results.push_back(DAG.getNode(ISD::MUL, dl, VT, LHS, RHS)); Results.push_back(DAG.getNode(MULHOpcode, dl, VT, LHS, RHS)); break; } SmallVector Halves; EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext()); assert(TLI.isTypeLegal(HalfType)); if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, Node, LHS, RHS, Halves, HalfType, DAG, TargetLowering::MulExpansionKind::Always)) { for (unsigned i = 0; i < 2; ++i) { SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]); SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]); SDValue Shift = DAG.getConstant( HalfType.getScalarSizeInBits(), dl, TLI.getShiftAmountTy(HalfType, DAG.getDataLayout())); Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi)); } break; } break; } case ISD::MUL: { EVT VT = Node->getValueType(0); SDVTList VTs = DAG.getVTList(VT, VT); // See if multiply or divide can be lowered using two-result operations. // We just need the low half of the multiply; try both the signed // and unsigned forms. If the target supports both SMUL_LOHI and // UMUL_LOHI, form a preference by checking which forms of plain // MULH it supports. bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT); bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT); bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT); bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT); unsigned OpToUse = 0; if (HasSMUL_LOHI && !HasMULHS) { OpToUse = ISD::SMUL_LOHI; } else if (HasUMUL_LOHI && !HasMULHU) { OpToUse = ISD::UMUL_LOHI; } else if (HasSMUL_LOHI) { OpToUse = ISD::SMUL_LOHI; } else if (HasUMUL_LOHI) { OpToUse = ISD::UMUL_LOHI; } if (OpToUse) { Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0), Node->getOperand(1))); break; } SDValue Lo, Hi; EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext()); if (TLI.isOperationLegalOrCustom(ISD::ZERO_EXTEND, VT) && TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) && TLI.isOperationLegalOrCustom(ISD::SHL, VT) && TLI.isOperationLegalOrCustom(ISD::OR, VT) && TLI.expandMUL(Node, Lo, Hi, HalfType, DAG, TargetLowering::MulExpansionKind::OnlyLegalOrCustom)) { Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo); Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi); SDValue Shift = DAG.getConstant(HalfType.getSizeInBits(), dl, TLI.getShiftAmountTy(HalfType, DAG.getDataLayout())); Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi)); } break; } case ISD::SADDO: case ISD::SSUBO: { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); Results.push_back(Sum); EVT ResultType = Node->getValueType(1); EVT OType = getSetCCResultType(Node->getValueType(0)); SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType()); // LHSSign -> LHS >= 0 // RHSSign -> RHS >= 0 // SumSign -> Sum >= 0 // // Add: // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) // Sub: // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, Node->getOpcode() == ISD::SADDO ? 
ISD::SETEQ : ISD::SETNE); SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE); SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType)); break; } case ISD::UADDO: case ISD::USUBO: { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); bool IsAdd = Node->getOpcode() == ISD::UADDO; // If ADD/SUBCARRY is legal, use that instead. unsigned OpcCarry = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY; if (TLI.isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) { SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1)); SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(), { LHS, RHS, CarryIn }); Results.push_back(SDValue(NodeCarry.getNode(), 0)); Results.push_back(SDValue(NodeCarry.getNode(), 1)); break; } SDValue Sum = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); Results.push_back(Sum); EVT ResultType = Node->getValueType(1); EVT SetCCType = getSetCCResultType(Node->getValueType(0)); ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT; SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC); Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType)); break; } case ISD::UMULO: case ISD::SMULO: { EVT VT = Node->getValueType(0); EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2); SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDValue BottomHalf; SDValue TopHalf; static const unsigned Ops[2][3] = { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND }, { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }}; bool isSigned = Node->getOpcode() == ISD::SMULO; if (TLI.isOperationLegalOrCustom(Ops[isSigned][0], VT)) { BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS); } else if (TLI.isOperationLegalOrCustom(Ops[isSigned][1], VT)) { BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS, RHS); TopHalf = BottomHalf.getValue(1); } else if (TLI.isTypeLegal(WideVT)) { LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS); RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS); Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS); BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1, DAG.getIntPtrConstant(0, dl)); TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1, DAG.getIntPtrConstant(1, dl)); } else { // We can fall back to a libcall with an illegal type for the MUL if we // have a libcall big enough. // Also, we can fall back to a division in some cases, but that's a big // performance hit in the general case. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (WideVT == MVT::i16) LC = RTLIB::MUL_I16; else if (WideVT == MVT::i32) LC = RTLIB::MUL_I32; else if (WideVT == MVT::i64) LC = RTLIB::MUL_I64; else if (WideVT == MVT::i128) LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!"); SDValue HiLHS; SDValue HiRHS; if (isSigned) { // The high part is obtained by SRA'ing all but one of the bits of low // part. 
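      // Illustrative note (not from the original source): shifting
      // arithmetically by (width - 1) replicates the sign bit, so HiLHS and
      // HiRHS become 0 for non-negative operands and all-ones for negative
      // ones, which is exactly the upper half of the sign-extended
      // double-width value the MUL_I* libcall expects. E.g. for an i32 LHS
      // of -5, HiLHS is 0xFFFFFFFF.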
unsigned LoSize = VT.getSizeInBits(); HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else { HiLHS = DAG.getConstant(0, dl, VT); HiRHS = DAG.getConstant(0, dl, VT); } // Here we're passing the 2 arguments explicitly as 4 arguments that are // pre-lowered to the correct types. This all depends upon WideVT not // being a legal type for the architecture and thus has to be split to // two arguments. SDValue Ret; if(DAG.getDataLayout().isLittleEndian()) { // Halves of WideVT are packed into registers in different order // depending on platform endianness. This is usually handled by // the C calling convention, but we can't defer to it in // the legalizer. SDValue Args[] = { LHS, HiLHS, RHS, HiRHS }; Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl); } else { SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl); } assert(Ret.getOpcode() == ISD::MERGE_VALUES && "Ret value is a collection of constituent nodes holding result."); BottomHalf = Ret.getOperand(0); TopHalf = Ret.getOperand(1); } if (isSigned) { Tmp1 = DAG.getConstant( VT.getSizeInBits() - 1, dl, TLI.getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout())); Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1); TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, Tmp1, ISD::SETNE); } else { TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, DAG.getConstant(0, dl, VT), ISD::SETNE); } // Truncate the result if SetCC returns a larger type than needed. EVT RType = Node->getValueType(1); if (RType.getSizeInBits() < TopHalf.getValueSizeInBits()) TopHalf = DAG.getNode(ISD::TRUNCATE, dl, RType, TopHalf); assert(RType.getSizeInBits() == TopHalf.getValueSizeInBits() && "Unexpected result type for S/UMULO legalization"); Results.push_back(BottomHalf); Results.push_back(TopHalf); break; } case ISD::BUILD_PAIR: { EVT PairTy = Node->getValueType(0); Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1)); Tmp2 = DAG.getNode( ISD::SHL, dl, PairTy, Tmp2, DAG.getConstant(PairTy.getSizeInBits() / 2, dl, TLI.getShiftAmountTy(PairTy, DAG.getDataLayout()))); Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2)); break; } case ISD::SELECT: Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); Tmp3 = Node->getOperand(2); if (Tmp1.getOpcode() == ISD::SETCC) { Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1), Tmp2, Tmp3, cast(Tmp1.getOperand(2))->get()); } else { Tmp1 = DAG.getSelectCC(dl, Tmp1, DAG.getConstant(0, dl, Tmp1.getValueType()), Tmp2, Tmp3, ISD::SETNE); } Results.push_back(Tmp1); break; case ISD::BR_JT: { SDValue Chain = Node->getOperand(0); SDValue Table = Node->getOperand(1); SDValue Index = Node->getOperand(2); const DataLayout &TD = DAG.getDataLayout(); EVT PTy = TLI.getPointerTy(TD); unsigned EntrySize = DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD); // For power-of-two jumptable entry sizes convert multiplication to a shift. // This transformation needs to be done here since otherwise the MIPS // backend will end up emitting a three instruction multiply sequence // instead of a single shift and MSP430 will call a runtime function. 
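    // Illustrative note (not from the original source): e.g. with 4-byte
    // jump-table entries the index is shifted left by 2 rather than
    // multiplied by 4 before being added to the table base.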
if (llvm::isPowerOf2_32(EntrySize)) Index = DAG.getNode( ISD::SHL, dl, Index.getValueType(), Index, DAG.getConstant(llvm::Log2_32(EntrySize), dl, Index.getValueType())); else Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index, DAG.getConstant(EntrySize, dl, Index.getValueType())); SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); SDValue LD = DAG.getExtLoad( ISD::SEXTLOAD, dl, PTy, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT); Addr = LD; if (TLI.isJumpTableRelative()) { // For PIC, the sequence is: // BRIND(load(Jumptable + index) + RelocBase) // RelocBase can be JumpTable, GOT or some sort of global base. Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, TLI.getPICJumpTableRelocBase(Table, DAG)); } Tmp1 = TLI.expandIndirectJTBranch(dl, LD.getValue(1), Addr, DAG); Results.push_back(Tmp1); break; } case ISD::BRCOND: // Expand brcond's setcc into its constituent parts and create a BR_CC // Node. Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); if (Tmp2.getOpcode() == ISD::SETCC) { Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1, Tmp2.getOperand(2), Tmp2.getOperand(0), Tmp2.getOperand(1), Node->getOperand(2)); } else { // We test only the i1 bit. Skip the AND if UNDEF or another AND. if (Tmp2.isUndef() || (Tmp2.getOpcode() == ISD::AND && isa(Tmp2.getOperand(1)) && cast(Tmp2.getOperand(1))->getZExtValue() == 1)) Tmp3 = Tmp2; else Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2, DAG.getConstant(1, dl, Tmp2.getValueType())); Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1, DAG.getCondCode(ISD::SETNE), Tmp3, DAG.getConstant(0, dl, Tmp3.getValueType()), Node->getOperand(2)); } Results.push_back(Tmp1); break; case ISD::SETCC: { Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); Tmp3 = Node->getOperand(2); bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, NeedInvert, dl); if (Legalized) { // If we expanded the SETCC by swapping LHS and RHS, or by inverting the // condition code, create a new SETCC node. if (Tmp3.getNode()) Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), Tmp1, Tmp2, Tmp3); // If we expanded the SETCC by inverting the condition code, then wrap // the existing SETCC in a NOT to restore the intended condition. if (NeedInvert) Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0)); Results.push_back(Tmp1); break; } // Otherwise, SETCC for the given comparison type must be completely // illegal; expand it into a SELECT_CC. EVT VT = Node->getValueType(0); int TrueValue; switch (TLI.getBooleanContents(Tmp1.getValueType())) { case TargetLowering::ZeroOrOneBooleanContent: case TargetLowering::UndefinedBooleanContent: TrueValue = 1; break; case TargetLowering::ZeroOrNegativeOneBooleanContent: TrueValue = -1; break; } Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2, DAG.getConstant(TrueValue, dl, VT), DAG.getConstant(0, dl, VT), Tmp3); Results.push_back(Tmp1); break; } case ISD::SELECT_CC: { Tmp1 = Node->getOperand(0); // LHS Tmp2 = Node->getOperand(1); // RHS Tmp3 = Node->getOperand(2); // True Tmp4 = Node->getOperand(3); // False EVT VT = Node->getValueType(0); SDValue CC = Node->getOperand(4); ISD::CondCode CCOp = cast(CC)->get(); if (TLI.isCondCodeLegalOrCustom(CCOp, Tmp1.getSimpleValueType())) { // If the condition code is legal, then we need to expand this // node using SETCC and SELECT. 
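      // Illustrative note (not from the original source):
      //   select_cc lhs, rhs, tval, fval, cc
      // becomes
      //   select (setcc lhs, rhs, cc), tval, fval.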
EVT CmpVT = Tmp1.getValueType(); assert(!TLI.isOperationExpand(ISD::SELECT, VT) && "Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be " "expanded."); EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT); SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC); Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4)); break; } // SELECT_CC is legal, so the condition code must not be. bool Legalized = false; // Try to legalize by inverting the condition. This is for targets that // might support an ordered version of a condition, but not the unordered // version (or vice versa). ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp, Tmp1.getValueType().isInteger()); if (TLI.isCondCodeLegalOrCustom(InvCC, Tmp1.getSimpleValueType())) { // Use the new condition code and swap true and false Legalized = true; Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC); } else { // If The inverse is not legal, then try to swap the arguments using // the inverse condition code. ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC); if (TLI.isCondCodeLegalOrCustom(SwapInvCC, Tmp1.getSimpleValueType())) { // The swapped inverse condition is legal, so swap true and false, // lhs and rhs. Legalized = true; Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC); } } if (!Legalized) { Legalized = LegalizeSetCCCondCode( getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert, dl); assert(Legalized && "Can't legalize SELECT_CC with legal condition!"); // If we expanded the SETCC by inverting the condition code, then swap // the True/False operands to match. if (NeedInvert) std::swap(Tmp3, Tmp4); // If we expanded the SETCC by swapping LHS and RHS, or by inverting the // condition code, create a new SELECT_CC node. if (CC.getNode()) { Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2, Tmp3, Tmp4, CC); } else { Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType()); CC = DAG.getCondCode(ISD::SETNE); Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2, Tmp3, Tmp4, CC); } } Results.push_back(Tmp1); break; } case ISD::BR_CC: { Tmp1 = Node->getOperand(0); // Chain Tmp2 = Node->getOperand(2); // LHS Tmp3 = Node->getOperand(3); // RHS Tmp4 = Node->getOperand(1); // CC bool Legalized = LegalizeSetCCCondCode(getSetCCResultType( Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl); (void)Legalized; assert(Legalized && "Can't legalize BR_CC with legal condition!"); // If we expanded the SETCC by inverting the condition code, then wrap // the existing SETCC in a NOT to restore the intended condition. if (NeedInvert) Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0)); // If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC // node. if (Tmp4.getNode()) { Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2, Tmp3, Node->getOperand(4)); } else { Tmp3 = DAG.getConstant(0, dl, Tmp2.getValueType()); Tmp4 = DAG.getCondCode(ISD::SETNE); Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2, Tmp3, Node->getOperand(4)); } Results.push_back(Tmp1); break; } case ISD::BUILD_VECTOR: Results.push_back(ExpandBUILD_VECTOR(Node)); break; case ISD::SRA: case ISD::SRL: case ISD::SHL: { // Scalarize vector SRA/SRL/SHL. 
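    // Illustrative note (not from the original source): e.g. a v4i32 SHL is
    // rewritten as four scalar i32 shifts on extracted element/amount pairs,
    // reassembled with a BUILD_VECTOR.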
EVT VT = Node->getValueType(0); assert(VT.isVector() && "Unable to legalize non-vector shift"); assert(TLI.isTypeLegal(VT.getScalarType())&& "Element type must be legal"); unsigned NumElem = VT.getVectorNumElements(); SmallVector Scalars; for (unsigned Idx = 0; Idx < NumElem; Idx++) { SDValue Ex = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(0), DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); SDValue Sh = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(1), DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); Scalars.push_back(DAG.getNode(Node->getOpcode(), dl, VT.getScalarType(), Ex, Sh)); } SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars); ReplaceNode(SDValue(Node, 0), Result); break; } case ISD::ROTL: case ISD::ROTR: { bool IsLeft = Node->getOpcode() == ISD::ROTL; SDValue Op0 = Node->getOperand(0), Op1 = Node->getOperand(1); EVT ResVT = Node->getValueType(0); EVT OpVT = Op0.getValueType(); assert(OpVT == ResVT && "The result and the operand types of rotate should match"); EVT ShVT = Op1.getValueType(); SDValue Width = DAG.getConstant(OpVT.getScalarSizeInBits(), dl, ShVT); // If a rotate in the other direction is legal, use it. unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL; if (TLI.isOperationLegal(RevRot, ResVT)) { SDValue Sub = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1); Results.push_back(DAG.getNode(RevRot, dl, ResVT, Op0, Sub)); break; } // Otherwise, // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1))) // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1))) // assert(isPowerOf2_32(OpVT.getScalarSizeInBits()) && "Expecting the type bitwidth to be a power of 2"); unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL; unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL; SDValue Width1 = DAG.getNode(ISD::SUB, dl, ShVT, Width, DAG.getConstant(1, dl, ShVT)); SDValue NegOp1 = DAG.getNode(ISD::SUB, dl, ShVT, Width, Op1); SDValue And0 = DAG.getNode(ISD::AND, dl, ShVT, Op1, Width1); SDValue And1 = DAG.getNode(ISD::AND, dl, ShVT, NegOp1, Width1); SDValue Or = DAG.getNode(ISD::OR, dl, ResVT, DAG.getNode(ShOpc, dl, ResVT, Op0, And0), DAG.getNode(HsOpc, dl, ResVT, Op0, And1)); Results.push_back(Or); break; } case ISD::GLOBAL_OFFSET_TABLE: case ISD::GlobalAddress: case ISD::GlobalTLSAddress: case ISD::ExternalSymbol: case ISD::ConstantPool: case ISD::JumpTable: case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: // FIXME: Custom lowering for these operations shouldn't return null! break; } // Replace the original node with the legalized result. if (Results.empty()) { LLVM_DEBUG(dbgs() << "Cannot expand node\n"); return false; } LLVM_DEBUG(dbgs() << "Succesfully expanded node\n"); ReplaceNode(Node, Results.data()); return true; } void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { LLVM_DEBUG(dbgs() << "Trying to convert node to libcall\n"); SmallVector Results; SDLoc dl(Node); // FIXME: Check flags on the node to see if we can use a finite call. bool CanUseFiniteLibCall = TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath; unsigned Opc = Node->getOpcode(); switch (Opc) { case ISD::ATOMIC_FENCE: { // If the target didn't lower this, lower it to '__sync_synchronize()' call // FIXME: handle "fence singlethread" more efficiently. 
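    // Illustrative note (not from the original source): __sync_synchronize is
    // the legacy GCC runtime helper for a full memory barrier; the call built
    // below takes no arguments and only threads the chain through, so the
    // barrier is provided by the call itself.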
    TargetLowering::ArgListTy Args;

    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)
        .setChain(Node->getOperand(0))
        .setLibCallee(
            CallingConv::C, Type::getVoidTy(*DAG.getContext()),
            DAG.getExternalSymbol("__sync_synchronize",
                                  TLI.getPointerTy(DAG.getDataLayout())),
            std::move(Args));
    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

    Results.push_back(CallResult.second);
    break;
  }
  // By default, atomic intrinsics are marked Legal and lowered. Targets
  // which don't support them directly, however, may want libcalls, in which
  // case they mark them Expand, and we get here.
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_CLR:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_CMP_SWAP: {
    MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
    RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT);
    assert(LC != RTLIB::UNKNOWN_LIBCALL &&
           "Unexpected atomic op or value type!");

    std::pair<SDValue, SDValue> Tmp = ExpandChainLibCall(LC, Node, false);
    Results.push_back(Tmp.first);
    Results.push_back(Tmp.second);
    break;
  }
  case ISD::TRAP: {
    // If this operation is not supported, lower it to 'abort()' call
    TargetLowering::ArgListTy Args;
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)
        .setChain(Node->getOperand(0))
        .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                      DAG.getExternalSymbol(
                          "abort", TLI.getPointerTy(DAG.getDataLayout())),
                      std::move(Args));
    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

    Results.push_back(CallResult.second);
    break;
  }
  case ISD::FMINNUM:
    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64,
                                      RTLIB::FMIN_F80, RTLIB::FMIN_F128,
                                      RTLIB::FMIN_PPCF128));
    break;
  case ISD::FMAXNUM:
    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64,
                                      RTLIB::FMAX_F80, RTLIB::FMAX_F128,
                                      RTLIB::FMAX_PPCF128));
    break;
  case ISD::FSQRT:
  case ISD::STRICT_FSQRT:
    Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
                                      RTLIB::SQRT_F80, RTLIB::SQRT_F128,
                                      RTLIB::SQRT_PPCF128));
    break;
  case ISD::FSIN:
  case ISD::STRICT_FSIN:
    Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
                                      RTLIB::SIN_F80, RTLIB::SIN_F128,
                                      RTLIB::SIN_PPCF128));
    break;
  case ISD::FCOS:
  case ISD::STRICT_FCOS:
    Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
                                      RTLIB::COS_F80, RTLIB::COS_F128,
                                      RTLIB::COS_PPCF128));
    break;
  case ISD::FSINCOS:
    // Expand into sincos libcall.
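    // Illustrative note (not from the original source): the helper defined
    // earlier in this file passes two stack slots to the sincos* libcall and
    // loads the results back; Results[0] is sin(x) and Results[1] is cos(x).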
ExpandSinCosLibCall(Node, Results); break; case ISD::FLOG: case ISD::STRICT_FLOG: if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log_finite)) Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_FINITE_F32, RTLIB::LOG_FINITE_F64, RTLIB::LOG_FINITE_F80, RTLIB::LOG_FINITE_F128, RTLIB::LOG_FINITE_PPCF128)); else Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128)); break; case ISD::FLOG2: case ISD::STRICT_FLOG2: if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log2_finite)) Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_FINITE_F32, RTLIB::LOG2_FINITE_F64, RTLIB::LOG2_FINITE_F80, RTLIB::LOG2_FINITE_F128, RTLIB::LOG2_FINITE_PPCF128)); else Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128)); break; case ISD::FLOG10: case ISD::STRICT_FLOG10: if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log10_finite)) Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_FINITE_F32, RTLIB::LOG10_FINITE_F64, RTLIB::LOG10_FINITE_F80, RTLIB::LOG10_FINITE_F128, RTLIB::LOG10_FINITE_PPCF128)); else Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128)); break; case ISD::FEXP: case ISD::STRICT_FEXP: if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_exp_finite)) Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_FINITE_F32, RTLIB::EXP_FINITE_F64, RTLIB::EXP_FINITE_F80, RTLIB::EXP_FINITE_F128, RTLIB::EXP_FINITE_PPCF128)); else Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128)); break; case ISD::FEXP2: case ISD::STRICT_FEXP2: if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_exp2_finite)) Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_FINITE_F32, RTLIB::EXP2_FINITE_F64, RTLIB::EXP2_FINITE_F80, RTLIB::EXP2_FINITE_F128, RTLIB::EXP2_FINITE_PPCF128)); else Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128)); break; case ISD::FTRUNC: Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128)); break; case ISD::FFLOOR: Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128)); break; case ISD::FCEIL: Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128)); break; case ISD::FRINT: case ISD::STRICT_FRINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128)); break; case ISD::FNEARBYINT: case ISD::STRICT_FNEARBYINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128, RTLIB::NEARBYINT_PPCF128)); break; case ISD::FROUND: Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32, RTLIB::ROUND_F64, RTLIB::ROUND_F80, RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128)); break; case ISD::FPOWI: case ISD::STRICT_FPOWI: Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128)); break; case ISD::FPOW: case ISD::STRICT_FPOW: if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_pow_finite)) Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_FINITE_F32, RTLIB::POW_FINITE_F64, 
RTLIB::POW_FINITE_F80, RTLIB::POW_FINITE_F128, RTLIB::POW_FINITE_PPCF128)); else Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); break; case ISD::FDIV: Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128)); break; case ISD::FREM: Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64, RTLIB::REM_F80, RTLIB::REM_F128, RTLIB::REM_PPCF128)); break; case ISD::FMA: case ISD::STRICT_FMA: Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128)); break; case ISD::FADD: Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80, RTLIB::ADD_F128, RTLIB::ADD_PPCF128)); break; case ISD::FMUL: Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128)); break; case ISD::FP16_TO_FP: if (Node->getValueType(0) == MVT::f32) { Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); } break; case ISD::FP_TO_FP16: { RTLIB::Libcall LC = RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16"); Results.push_back(ExpandLibCall(LC, Node, false)); break; } case ISD::FSUB: Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128)); break; case ISD::SREM: Results.push_back(ExpandIntLibCall(Node, true, RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32, RTLIB::SREM_I64, RTLIB::SREM_I128)); break; case ISD::UREM: Results.push_back(ExpandIntLibCall(Node, false, RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32, RTLIB::UREM_I64, RTLIB::UREM_I128)); break; case ISD::SDIV: Results.push_back(ExpandIntLibCall(Node, true, RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32, RTLIB::SDIV_I64, RTLIB::SDIV_I128)); break; case ISD::UDIV: Results.push_back(ExpandIntLibCall(Node, false, RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32, RTLIB::UDIV_I64, RTLIB::UDIV_I128)); break; case ISD::SDIVREM: case ISD::UDIVREM: // Expand into divrem libcall ExpandDivRemLibCall(Node, Results); break; case ISD::MUL: Results.push_back(ExpandIntLibCall(Node, false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32, RTLIB::MUL_I64, RTLIB::MUL_I128)); break; } // Replace the original node with the legalized result. if (!Results.empty()) { LLVM_DEBUG(dbgs() << "Successfully converted node to libcall\n"); ReplaceNode(Node, Results.data()); } else LLVM_DEBUG(dbgs() << "Could not convert node to libcall\n"); } // Determine the vector type to use in place of an original scalar element when // promoting equally sized vectors. 
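// For example, when promoting v2i64 to v4i32, each original i64 element is
// replaced by MVT::getVectorVT(i32, 64 / 32) == v2i32, so one old element
// maps onto a two-element slice of the new vector; the chosen mid type is
// asserted to be legal for the target.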
static MVT getPromotedVectorElementType(const TargetLowering &TLI, MVT EltVT, MVT NewEltVT) { unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits(); MVT MidVT = MVT::getVectorVT(NewEltVT, OldEltsPerNewElt); assert(TLI.isTypeLegal(MidVT) && "unexpected"); return MidVT; } void SelectionDAGLegalize::PromoteNode(SDNode *Node) { LLVM_DEBUG(dbgs() << "Trying to promote node\n"); SmallVector Results; MVT OVT = Node->getSimpleValueType(0); if (Node->getOpcode() == ISD::UINT_TO_FP || Node->getOpcode() == ISD::SINT_TO_FP || Node->getOpcode() == ISD::SETCC || Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT || Node->getOpcode() == ISD::INSERT_VECTOR_ELT) { OVT = Node->getOperand(0).getSimpleValueType(); } if (Node->getOpcode() == ISD::BR_CC) OVT = Node->getOperand(2).getSimpleValueType(); MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT); SDLoc dl(Node); SDValue Tmp1, Tmp2, Tmp3; switch (Node->getOpcode()) { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: // Zero extend the argument. Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); if (Node->getOpcode() == ISD::CTTZ) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off // the top of the original type. auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(), OVT.getSizeInBits()); Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1, DAG.getConstant(TopBit, dl, NVT)); } // Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is // already the correct result. Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1); if (Node->getOpcode() == ISD::CTLZ || Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) { // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT)) Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1, DAG.getConstant(NVT.getSizeInBits() - OVT.getSizeInBits(), dl, NVT)); } Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1)); break; case ISD::BITREVERSE: case ISD::BSWAP: { unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits(); Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0)); Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1); Tmp1 = DAG.getNode( ISD::SRL, dl, NVT, Tmp1, DAG.getConstant(DiffBits, dl, TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1)); break; } case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0), Node->getOpcode() == ISD::FP_TO_SINT, dl); Results.push_back(Tmp1); break; case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0), Node->getOpcode() == ISD::SINT_TO_FP, dl); Results.push_back(Tmp1); break; case ISD::VAARG: { SDValue Chain = Node->getOperand(0); // Get the chain. SDValue Ptr = Node->getOperand(1); // Get the pointer. unsigned TruncOp; if (OVT.isVector()) { TruncOp = ISD::BITCAST; } else { assert(OVT.isInteger() && "VAARG promotion is supported only for vectors or integer types"); TruncOp = ISD::TRUNCATE; } // Perform the larger operation, then convert back Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2), Node->getConstantOperandVal(3)); Chain = Tmp1.getValue(1); Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1); // Modified the chain result - switch anything that used the old chain to // use the new one. 
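// A worked example of the rewiring done below, assuming OVT = i16 is
// promoted to NVT = i32:
//
//   t1: i32,ch = vaarg Chain, Ptr, ...     ; wider read (Tmp1)
//   t2: i16 = truncate t1                  ; Tmp2, replaces value #0
//
// Uses of the node's value result are redirected to t2 and uses of its chain
// result to t1's chain. Because this case bypasses the Results vector and
// replaces the node directly, it also has to notify UpdatedNodes itself.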
DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); if (UpdatedNodes) { UpdatedNodes->insert(Tmp2.getNode()); UpdatedNodes->insert(Chain.getNode()); } ReplacedNode(Node); break; } case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::UDIV: case ISD::UREM: case ISD::AND: case ISD::OR: case ISD::XOR: { unsigned ExtOp, TruncOp; if (OVT.isVector()) { ExtOp = ISD::BITCAST; TruncOp = ISD::BITCAST; } else { assert(OVT.isInteger() && "Cannot promote logic operation"); switch (Node->getOpcode()) { default: ExtOp = ISD::ANY_EXTEND; break; case ISD::SDIV: case ISD::SREM: ExtOp = ISD::SIGN_EXTEND; break; case ISD::UDIV: case ISD::UREM: ExtOp = ISD::ZERO_EXTEND; break; } TruncOp = ISD::TRUNCATE; } // Promote each of the values to the new type. Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); // Perform the larger operation, then convert back Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1)); break; } case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: { // Promote to a multiply in a wider integer type. unsigned ExtOp = Node->getOpcode() == ISD::UMUL_LOHI ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2); auto &DL = DAG.getDataLayout(); unsigned OriginalSize = OVT.getScalarSizeInBits(); Tmp2 = DAG.getNode( ISD::SRL, dl, NVT, Tmp1, DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT))); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1)); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2)); break; } case ISD::SELECT: { unsigned ExtOp, TruncOp; if (Node->getValueType(0).isVector() || Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) { ExtOp = ISD::BITCAST; TruncOp = ISD::BITCAST; } else if (Node->getValueType(0).isInteger()) { ExtOp = ISD::ANY_EXTEND; TruncOp = ISD::TRUNCATE; } else { ExtOp = ISD::FP_EXTEND; TruncOp = ISD::FP_ROUND; } Tmp1 = Node->getOperand(0); // Promote each of the values to the new type. Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2)); // Perform the larger operation, then round down. Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3); if (TruncOp != ISD::FP_ROUND) Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1); else Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1, DAG.getIntPtrConstant(0, dl)); Results.push_back(Tmp1); break; } case ISD::VECTOR_SHUFFLE: { ArrayRef Mask = cast(Node)->getMask(); // Cast the two input vectors. Tmp1 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(1)); // Convert the shuffle mask to the right # elements. Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OVT, Tmp1); Results.push_back(Tmp1); break; } case ISD::SETCC: { unsigned ExtOp = ISD::FP_EXTEND; if (NVT.isInteger()) { ISD::CondCode CCCode = cast(Node->getOperand(2))->get(); ExtOp = isSignedIntSetCC(CCCode) ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; } Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), Tmp1, Tmp2, Node->getOperand(2))); break; } case ISD::BR_CC: { unsigned ExtOp = ISD::FP_EXTEND; if (NVT.isInteger()) { ISD::CondCode CCCode = cast(Node->getOperand(1))->get(); ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; } Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2)); Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(3)); Results.push_back(DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Node->getOperand(0), Node->getOperand(1), Tmp1, Tmp2, Node->getOperand(4))); break; } case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FPOW: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Node->getFlags()); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(0, dl))); break; case ISD::FMA: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2)); Results.push_back( DAG.getNode(ISD::FP_ROUND, dl, OVT, DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3), DAG.getIntPtrConstant(0, dl))); break; case ISD::FCOPYSIGN: case ISD::FPOWI: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = Node->getOperand(1); Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); // fcopysign doesn't change anything but the sign bit, so // (fp_round (fcopysign (fpext a), b)) // is as precise as // (fp_round (fpext a)) // which is a no-op. Mark it as a TRUNCating FP_ROUND. const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(isTrunc, dl))); break; } case ISD::FFLOOR: case ISD::FCEIL: case ISD::FRINT: case ISD::FNEARBYINT: case ISD::FROUND: case ISD::FTRUNC: case ISD::FNEG: case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: case ISD::FLOG: case ISD::FLOG2: case ISD::FLOG10: case ISD::FABS: case ISD::FEXP: case ISD::FEXP2: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp2, DAG.getIntPtrConstant(0, dl))); break; case ISD::BUILD_VECTOR: { MVT EltVT = OVT.getVectorElementType(); MVT NewEltVT = NVT.getVectorElementType(); // Handle bitcasts to a different vector type with the same total bit size // // e.g. 
v2i64 = build_vector i64:x, i64:y => v4i32 // => // v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y)) assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && "Invalid promote type for build_vector"); assert(NewEltVT.bitsLT(EltVT) && "not handled"); MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); SmallVector NewOps; for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) { SDValue Op = Node->getOperand(I); NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op)); } SDLoc SL(Node); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps); SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); Results.push_back(CvtVec); break; } case ISD::EXTRACT_VECTOR_ELT: { MVT EltVT = OVT.getVectorElementType(); MVT NewEltVT = NVT.getVectorElementType(); // Handle bitcasts to a different vector type with the same total bit size. // // e.g. v2i64 = extract_vector_elt x:v2i64, y:i32 // => // v4i32:castx = bitcast x:v2i64 // // i64 = bitcast // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), // (i32 (extract_vector_elt castx, (2 * y + 1))) // assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && "Invalid promote type for extract_vector_elt"); assert(NewEltVT.bitsLT(EltVT) && "not handled"); MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); unsigned NewEltsPerOldElt = MidVT.getVectorNumElements(); SDValue Idx = Node->getOperand(1); EVT IdxVT = Idx.getValueType(); SDLoc SL(Node); SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT); SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor); SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0)); SmallVector NewOps; for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT); SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT, CastVec, TmpIdx); NewOps.push_back(Elt); } SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps); Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec)); break; } case ISD::INSERT_VECTOR_ELT: { MVT EltVT = OVT.getVectorElementType(); MVT NewEltVT = NVT.getVectorElementType(); // Handle bitcasts to a different vector type with the same total bit size // // e.g. 
v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32 // => // v4i32:castx = bitcast x:v2i64 // v2i32:casty = bitcast y:i64 // // v2i64 = bitcast // (v4i32 insert_vector_elt // (v4i32 insert_vector_elt v4i32:castx, // (extract_vector_elt casty, 0), 2 * z), // (extract_vector_elt casty, 1), (2 * z + 1)) assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && "Invalid promote type for insert_vector_elt"); assert(NewEltVT.bitsLT(EltVT) && "not handled"); MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); unsigned NewEltsPerOldElt = MidVT.getVectorNumElements(); SDValue Val = Node->getOperand(1); SDValue Idx = Node->getOperand(2); EVT IdxVT = Idx.getValueType(); SDLoc SL(Node); SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT); SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor); SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0)); SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val); SDValue NewVec = CastVec; for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT); SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT, CastVal, IdxOffset); NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT, NewVec, Elt, InEltIdx); } Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec)); break; } case ISD::SCALAR_TO_VECTOR: { MVT EltVT = OVT.getVectorElementType(); MVT NewEltVT = NVT.getVectorElementType(); // Handle bitcasts to different vector type with the same total bit size. // // e.g. v2i64 = scalar_to_vector x:i64 // => // concat_vectors (v2i32 bitcast x:i64), (v2i32 undef) // MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); SDValue Val = Node->getOperand(0); SDLoc SL(Node); SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val); SDValue Undef = DAG.getUNDEF(MidVT); SmallVector NewElts; NewElts.push_back(CastVal); for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I) NewElts.push_back(Undef); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts); SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); Results.push_back(CvtVec); break; } } // Replace the original node with the legalized result. if (!Results.empty()) { LLVM_DEBUG(dbgs() << "Successfully promoted node\n"); ReplaceNode(Node, Results.data()); } else LLVM_DEBUG(dbgs() << "Could not promote node\n"); } /// This is the entry point for the file. void SelectionDAG::Legalize() { AssignTopologicalOrder(); SmallPtrSet LegalizedNodes; // Use a delete listener to remove nodes which were deleted during // legalization from LegalizeNodes. This is needed to handle the situation // where a new node is allocated by the object pool to the same address of a // previously deleted node. DAGNodeDeletedListener DeleteListener( *this, [&LegalizedNodes](SDNode *N, SDNode *E) { LegalizedNodes.erase(N); }); SelectionDAGLegalize Legalizer(*this, LegalizedNodes); // Visit all the nodes. We start in topological order, so that we see // nodes with their original operands intact. Legalization can produce // new nodes which may themselves need to be legalized. Iterate until all // nodes have been legalized. 
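// A sketch of the fixed-point loop that follows: each pass walks the node
// list in reverse, so users are visited while their operands are still the
// original, unlegalized nodes; dead nodes are deleted on the fly, and any
// node not yet in LegalizedNodes is legalized. Since legalization can create
// fresh nodes, passes repeat until one of them legalizes nothing new.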
while (true) { bool AnyLegalized = false; for (auto NI = allnodes_end(); NI != allnodes_begin();) { --NI; SDNode *N = &*NI; if (N->use_empty() && N != getRoot().getNode()) { ++NI; DeleteNode(N); continue; } if (LegalizedNodes.insert(N).second) { AnyLegalized = true; Legalizer.LegalizeOp(N); if (N->use_empty() && N != getRoot().getNode()) { ++NI; DeleteNode(N); } } } if (!AnyLegalized) break; } // Remove dead nodes now. RemoveDeadNodes(); } bool SelectionDAG::LegalizeOp(SDNode *N, SmallSetVector &UpdatedNodes) { SmallPtrSet LegalizedNodes; SelectionDAGLegalize Legalizer(*this, LegalizedNodes, &UpdatedNodes); // Directly insert the node in question, and legalize it. This will recurse // as needed through operands. LegalizedNodes.insert(N); Legalizer.LegalizeOp(N); return LegalizedNodes.count(N); } Index: vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (revision 337631) @@ -1,4194 +1,4211 @@ //===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file performs vector type splitting and scalarization for LegalizeTypes. // Scalarization is the act of changing a computation in an illegal one-element // vector type to be a computation in its scalar element type. For example, // implementing <1 x f32> arithmetic in a scalar f32 register. This is needed // as a base case when scalarizing vector arithmetic like <4 x f32>, which // eventually decomposes to scalars if the target doesn't support v4f32 or v2f32 // types. // Splitting is the act of changing a computation in an invalid vector type to // be a computation in two vectors of half the size. For example, implementing // <128 x f32> operations in terms of two <64 x f32> operations. // //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" //===----------------------------------------------------------------------===// // Result Vector Scalarization: <1 x ty> -> ty. 
//===----------------------------------------------------------------------===// void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Scalarize node result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ScalarizeVectorResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif report_fatal_error("Do not know how to scalarize the result of this " "operator!\n"); case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break; case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: R = ScalarizeVecRes_VecInregOp(N); break; case ISD::ANY_EXTEND: case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: case ISD::FEXP: case ISD::FEXP2: case ISD::FFLOOR: case ISD::FLOG: case ISD::FLOG10: case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: case ISD::SIGN_EXTEND: case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::ZERO_EXTEND: case ISD::FCANONICALIZE: R = ScalarizeVecRes_UnaryOp(N); break; case ISD::ADD: case ISD::AND: case ISD::FADD: case ISD::FCOPYSIGN: case ISD::FDIV: case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: case ISD::FPOW: case ISD::FREM: case ISD::FSUB: case ISD::MUL: case ISD::OR: case ISD::SDIV: case ISD::SREM: case ISD::SUB: case ISD::UDIV: case ISD::UREM: case ISD::XOR: case ISD::SHL: case ISD::SRA: case ISD::SRL: R = ScalarizeVecRes_BinOp(N); break; case ISD::FMA: R = ScalarizeVecRes_TernaryOp(N); break; } // If R is null, the sub-method took care of registering the result. 
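// For example, a <1 x f32> = fadd a, b node dispatched to
// ScalarizeVecRes_BinOp below is rebuilt as f32 = fadd a', b', where a' and
// b' are the previously scalarized forms of the operands; the mapping from
// the old vector value to the new scalar is then recorded with
// SetScalarizedVector.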
if (R.getNode()) SetScalarizedVector(SDValue(N, ResNo), R); } SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getFlags()); } SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { SDValue Op0 = GetScalarizedVector(N->getOperand(0)); SDValue Op1 = GetScalarizedVector(N->getOperand(1)); SDValue Op2 = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1, Op2); } SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); return GetScalarizedVector(Op); } SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { SDValue Op = N->getOperand(0); if (Op.getValueType().isVector() && Op.getValueType().getVectorNumElements() == 1 && !isSimpleLegalType(Op.getValueType())) Op = GetScalarizedVector(Op); EVT NewVT = N->getValueType(0).getVectorElementType(); return DAG.getNode(ISD::BITCAST, SDLoc(N), NewVT, Op); } SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { EVT EltVT = N->getValueType(0).getVectorElementType(); SDValue InOp = N->getOperand(0); // The BUILD_VECTOR operands may be of wider element types and // we may need to truncate them back to the requested return type. if (EltVT.isInteger()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp); return InOp; } SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0).getVectorElementType(), N->getOperand(0), N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); SDValue Op = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(ISD::FP_ROUND, SDLoc(N), NewVT, Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) { SDValue Op = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(ISD::FPOWI, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { // The value to insert may have a wider type than the vector element type, // so be sure to truncate it to the element type if necessary. SDValue Op = N->getOperand(1); EVT EltVT = N->getValueType(0).getVectorElementType(); if (Op.getValueType() != EltVT) // FIXME: Can this happen for floating point types? Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op); return Op; } SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); SDValue Result = DAG.getLoad( ISD::UNINDEXED, N->getExtensionType(), N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(), N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()), N->getPointerInfo(), N->getMemoryVT().getVectorElementType(), N->getOriginalAlignment(), N->getMemOperand()->getFlags(), N->getAAInfo()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); return Result; } SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) { // Get the dest type - it doesn't always match the input type, e.g. int_to_fp. 
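// For instance, for <1 x f32> = sint_to_fp <1 x i32>, DestVT is f32 while
// the source element type is i32, so the two cannot be assumed to match.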
EVT DestVT = N->getValueType(0).getVectorElementType(); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); SDLoc DL(N); // The result needs scalarizing, but it's not a given that the source does. // This is a workaround for targets where it's impossible to scalarize the // result of a conversion, because the source type is legal. // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32} // are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is // legal and was not scalarized. // See the similar logic in ScalarizeVecRes_SETCC if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { Op = GetScalarizedVector(Op); } else { EVT VT = OpVT.getVectorElementType(); Op = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op); } SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) { EVT EltVT = N->getValueType(0).getVectorElementType(); EVT ExtVT = cast(N->getOperand(1))->getVT().getVectorElementType(); SDValue LHS = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT, LHS, DAG.getValueType(ExtVT)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) { SDLoc DL(N); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); EVT OpEltVT = OpVT.getVectorElementType(); EVT EltVT = N->getValueType(0).getVectorElementType(); if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { Op = GetScalarizedVector(Op); } else { Op = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } switch (N->getOpcode()) { case ISD::ANY_EXTEND_VECTOR_INREG: return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op); case ISD::SIGN_EXTEND_VECTOR_INREG: return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op); case ISD::ZERO_EXTEND_VECTOR_INREG: return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op); } llvm_unreachable("Illegal extend_vector_inreg opcode"); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { // If the operand is wider than the vector element type then it is implicitly // truncated. Make that explicit here. EVT EltVT = N->getValueType(0).getVectorElementType(); SDValue InOp = N->getOperand(0); if (InOp.getValueType() != EltVT) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp); return InOp; } SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) { SDValue Cond = N->getOperand(0); EVT OpVT = Cond.getValueType(); SDLoc DL(N); // The vselect result and true/value operands needs scalarizing, but it's // not a given that the Cond does. For instance, in AVX512 v1i1 is legal. // See the similar logic in ScalarizeVecRes_SETCC if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { Cond = GetScalarizedVector(Cond); } else { EVT VT = OpVT.getVectorElementType(); Cond = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } SDValue LHS = GetScalarizedVector(N->getOperand(1)); TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false, false); TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false); // If integer and float booleans have different contents then we can't // reliably optimize in all cases. There is a full explanation for this in // DAGCombiner::visitSELECT() where the same issue affects folding // (select C, 0, 1) to (xor C, 1). 
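// A concrete case of the fixup below: if the vector side produces booleans
// as all-ones (-1) but scalar code expects 0/1, the scalarized condition is
// masked with (and Cond, 1); in the opposite direction it is sign-extended
// from bit 0 so that a scalar 1 becomes the expected all-ones value.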
if (TLI.getBooleanContents(false, false) != TLI.getBooleanContents(false, true)) { // At least try the common case where the boolean is generated by a // comparison. if (Cond->getOpcode() == ISD::SETCC) { EVT OpVT = Cond->getOperand(0).getValueType(); ScalarBool = TLI.getBooleanContents(OpVT.getScalarType()); VecBool = TLI.getBooleanContents(OpVT); } else ScalarBool = TargetLowering::UndefinedBooleanContent; } EVT CondVT = Cond.getValueType(); if (ScalarBool != VecBool) { switch (ScalarBool) { case TargetLowering::UndefinedBooleanContent: break; case TargetLowering::ZeroOrOneBooleanContent: assert(VecBool == TargetLowering::UndefinedBooleanContent || VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent); // Vector read from all ones, scalar expects a single 1 so mask. Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT, Cond, DAG.getConstant(1, SDLoc(N), CondVT)); break; case TargetLowering::ZeroOrNegativeOneBooleanContent: assert(VecBool == TargetLowering::UndefinedBooleanContent || VecBool == TargetLowering::ZeroOrOneBooleanContent); // Vector reads from a one, scalar from all ones so sign extend. Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT, Cond, DAG.getValueType(MVT::i1)); break; } } // Truncate the condition if needed auto BoolVT = getSetCCResultType(CondVT); if (BoolVT.bitsLT(CondVT)) Cond = DAG.getNode(ISD::TRUNCATE, SDLoc(N), BoolVT, Cond); return DAG.getSelect(SDLoc(N), LHS.getValueType(), Cond, LHS, GetScalarizedVector(N->getOperand(2))); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(1)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, GetScalarizedVector(N->getOperand(2))); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), N->getOperand(0), N->getOperand(1), LHS, GetScalarizedVector(N->getOperand(3)), N->getOperand(4)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) { return DAG.getUNDEF(N->getValueType(0).getVectorElementType()); } SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) { // Figure out if the scalar is the LHS or RHS and return it. SDValue Arg = N->getOperand(2).getOperand(0); if (Arg.isUndef()) return DAG.getUNDEF(N->getValueType(0).getVectorElementType()); unsigned Op = !cast(Arg)->isNullValue(); return GetScalarizedVector(N->getOperand(Op)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT OpVT = LHS.getValueType(); EVT NVT = N->getValueType(0).getVectorElementType(); SDLoc DL(N); // The result needs scalarizing, but it's not a given that the source does. if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { LHS = GetScalarizedVector(LHS); RHS = GetScalarizedVector(RHS); } else { EVT VT = OpVT.getVectorElementType(); LHS = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); RHS = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } // Turn it into a scalar SETCC. SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2)); // Vectors may have a different boolean contents to scalars. Promote the // value appropriately. 
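// For instance, on a target whose vector booleans are 0 / all-ones, the i1
// compare result is sign-extended to the element type NVT, whereas a 0/1
// boolean scheme zero-extends instead; getExtendForContent makes that choice
// from TLI.getBooleanContents(OpVT).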
ISD::NodeType ExtendCode = TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); return DAG.getNode(ExtendCode, DL, NVT, Res); } //===----------------------------------------------------------------------===// // Operand Vector Scalarization <1 x ty> -> ty. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (!Res.getNode()) { switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif report_fatal_error("Do not know how to scalarize this operator's " "operand!\n"); case ISD::BITCAST: Res = ScalarizeVecOp_BITCAST(N); break; case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::TRUNCATE: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; case ISD::CONCAT_VECTORS: Res = ScalarizeVecOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::VSELECT: Res = ScalarizeVecOp_VSELECT(N); break; case ISD::SETCC: Res = ScalarizeVecOp_VSETCC(N); break; case ISD::STORE: Res = ScalarizeVecOp_STORE(cast(N), OpNo); break; case ISD::FP_ROUND: Res = ScalarizeVecOp_FP_ROUND(N, OpNo); break; } } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// If the value to convert is a vector that needs to be scalarized, it must be /// <1 x ty>. Convert the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) { SDValue Elt = GetScalarizedVector(N->getOperand(0)); return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Elt); } /// If the input is a vector that needs to be scalarized, it must be <1 x ty>. /// Do the operation on the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { assert(N->getValueType(0).getVectorNumElements() == 1 && "Unexpected vector type!"); SDValue Elt = GetScalarizedVector(N->getOperand(0)); SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0).getScalarType(), Elt); // Revectorize the result so the types line up with what the uses of this // expression expect. return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op); } /// The vectors to concatenate have length one - use a BUILD_VECTOR instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector Ops(N->getNumOperands()); for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) Ops[i] = GetScalarizedVector(N->getOperand(i)); return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops); } /// If the input is a vector that needs to be scalarized, it must be <1 x ty>, /// so just return the element, ignoring the index. SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { EVT VT = N->getValueType(0); SDValue Res = GetScalarizedVector(N->getOperand(0)); if (Res.getValueType() != VT) Res = VT.isFloatingPoint() ? 
DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res) : DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res); return Res; } /// If the input condition is a vector that needs to be scalarized, it must be /// <1 x i1>, so just convert to a normal ISD::SELECT /// (still with vector output type since that was acceptable if we got here). SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) { SDValue ScalarCond = GetScalarizedVector(N->getOperand(0)); EVT VT = N->getValueType(0); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1), N->getOperand(2)); } /// If the operand is a vector that needs to be scalarized then the /// result must be v1i1, so just convert to a scalar SETCC and wrap /// with a scalar_to_vector since the res type is legal if we got here SDValue DAGTypeLegalizer::ScalarizeVecOp_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); assert(N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type"); EVT VT = N->getValueType(0); SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); EVT OpVT = N->getOperand(0).getValueType(); EVT NVT = VT.getVectorElementType(); SDLoc DL(N); // Turn it into a scalar SETCC. SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2)); // Vectors may have a different boolean contents to scalars. Promote the // value appropriately. ISD::NodeType ExtendCode = TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); Res = DAG.getNode(ExtendCode, DL, NVT, Res); return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Res); } /// If the value to store is a vector that needs to be scalarized, it must be /// <1 x ty>. Just store the element. SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){ assert(N->isUnindexed() && "Indexed store of one-element vector?"); assert(OpNo == 1 && "Do not know how to scalarize this operand!"); SDLoc dl(N); if (N->isTruncatingStore()) return DAG.getTruncStore( N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), N->getBasePtr(), N->getPointerInfo(), N->getMemoryVT().getVectorElementType(), N->getAlignment(), N->getMemOperand()->getFlags(), N->getAAInfo()); return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), N->getBasePtr(), N->getPointerInfo(), N->getOriginalAlignment(), N->getMemOperand()->getFlags(), N->getAAInfo()); } /// If the value to round is a vector that needs to be scalarized, it must be /// <1 x ty>. Convert the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) { SDValue Elt = GetScalarizedVector(N->getOperand(0)); SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N), N->getValueType(0).getVectorElementType(), Elt, N->getOperand(1)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res); } //===----------------------------------------------------------------------===// // Result Vector Splitting //===----------------------------------------------------------------------===// /// This method is called when the specified result of the specified node is /// found to need vector splitting. At this point, the node may also have /// invalid operands or may have other results that need legalization, we just /// know that (at least) one result needs vector splitting. 
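/// For example, a v8i32 = add node whose type is not legal is rebuilt as two
/// v4i32 = add nodes over the Lo/Hi halves of its operands (see
/// SplitVecRes_BinOp below), and the pair is recorded with SetSplitVector.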
void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Split node result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "SplitVectorResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif report_fatal_error("Do not know how to split the result of this " "operator!\n"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::VSELECT: case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break; case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break; case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break; case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; case ISD::MLOAD: SplitVecRes_MLOAD(cast(N), Lo, Hi); break; case ISD::MGATHER: SplitVecRes_MGATHER(cast(N), Lo, Hi); break; case ISD::SETCC: SplitVecRes_SETCC(N, Lo, Hi); break; case ISD::VECTOR_SHUFFLE: SplitVecRes_VECTOR_SHUFFLE(cast(N), Lo, Hi); break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: SplitVecRes_ExtVecInRegOp(N, Lo, Hi); break; case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: case ISD::FEXP: case ISD::FEXP2: case ISD::FFLOOR: case ISD::FLOG: case ISD::FLOG10: case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: SplitVecRes_ExtendOp(N, Lo, Hi); break; case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::MULHS: case ISD::MULHU: case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: case ISD::SDIV: case ISD::UDIV: case ISD::FDIV: case ISD::FPOW: case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::UREM: case ISD::SREM: case ISD::FREM: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: SplitVecRes_TernaryOp(N, Lo, Hi); break; case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: case ISD::STRICT_FDIV: case ISD::STRICT_FSQRT: case ISD::STRICT_FMA: case ISD::STRICT_FPOW: case ISD::STRICT_FPOWI: case ISD::STRICT_FSIN: case ISD::STRICT_FCOS: case ISD::STRICT_FEXP: case ISD::STRICT_FEXP2: 
case ISD::STRICT_FLOG: case ISD::STRICT_FLOG10: case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: SplitVecRes_StrictFPOp(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. if (Lo.getNode()) SetSplitVector(SDValue(N, ResNo), Lo, Hi); } void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDValue RHSLo, RHSHi; GetSplitVector(N->getOperand(1), RHSLo, RHSHi); SDLoc dl(N); const SDNodeFlags Flags = N->getFlags(); unsigned Opcode = N->getOpcode(); Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags); Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags); } void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Op0Lo, Op0Hi; GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); SDValue Op1Lo, Op1Hi; GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); SDValue Op2Lo, Op2Hi; GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi); SDLoc dl(N); Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), Op0Lo, Op1Lo, Op2Lo); Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), Op0Hi, Op1Hi, Op2Hi); } void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. The input may be either a vector or a // scalar value. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); SDLoc dl(N); SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); // Handle some special cases efficiently. switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: case TargetLowering::TypePromoteInteger: case TargetLowering::TypePromoteFloat: case TargetLowering::TypeSoftenFloat: case TargetLowering::TypeScalarizeVector: case TargetLowering::TypeWidenVector: break; case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: // A scalar to vector conversion, where the scalar needs expansion. // If the vector is being split in two then we can just convert the // expanded pieces. if (LoVT == HiVT) { GetExpandedOp(InOp, Lo, Hi); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); return; } break; case TargetLowering::TypeSplitVector: // If the input is a vector that needs to be split, convert each split // piece of the input now. GetSplitVector(InOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); return; } // In the general case, convert the input to an integer and split it by hand. 
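// For illustration, assuming the result being split is v2i64 (so LoVT and
// HiVT are both v1i64): the input is bitcast to an i128, SplitInteger
// produces its low and high i64 halves, the halves are swapped on big-endian
// targets to preserve element order, and each half is bitcast to a v1i64
// Lo/Hi part.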
EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits()); EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits()); if (DAG.getDataLayout().isBigEndian()) std::swap(LoIntVT, HiIntVT); SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); } void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned LoNumElts = LoVT.getVectorNumElements(); SmallVector LoOps(N->op_begin(), N->op_begin()+LoNumElts); Lo = DAG.getBuildVector(LoVT, dl, LoOps); SmallVector HiOps(N->op_begin()+LoNumElts, N->op_end()); Hi = DAG.getBuildVector(HiVT, dl, HiOps); } void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS"); SDLoc dl(N); unsigned NumSubvectors = N->getNumOperands() / 2; if (NumSubvectors == 1) { Lo = N->getOperand(0); Hi = N->getOperand(1); return; } EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); SmallVector LoOps(N->op_begin(), N->op_begin()+NumSubvectors); Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps); SmallVector HiOps(N->op_begin()+NumSubvectors, N->op_end()); Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps); } void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); SDValue Idx = N->getOperand(1); SDLoc dl(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx); uint64_t IdxVal = cast(Idx)->getZExtValue(); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec, DAG.getConstant(IdxVal + LoVT.getVectorNumElements(), dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); SDValue Idx = N->getOperand(2); SDLoc dl(N); GetSplitVector(Vec, Lo, Hi); EVT VecVT = Vec.getValueType(); unsigned VecElems = VecVT.getVectorNumElements(); unsigned SubElems = SubVec.getValueType().getVectorNumElements(); // If we know the index is 0, and we know the subvector doesn't cross the // boundary between the halves, we can avoid spilling the vector, and insert // into the lower half of the split vector directly. // TODO: The IdxVal == 0 constraint is artificial, we could do this whenever // the index is constant and there is no boundary crossing. But those cases // don't seem to get hit in practice. if (ConstantSDNode *ConstIdx = dyn_cast(Idx)) { unsigned IdxVal = ConstIdx->getZExtValue(); if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) { EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx); return; } } // Spill the vector to the stack. SDValue StackPtr = DAG.CreateStackTemporary(VecVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Store the new subvector into the specified index. 
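// A worked example of the slow path below, assuming VecVT = v8i32 and a
// v2i32 subvector inserted at index 5: the full vector has been spilled to a
// stack slot, the subvector is stored at byte offset 5 * 4 = 20 within that
// slot via getVectorElementPointer, and the split halves are then reloaded
// as v4i32 from offsets 0 and 16 (IncrementSize = 128 bits / 8).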
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo()); // Load the Lo part from the stack slot. Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo()); // Increment the pointer to the other part. unsigned IncrementSize = Lo.getValueSizeInBits() / 8; StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, DAG.getConstant(IncrementSize, dl, StackPtr.getValueType())); // Load the Hi part from the stack slot. Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(), MinAlign(Alignment, IncrementSize)); } void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetSplitVector(N->getOperand(0), Lo, Hi); Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1)); } void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDLoc DL(N); SDValue RHSLo, RHSHi; SDValue RHS = N->getOperand(1); EVT RHSVT = RHS.getValueType(); if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) GetSplitVector(RHS, RHSLo, RHSHi); else std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo); Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi); } void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDLoc dl(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(cast(N->getOperand(1))->getVT()); Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, DAG.getValueType(LoVT)); Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, DAG.getValueType(HiVT)); } void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDLoc dl(N); SDValue InLo, InHi; if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(N0, InLo, InHi); else std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0); EVT InLoVT = InLo.getValueType(); unsigned InNumElements = InLoVT.getVectorNumElements(); EVT OutLoVT, OutHiVT; std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned OutNumElements = OutLoVT.getVectorNumElements(); assert((2 * OutNumElements) <= InNumElements && "Illegal extend vector in reg split"); // *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the // input vector (i.e. we only use InLo): // OutLo will extend the first OutNumElements from InLo. // OutHi will extend the next OutNumElements from InLo. // Shuffle the elements from InLo for OutHi into the bottom elements to // create a 'fake' InHi. 
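// A worked example of the mask built below, assuming InLo is v4i16 and each
// output half is v2i32 (so OutNumElements == 2): SplitHi becomes
// <2, 3, -1, -1>, which moves input elements 2 and 3 to the bottom of the
// shuffled vector that feeds OutHi, while OutLo extends elements 0 and 1 of
// InLo directly.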
SmallVector SplitHi(InNumElements, -1); for (unsigned i = 0; i != OutNumElements; ++i) SplitHi[i] = i + OutNumElements; InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi); Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo); Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi); } void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi) { unsigned NumOps = N->getNumOperands(); SDValue Chain = N->getOperand(0); EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); SmallVector OpsLo; SmallVector OpsHi; // The Chain is the first operand. OpsLo.push_back(Chain); OpsHi.push_back(Chain); // Now process the remaining operands. for (unsigned i = 1; i < NumOps; ++i) { SDValue Op = N->getOperand(i); SDValue OpLo = Op; SDValue OpHi = Op; EVT InVT = Op.getValueType(); if (InVT.isVector()) { // If the input also splits, handle it directly for a // compile time speedup. Otherwise split it by hand. if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) GetSplitVector(Op, OpLo, OpHi); else std::tie(OpLo, OpHi) = DAG.SplitVectorOperand(N, i); } OpsLo.push_back(OpLo); OpsHi.push_back(OpHi); } EVT LoValueVTs[] = {LoVT, MVT::Other}; EVT HiValueVTs[] = {HiVT, MVT::Other}; Lo = DAG.getNode(N->getOpcode(), dl, LoValueVTs, OpsLo); Hi = DAG.getNode(N->getOpcode(), dl, HiValueVTs, OpsHi); // Build a factor node to remember that this Op is independent of the // other one. Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Chain); } void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); SDValue Elt = N->getOperand(1); SDValue Idx = N->getOperand(2); SDLoc dl(N); GetSplitVector(Vec, Lo, Hi); if (ConstantSDNode *CIdx = dyn_cast(Idx)) { unsigned IdxVal = CIdx->getZExtValue(); unsigned LoNumElts = Lo.getValueType().getVectorNumElements(); if (IdxVal < LoNumElts) Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Lo.getValueType(), Lo, Elt, Idx); else Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt, DAG.getConstant(IdxVal - LoNumElts, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); return; } // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(0), true)) return; // Make the vector elements byte-addressable if they aren't already. EVT VecVT = Vec.getValueType(); EVT EltVT = VecVT.getVectorElementType(); if (VecVT.getScalarSizeInBits() < 8) { EltVT = MVT::i8; VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, VecVT.getVectorNumElements()); Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec); // Extend the element type to match if needed. if (EltVT.bitsGT(Elt.getValueType())) Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt); } // Spill the vector to the stack. SDValue StackPtr = DAG.CreateStackTemporary(VecVT); auto &MF = DAG.getMachineFunction(); auto FrameIndex = cast(StackPtr.getNode())->getIndex(); auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); // Store the new element. This may be larger than the vector element type, // so use a truncating store. 
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); // Load the Lo part from the stack slot. Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo); // Increment the pointer to the other part. unsigned IncrementSize = LoVT.getSizeInBits() / 8; StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, DAG.getConstant(IncrementSize, dl, StackPtr.getValueType())); // Load the Hi part from the stack slot. Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, PtrInfo.getWithOffset(IncrementSize), MinAlign(Alignment, IncrementSize)); // If we adjusted the original type, we need to truncate the results. std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); if (LoVT != Lo.getValueType()) Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo); if (HiVT != Hi.getValueType()) Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi); } void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0)); Hi = DAG.getUNDEF(HiVT); } void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi) { assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!"); EVT LoVT, HiVT; SDLoc dl(LD); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0)); ISD::LoadExtType ExtType = LD->getExtensionType(); SDValue Ch = LD->getChain(); SDValue Ptr = LD->getBasePtr(); SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); EVT MemoryVT = LD->getMemoryVT(); unsigned Alignment = LD->getOriginalAlignment(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, LD->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize); Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT, Alignment, MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(LD, 1), Ch); } void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(MLD); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); SDValue Mask = MLD->getMask(); SDValue Src0 = MLD->getSrc0(); unsigned Alignment = MLD->getOriginalAlignment(); ISD::LoadExtType ExtType = MLD->getExtensionType(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? 
Alignment/2 : Alignment; // Split Mask operand SDValue MaskLo, MaskHi; if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Mask, MaskLo, MaskHi); else std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MLD->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Src0, Src0Lo, Src0Hi); else std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO, ExtType, MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, MLD->isExpandingLoad()); unsigned HiOffset = LoMemVT.getStoreSize(); MMO = DAG.getMachineFunction().getMachineMemOperand( MLD->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO, ExtType, MLD->isExpandingLoad()); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MLD, 1), Ch); } void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(MGT); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0)); SDValue Ch = MGT->getChain(); SDValue Ptr = MGT->getBasePtr(); SDValue Mask = MGT->getMask(); SDValue Src0 = MGT->getValue(); SDValue Index = MGT->getIndex(); SDValue Scale = MGT->getScale(); unsigned Alignment = MGT->getOriginalAlignment(); // Split Mask operand SDValue MaskLo, MaskHi; if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Mask, MaskLo, MaskHi); else std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; // Split MemoryVT std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Src0, Src0Lo, Src0Hi); else std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Index, IndexLo, IndexHi); else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MGT->getAAInfo(), MGT->getRanges()); SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo, MMO); SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi, Scale}; Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi, MMO); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(MGT, 1), Ch); } void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); EVT LoVT, HiVT; SDLoc DL(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // If the input also splits, handle it directly. Otherwise split it by hand. SDValue LL, LH, RL, RH; if (getTypeAction(N->getOperand(0).getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(N->getOperand(0), LL, LH); else std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); if (getTypeAction(N->getOperand(1).getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(N->getOperand(1), RL, RH); else std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); } void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi) { // Get the dest types - they may not match the input types, e.g. int_to_fp. EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // If the input also splits, handle it directly for a compile time speedup. // Otherwise split it by hand. EVT InVT = N->getOperand(0).getValueType(); if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) GetSplitVector(N->getOperand(0), Lo, Hi); else std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); if (N->getOpcode() == ISD::FP_ROUND) { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1)); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1)); } else { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); } } void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT SrcVT = N->getOperand(0).getValueType(); EVT DestVT = N->getValueType(0); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT); // We can do better than a generic split operation if the extend is doing // more than just doubling the width of the elements and the following are // true: // - The number of vector elements is even, // - the source type is legal, // - the type of a split source is illegal, // - the type of an extended (by doubling element size) source is legal, and // - the type of that extended source when split is legal. // // This won't necessarily completely legalize the operation, but it will // more effectively move in the right direction and prevent falling down // to scalarization in many cases due to the input vector being split too // far. unsigned NumElements = SrcVT.getVectorNumElements(); if ((NumElements & 1) == 0 && SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) { LLVMContext &Ctx = *DAG.getContext(); EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx); EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx); EVT SplitLoVT, SplitHiVT; std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT); if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) && TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) { LLVM_DEBUG(dbgs() << "Split vector extend via incremental extend:"; N->dump(&DAG); dbgs() << "\n"); // Extend the source vector by one step. SDValue NewSrc = DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0)); // Get the low and high halves of the new, extended one step, vector. std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl); // Extend those vector halves the rest of the way. 
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); return; } } // Fall back to the generic unary operator splitting otherwise. SplitVecRes_UnaryOp(N, Lo, Hi); } void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi) { // The low and high parts of the original input give four input vectors. SDValue Inputs[4]; SDLoc dl(N); GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]); GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]); EVT NewVT = Inputs[0].getValueType(); unsigned NewElts = NewVT.getVectorNumElements(); // If Lo or Hi uses elements from at most two of the four input vectors, then // express it as a vector shuffle of those two inputs. Otherwise extract the // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR. SmallVector Ops; for (unsigned High = 0; High < 2; ++High) { SDValue &Output = High ? Hi : Lo; // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands (recorded in InputUsed). // If building a suitable shuffle vector proves too hard, then bail // out with useBuildVector set. unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered. unsigned FirstMaskIdx = High * NewElts; bool useBuildVector = false; for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { // The mask element. This indexes into the input. int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset); // The input vector this mask element indexes into. unsigned Input = (unsigned)Idx / NewElts; if (Input >= array_lengthof(Inputs)) { // The mask element does not index into any input vector. Ops.push_back(-1); continue; } // Turn the index into an offset from the start of the input vector. Idx -= Input * NewElts; // Find or create a shuffle vector operand to hold this input. unsigned OpNo; for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { if (InputUsed[OpNo] == Input) { // This input vector is already an operand. break; } else if (InputUsed[OpNo] == -1U) { // Create a new operand for this input vector. InputUsed[OpNo] = Input; break; } } if (OpNo >= array_lengthof(InputUsed)) { // More than two input vectors used! Give up on trying to create a // shuffle vector. Insert all elements into a BUILD_VECTOR instead. useBuildVector = true; break; } // Add the mask index for the new shuffle vector. Ops.push_back(Idx + OpNo * NewElts); } if (useBuildVector) { EVT EltVT = NewVT.getVectorElementType(); SmallVector SVOps; // Extract the input elements by hand. for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { // The mask element. This indexes into the input. int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset); // The input vector this mask element indexes into. unsigned Input = (unsigned)Idx / NewElts; if (Input >= array_lengthof(Inputs)) { // The mask element is "undef" or indexes off the end of the input. SVOps.push_back(DAG.getUNDEF(EltVT)); continue; } // Turn the index into an offset from the start of the input vector. Idx -= Input * NewElts; // Extract the vector element by hand. SVOps.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Inputs[Input], DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); } // Construct the Lo/Hi output using a BUILD_VECTOR. Output = DAG.getBuildVector(NewVT, dl, SVOps); } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. 
      Output = DAG.getUNDEF(NewVT);
    } else {
      SDValue Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = InputUsed[1] == -1U ?
        DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
    }

    Ops.clear();
  }
}


//===----------------------------------------------------------------------===//
//  Operand Vector Splitting
//===----------------------------------------------------------------------===//

/// This method is called when the specified operand of the specified node is
/// found to need vector splitting. At this point, all of the result types of
/// the node are known to be legal, but other operands of the node may need
/// legalization as well as the specified one.
bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
  LLVM_DEBUG(dbgs() << "Split node operand: "; N->dump(&DAG); dbgs() << "\n");
  SDValue Res = SDValue();

  // See if the target wants to custom split this node.
  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
    return false;

  if (!Res.getNode()) {
    switch (N->getOpcode()) {
    default:
#ifndef NDEBUG
      dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
      N->dump(&DAG);
      dbgs() << "\n";
#endif
      report_fatal_error("Do not know how to split this operator's "
                         "operand!\n");

    case ISD::SETCC:             Res = SplitVecOp_VSETCC(N); break;
    case ISD::BITCAST:           Res = SplitVecOp_BITCAST(N); break;
    case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
    case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
    case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
    case ISD::TRUNCATE:          Res = SplitVecOp_TruncateHelper(N); break;
    case ISD::FP_ROUND:          Res = SplitVecOp_FP_ROUND(N); break;
    case ISD::FCOPYSIGN:         Res = SplitVecOp_FCOPYSIGN(N); break;
    case ISD::STORE:
      Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
      break;
    case ISD::MSTORE:
      Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
      break;
    case ISD::MSCATTER:
      Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
      break;
    case ISD::MGATHER:
      Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
      break;
    case ISD::VSELECT:
      Res = SplitVecOp_VSELECT(N, OpNo);
      break;
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
      if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
        Res = SplitVecOp_TruncateHelper(N);
      else
        Res = SplitVecOp_UnaryOp(N);
      break;
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
        Res = SplitVecOp_TruncateHelper(N);
      else
        Res = SplitVecOp_UnaryOp(N);
      break;
    case ISD::CTTZ:
    case ISD::CTLZ:
    case ISD::CTPOP:
    case ISD::FP_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
    case ISD::FTRUNC:
    case ISD::FCANONICALIZE:
      Res = SplitVecOp_UnaryOp(N);
      break;

    case ISD::ANY_EXTEND_VECTOR_INREG:
    case ISD::SIGN_EXTEND_VECTOR_INREG:
    case ISD::ZERO_EXTEND_VECTOR_INREG:
      Res = SplitVecOp_ExtVecInRegOp(N);
      break;

    case ISD::VECREDUCE_FADD:
    case ISD::VECREDUCE_FMUL:
    case ISD::VECREDUCE_ADD:
    case ISD::VECREDUCE_MUL:
    case ISD::VECREDUCE_AND:
    case ISD::VECREDUCE_OR:
    case ISD::VECREDUCE_XOR:
    case ISD::VECREDUCE_SMAX:
    case ISD::VECREDUCE_SMIN:
    case ISD::VECREDUCE_UMAX:
    case ISD::VECREDUCE_UMIN:
    case ISD::VECREDUCE_FMAX:
    case ISD::VECREDUCE_FMIN:
      Res = SplitVecOp_VECREDUCE(N, OpNo);
      break;
    }
  }

  // If the result is null, the sub-method took care of registering results etc.
  if (!Res.getNode())
    return false;

  // If the result is N, the sub-method updated N in place. Tell the legalizer
  // core about this.
if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { // The only possibility for an illegal operand is the mask, since result type // legalization would have handled this node already otherwise. assert(OpNo == 0 && "Illegal operand must be mask"); SDValue Mask = N->getOperand(0); SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); EVT Src0VT = Src0.getValueType(); SDLoc DL(N); assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?"); SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); assert(Lo.getValueType() == Hi.getValueType() && "Lo and Hi have differing types"); EVT LoOpVT, HiOpVT; std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT); assert(LoOpVT == HiOpVT && "Asymmetric vector split?"); SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask; std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL); std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL); std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL); SDValue LoSelect = DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1); SDValue HiSelect = DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1); return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { EVT ResVT = N->getValueType(0); SDValue Lo, Hi; SDLoc dl(N); SDValue VecOp = N->getOperand(OpNo); EVT VecVT = VecOp.getValueType(); assert(VecVT.isVector() && "Can only split reduce vector operand"); GetSplitVector(VecOp, Lo, Hi); EVT LoOpVT, HiOpVT; std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); bool NoNaN = N->getFlags().hasNoNaNs(); unsigned CombineOpc = 0; switch (N->getOpcode()) { case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break; case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break; case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break; case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break; case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break; case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break; case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break; case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; case ISD::VECREDUCE_FMAX: CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN; break; case ISD::VECREDUCE_FMIN: CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN; break; default: llvm_unreachable("Unexpected reduce ISD node"); } // Use the appropriate scalar instruction on the split subvectors before // reducing the now partially reduced smaller vector. SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi, N->getFlags()); return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, N->getFlags()); } SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. 
  EVT ResVT = N->getValueType(0);
  SDValue Lo, Hi;
  SDLoc dl(N);
  GetSplitVector(N->getOperand(0), Lo, Hi);
  EVT InVT = Lo.getValueType();

  EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
                               InVT.getVectorNumElements());

  Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
  Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}

SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
  // For example, i64 = BITCAST v4i16 on alpha.  Typically the vector will
  // end up being split all the way down to individual components.  Convert the
  // split pieces into integers and reassemble.
  SDValue Lo, Hi;
  GetSplitVector(N->getOperand(0), Lo, Hi);
  Lo = BitConvertToInteger(Lo);
  Hi = BitConvertToInteger(Hi);

  if (DAG.getDataLayout().isBigEndian())
    std::swap(Lo, Hi);

  return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
                     JoinIntegers(Lo, Hi));
}

SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
  // We know that the extracted result type is legal.
  EVT SubVT = N->getValueType(0);
  SDValue Idx = N->getOperand(1);
  SDLoc dl(N);
  SDValue Lo, Hi;
  GetSplitVector(N->getOperand(0), Lo, Hi);

  uint64_t LoElts = Lo.getValueType().getVectorNumElements();
  uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  if (IdxVal < LoElts) {
    assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
           "Extracted subvector crosses vector split!");
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
  } else {
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
                       DAG.getConstant(IdxVal - LoElts, dl,
                                       Idx.getValueType()));
  }
}

SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(1);
  EVT VecVT = Vec.getValueType();

  if (isa<ConstantSDNode>(Idx)) {
    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!");

    SDValue Lo, Hi;
    GetSplitVector(Vec, Lo, Hi);

    uint64_t LoElts = Lo.getValueType().getVectorNumElements();

    if (IdxVal < LoElts)
      return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
    return SDValue(DAG.UpdateNodeOperands(N, Hi,
                                  DAG.getConstant(IdxVal - LoElts, SDLoc(N),
                                                  Idx.getValueType())), 0);
  }

  // See if the target wants to custom expand this node.
  if (CustomLowerNode(N, N->getValueType(0), true))
    return SDValue();

  // Make the vector elements byte-addressable if they aren't already.
  SDLoc dl(N);
  EVT EltVT = VecVT.getVectorElementType();
  if (VecVT.getScalarSizeInBits() < 8) {
    EltVT = MVT::i8;
    VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                             VecVT.getVectorNumElements());
    Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
  }

  // Store the vector to the stack.
  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
  auto &MF = DAG.getMachineFunction();
  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);

  // Load back the required element.
  StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
  return DAG.getExtLoad(
      ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
}

SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
  SDValue Lo, Hi;

  // *_EXTEND_VECTOR_INREG only reference the lower half of the input, so
  // splitting the result has the same effect as splitting the input operand.
SplitVecRes_ExtVecInRegOp(N, Lo, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo) { EVT LoVT, HiVT; SDLoc dl(MGT); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0)); SDValue Ch = MGT->getChain(); SDValue Ptr = MGT->getBasePtr(); SDValue Index = MGT->getIndex(); SDValue Scale = MGT->getScale(); SDValue Mask = MGT->getMask(); SDValue Src0 = MGT->getValue(); unsigned Alignment = MGT->getOriginalAlignment(); SDValue MaskLo, MaskHi; if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) // Split Mask operand GetSplitVector(Mask, MaskLo, MaskHi); else std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Src0, Src0Lo, Src0Hi); else std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Index, IndexLo, IndexHi); else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MGT->getAAInfo(), MGT->getRanges()); SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo, Scale}; SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo, MMO); MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment, MGT->getAAInfo(), MGT->getRanges()); SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi, Scale}; SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi, MMO); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MGT, 1), Ch); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo, Hi); ReplaceValueWith(SDValue(MGT, 0), Res); return SDValue(); } SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); SDValue Mask = N->getMask(); SDValue Data = N->getValue(); EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) // Split Data operand GetSplitVector(Data, DataLo, DataHi); else std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); SDValue MaskLo, MaskHi; if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) // Split Mask operand GetSplitVector(Mask, MaskLo, MaskHi); else std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == Data->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction(). 
getMachineMemOperand(N->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, N->isTruncatingStore(), N->isCompressingStore()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, N->isCompressingStore()); unsigned HiOffset = LoMemVT.getStoreSize(); MMO = DAG.getMachineFunction().getMachineMemOperand( N->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment, N->getAAInfo(), N->getRanges()); Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, N->isTruncatingStore(), N->isCompressingStore()); // Build a factor node to remember that this store is independent of the // other one. return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo) { SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); SDValue Mask = N->getMask(); SDValue Index = N->getIndex(); SDValue Scale = N->getScale(); SDValue Data = N->getValue(); EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); // Split all operands EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) // Split Data operand GetSplitVector(Data, DataLo, DataHi); else std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); SDValue MaskLo, MaskHi; if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) // Split Mask operand GetSplitVector(Mask, MaskLo, MaskHi); else std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); SDValue IndexHi, IndexLo; if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Index, IndexLo, IndexHi); else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); SDValue Lo; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(N->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO); MMO = DAG.getMachineFunction(). getMachineMemOperand(N->getPointerInfo(), MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); // The order of the Scatter operation after split is well defined. The "Hi" // part comes after the "Lo". So these two operations should be chained one // after another. SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), DL, OpsHi, MMO); } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { assert(N->isUnindexed() && "Indexed store of vector?"); assert(OpNo == 1 && "Can only split the stored value"); SDLoc DL(N); bool isTruncating = N->isTruncatingStore(); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); AAMDNodes AAInfo = N->getAAInfo(); SDValue Lo, Hi; GetSplitVector(N->getOperand(1), Lo, Hi); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); // Scalarize if the split halves are not byte-sized. 
if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized()) return TLI.scalarizeVectorStore(N, DAG); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; if (isTruncating) Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo); else Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getObjectPtrOffset(DL, Ptr, IncrementSize); if (isTruncating) Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), HiMemVT, Alignment, MMOFlags, AAInfo); else Hi = DAG.getStore(Ch, DL, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), Alignment, MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { SDLoc DL(N); // The input operands all must have the same type, and we know the result // type is valid. Convert this to a buildvector which extracts all the // input elements. // TODO: If the input elements are power-two vectors, we could convert this to // a new CONCAT_VECTORS node with elements that are half-wide. SmallVector Elts; EVT EltVT = N->getValueType(0).getVectorElementType(); for (const SDValue &Op : N->op_values()) { for (unsigned i = 0, e = Op.getValueType().getVectorNumElements(); i != e; ++i) { Elts.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op, DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())))); } } return DAG.getBuildVector(N->getValueType(0), DL, Elts); } SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { // The result type is legal, but the input type is illegal. If splitting // ends up with the result type of each half still being legal, just // do that. If, however, that would result in an illegal result type, // we can try to get more clever with power-two vectors. Specifically, // split the input type, but also widen the result element size, then // concatenate the halves and truncate again. For example, consider a target // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do: // %inlo = v4i32 extract_subvector %in, 0 // %inhi = v4i32 extract_subvector %in, 4 // %lo16 = v4i16 trunc v4i32 %inlo // %hi16 = v4i16 trunc v4i32 %inhi // %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16 // %res = v8i8 trunc v8i16 %in16 // // Without this transform, the original truncate would end up being // scalarized, which is pretty much always a last resort. SDValue InVec = N->getOperand(0); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); unsigned NumElements = OutVT.getVectorNumElements(); bool IsFloat = OutVT.isFloatingPoint(); // Widening should have already made sure this is a power-two vector // if we're trying to split it at all. assert() that's true, just in case. assert(!(NumElements & 1) && "Splitting vector, but not in half!"); unsigned InElementSize = InVT.getScalarSizeInBits(); unsigned OutElementSize = OutVT.getScalarSizeInBits(); // If the input elements are only 1/2 the width of the result elements, // just use the normal splitting. Our trick only work if there's room // to split more than once. if (InElementSize <= OutElementSize * 2) return SplitVecOp_UnaryOp(N); SDLoc DL(N); // Get the split input vector. SDValue InLoVec, InHiVec; GetSplitVector(InVec, InLoVec, InHiVec); // Truncate them to 1/2 the element size. EVT HalfElementVT = IsFloat ? 
EVT::getFloatingPointVT(InElementSize/2) : EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements/2); SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); // Concatenate them to get the full intermediate truncation result. EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, HalfHi); // Now finish up by truncating all the way down to the original result // type. This should normally be something that ends up being legal directly, // but in theory if a target has very wide vectors and an annoyingly // restricted set of legal types, this split can chain to build things up. return IsFloat ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec, DAG.getTargetConstant( 0, DL, TLI.getPointerTy(DAG.getDataLayout()))) : DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec); } SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); // The result has a legal vector type, but the input needs splitting. SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes; SDLoc DL(N); GetSplitVector(N->getOperand(0), Lo0, Hi0); GetSplitVector(N->getOperand(1), Lo1, Hi1); unsigned PartElements = Lo0.getValueType().getVectorNumElements(); EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements); EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements); LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2)); HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2)); SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes); return PromoteTargetBoolean(Con, N->getValueType(0)); } SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); SDValue Lo, Hi; SDLoc DL(N); GetSplitVector(N->getOperand(0), Lo, Hi); EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), InVT.getVectorNumElements()); Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { // The result (and the first input) has a legal vector type, but the second // input needs splitting. return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements()); } //===----------------------------------------------------------------------===// // Result Vector Widening //===----------------------------------------------------------------------===// void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Widen node result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); // See if the target wants to custom widen this node. 
  if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
    return;

  SDValue Res = SDValue();
  switch (N->getOpcode()) {
  default:
#ifndef NDEBUG
    dbgs() << "WidenVectorResult #" << ResNo << ": ";
    N->dump(&DAG);
    dbgs() << "\n";
#endif
    llvm_unreachable("Do not know how to widen the result of this operator!");

  case ISD::MERGE_VALUES:      Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
  case ISD::BITCAST:           Res = WidenVecRes_BITCAST(N); break;
  case ISD::BUILD_VECTOR:      Res = WidenVecRes_BUILD_VECTOR(N); break;
  case ISD::CONCAT_VECTORS:    Res = WidenVecRes_CONCAT_VECTORS(N); break;
  case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
  case ISD::FP_ROUND_INREG:    Res = WidenVecRes_InregOp(N); break;
  case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
  case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
  case ISD::SCALAR_TO_VECTOR:  Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
  case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
  case ISD::VSELECT:
  case ISD::SELECT:            Res = WidenVecRes_SELECT(N); break;
  case ISD::SELECT_CC:         Res = WidenVecRes_SELECT_CC(N); break;
  case ISD::SETCC:             Res = WidenVecRes_SETCC(N); break;
  case ISD::UNDEF:             Res = WidenVecRes_UNDEF(N); break;
  case ISD::VECTOR_SHUFFLE:
    Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
    break;
  case ISD::MLOAD:
    Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
    break;
  case ISD::MGATHER:
    Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
    break;

  case ISD::ADD:
  case ISD::AND:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::OR:
  case ISD::SUB:
  case ISD::XOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNAN:
  case ISD::FMAXNAN:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    Res = WidenVecRes_Binary(N);
    break;

  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FPOW:
  case ISD::FSUB:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
    Res = WidenVecRes_BinaryCanTrap(N);
    break;

  case ISD::FCOPYSIGN:
    Res = WidenVecRes_FCOPYSIGN(N);
    break;

  case ISD::FPOWI:
    Res = WidenVecRes_POWI(N);
    break;

  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    Res = WidenVecRes_Shift(N);
    break;

  case ISD::ANY_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
    break;

  case ISD::ANY_EXTEND:
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::SIGN_EXTEND:
  case ISD::SINT_TO_FP:
  case ISD::TRUNCATE:
  case ISD::UINT_TO_FP:
  case ISD::ZERO_EXTEND:
    Res = WidenVecRes_Convert(N);
    break;

  case ISD::BITREVERSE:
  case ISD::BSWAP:
  case ISD::CTLZ:
  case ISD::CTPOP:
  case ISD::CTTZ:
  case ISD::FABS:
  case ISD::FCEIL:
  case ISD::FCOS:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FFLOOR:
  case ISD::FLOG:
  case ISD::FLOG10:
  case ISD::FLOG2:
  case ISD::FNEARBYINT:
  case ISD::FNEG:
  case ISD::FRINT:
  case ISD::FROUND:
  case ISD::FSIN:
  case ISD::FSQRT:
  case ISD::FTRUNC:
    Res = WidenVecRes_Unary(N);
    break;
  case ISD::FMA:
    Res = WidenVecRes_Ternary(N);
    break;
  }

  // If Res is null, the sub-method took care of registering the result.
  if (Res.getNode())
    SetWidenedVector(SDValue(N, ResNo), Res);
}

SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
  // Ternary op widening.
SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); SDValue InOp3 = GetWidenedVector(N->getOperand(2)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3); } SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { // Binary op widening. SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags()); } SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { // Binary op widening for operations that can trap. unsigned Opcode = N->getOpcode(); SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); const SDNodeFlags Flags = N->getFlags(); while (!TLI.isTypeLegal(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) { // Operation doesn't trap so just widen as normal. SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags); } // No legal vector version so unroll the vector operation and then widen. if (NumElts == 1) return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); // Since the operation can trap, apply operation on the original vector. EVT MaxVT = VT; SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); unsigned CurNumElts = N->getValueType(0).getVectorNumElements(); SmallVector ConcatOps(CurNumElts); unsigned ConcatEnd = 0; // Current ConcatOps index. int Idx = 0; // Current Idx into input vectors. // NumElts := greatest legal vector size (at most WidenVT) // while (orig. vector has unhandled elements) { // take munches of size NumElts from the beginning and add to ConcatOps // NumElts := next smaller supported vector size or 1 // } while (CurNumElts != 0) { while (CurNumElts >= NumElts) { SDValue EOp1 = DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); SDValue EOp2 = DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags); Idx += NumElts; CurNumElts -= NumElts; } do { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } while (!TLI.isTypeLegal(VT) && NumElts != 1); if (NumElts == 1) { for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) { SDValue EOp1 = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); SDValue EOp2 = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, EOp1, EOp2, Flags); } CurNumElts = 0; } } // Check to see if we have a single operation with the widen type. 
if (ConcatEnd == 1) { VT = ConcatOps[0].getValueType(); if (VT == WidenVT) return ConcatOps[0]; } // while (Some element of ConcatOps is not of type MaxVT) { // From the end of ConcatOps, collect elements of the same type and put // them into an op of the next larger supported type // } while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) { Idx = ConcatEnd - 1; VT = ConcatOps[Idx--].getValueType(); while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT) Idx--; int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1; EVT NextVT; do { NextSize *= 2; NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize); } while (!TLI.isTypeLegal(NextVT)); if (!VT.isVector()) { // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT SDValue VecOp = DAG.getUNDEF(NextVT); unsigned NumToInsert = ConcatEnd - Idx - 1; for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) { VecOp = DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, ConcatOps[OpIdx], DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } ConcatOps[Idx+1] = VecOp; ConcatEnd = Idx + 2; } else { // Vector type, create a CONCAT_VECTORS of type NextVT SDValue undefVec = DAG.getUNDEF(VT); unsigned OpsToConcat = NextSize/VT.getVectorNumElements(); SmallVector SubConcatOps(OpsToConcat); unsigned RealVals = ConcatEnd - Idx - 1; unsigned SubConcatEnd = 0; unsigned SubConcatIdx = Idx + 1; while (SubConcatEnd < RealVals) SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx]; while (SubConcatEnd < OpsToConcat) SubConcatOps[SubConcatEnd++] = undefVec; ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NextVT, SubConcatOps); ConcatEnd = SubConcatIdx + 1; } } // Check to see if we have a single operation with the widen type. if (ConcatEnd == 1) { VT = ConcatOps[0].getValueType(); if (VT == WidenVT) return ConcatOps[0]; } // add undefs of size MaxVT until ConcatOps grows to length of WidenVT unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements(); if (NumOps != ConcatEnd ) { SDValue UndefVal = DAG.getUNDEF(MaxVT); for (unsigned j = ConcatEnd; j < NumOps; ++j) ConcatOps[j] = UndefVal; } return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, makeArrayRef(ConcatOps.data(), NumOps)); } SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { SDValue InOp = N->getOperand(0); SDLoc DL(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); EVT InVT = InOp.getValueType(); EVT InEltVT = InVT.getVectorElementType(); EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts); unsigned Opcode = N->getOpcode(); unsigned InVTNumElts = InVT.getVectorNumElements(); const SDNodeFlags Flags = N->getFlags(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); InVTNumElts = InVT.getVectorNumElements(); if (InVTNumElts == WidenNumElts) { if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InOp); return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags); } if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) { // If both input and result vector types are of same width, extend // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which // accepts fewer elements in the result than in the input. 
if (Opcode == ISD::SIGN_EXTEND) return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT); if (Opcode == ISD::ZERO_EXTEND) return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT); } } if (TLI.isTypeLegal(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input // and then widening it. To avoid this, we widen the input only if // it results in a legal type. if (WidenNumElts % InVTNumElts == 0) { // Widen the input and call convert on the widened input vector. unsigned NumConcat = WidenNumElts/InVTNumElts; SmallVector Ops(NumConcat); Ops[0] = InOp; SDValue UndefVal = DAG.getUNDEF(InVT); for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops); if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVec); return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags); } if (InVTNumElts % WidenNumElts == 0) { SDValue InVal = DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); // Extract the input and convert the shorten input vector. if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVal); return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags); } } // Otherwise unroll into some nasty scalar code and rebuild the vector. SmallVector Ops(WidenNumElts); EVT EltVT = WidenVT.getVectorElementType(); unsigned MinElts = std::min(InVTNumElts, WidenNumElts); unsigned i; for (i=0; i < MinElts; ++i) { SDValue Val = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp, DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); if (N->getNumOperands() == 1) Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val); else Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getBuildVector(WidenVT, DL, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) { unsigned Opcode = N->getOpcode(); SDValue InOp = N->getOperand(0); SDLoc DL(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT WidenSVT = WidenVT.getVectorElementType(); unsigned WidenNumElts = WidenVT.getVectorNumElements(); EVT InVT = InOp.getValueType(); EVT InSVT = InVT.getVectorElementType(); unsigned InVTNumElts = InVT.getVectorNumElements(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(InOp); InVT = InOp.getValueType(); if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) { switch (Opcode) { case ISD::ANY_EXTEND_VECTOR_INREG: return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT); case ISD::SIGN_EXTEND_VECTOR_INREG: return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT); case ISD::ZERO_EXTEND_VECTOR_INREG: return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT); } } } // Unroll, extend the scalars and rebuild the vector. 
SmallVector Ops; for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp, DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); switch (Opcode) { case ISD::ANY_EXTEND_VECTOR_INREG: Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val); break; case ISD::SIGN_EXTEND_VECTOR_INREG: Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val); break; case ISD::ZERO_EXTEND_VECTOR_INREG: Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val); break; default: llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected"); } Ops.push_back(Val); } while (Ops.size() != WidenNumElts) Ops.push_back(DAG.getUNDEF(WidenSVT)); return DAG.getBuildVector(WidenVT, DL, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { // If this is an FCOPYSIGN with same input types, we can treat it as a // normal (can trap) binary op. if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType()) return WidenVecRes_BinaryCanTrap(N); // If the types are different, fall back to unrolling. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); } SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); SDValue ShOp = N->getOperand(1); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); } SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); SDValue ShOp = N->getOperand(1); EVT ShVT = ShOp.getValueType(); if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) { ShOp = GetWidenedVector(ShOp); ShVT = ShOp.getValueType(); } EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(), ShVT.getVectorElementType(), WidenVT.getVectorNumElements()); if (ShVT != ShWidenVT) ShOp = ModifyToType(ShOp, ShWidenVT); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); } SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { // Unary op widening. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp); } SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), cast(N->getOperand(1))->getVT() .getVectorElementType(), WidenVT.getVectorNumElements()); SDValue WidenLHS = GetWidenedVector(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, WidenLHS, DAG.getValueType(ExtVT)); } SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo); return GetWidenedVector(WidenVec); } SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); EVT VT = N->getValueType(0); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: break; case TargetLowering::TypePromoteInteger: // If the incoming type is a vector that is being promoted, then // we know that the elements are arranged differently and that we // must perform the conversion using a stack slot. 
if (InVT.isVector()) break; // If the InOp is promoted to the same size, convert it. Otherwise, // fall out of the switch and widen the promoted input. InOp = GetPromotedInteger(InOp); InVT = InOp.getValueType(); if (WidenVT.bitsEq(InVT)) return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp); break; case TargetLowering::TypeSoftenFloat: case TargetLowering::TypePromoteFloat: case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: case TargetLowering::TypeScalarizeVector: case TargetLowering::TypeSplitVector: break; case TargetLowering::TypeWidenVector: // If the InOp is widened to the same size, convert it. Otherwise, fall // out of the switch and widen the widened input. InOp = GetWidenedVector(InOp); InVT = InOp.getValueType(); if (WidenVT.bitsEq(InVT)) // The input widens to the same size. Convert to the widen value. return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp); break; } unsigned WidenSize = WidenVT.getSizeInBits(); unsigned InSize = InVT.getSizeInBits(); // x86mmx is not an acceptable vector element type, so don't try. if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) { // Determine new input vector type. The new input vector type will use // the same element type (if its a vector) or use the input type as a // vector. It is the same size as the type to widen to. EVT NewInVT; unsigned NewNumElts = WidenSize / InSize; if (InVT.isVector()) { EVT InEltVT = InVT.getVectorElementType(); NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenSize / InEltVT.getSizeInBits()); } else { NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts); } if (TLI.isTypeLegal(NewInVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input // and then widening it. To avoid this, we widen the input only if // it results in a legal type. SmallVector Ops(NewNumElts); SDValue UndefVal = DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i < NewNumElts; ++i) Ops[i] = UndefVal; SDValue NewVec; if (InVT.isVector()) NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops); else NewVec = DAG.getBuildVector(NewInVT, dl, Ops); return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec); } } return CreateStackStoreLoad(InOp, WidenVT); } SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); // Build a vector with undefined for the new nodes. EVT VT = N->getValueType(0); // Integer BUILD_VECTOR operands may be larger than the node's vector element // type. The UNDEFs need to have the same type as the existing operands. EVT EltVT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector NewOps(N->op_begin(), N->op_end()); assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); return DAG.getBuildVector(WidenVT, dl, NewOps); } SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { EVT InVT = N->getOperand(0).getValueType(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); unsigned WidenNumElts = WidenVT.getVectorNumElements(); unsigned NumInElts = InVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); bool InputWidened = false; // Indicates we need to widen the input. 
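// Illustrative sketch (not part of the imported sources): for the BITCAST
// widening above, the rebuilt input vector type is sized purely by bit
// widths.  When the original input is a vector, its element type is kept and
// the lane count is WidenSize / InEltSize; otherwise the scalar input becomes
// one lane of a WidenSize / InSize element vector.  The helper below is a
// hypothetical, stand-alone model of that computation and assumes the exact
// divisibility already checked by the code above.
struct NewInputShape {
  unsigned NumElts;      // lane count of the rebuilt input vector
  bool KeepsElementType; // true when the original input was itself a vector
};
static NewInputShape newInputShape(unsigned WidenSizeBits, unsigned InSizeBits,
                                   unsigned InEltSizeBits, bool InIsVector) {
  if (InIsVector)
    return {WidenSizeBits / InEltSizeBits, true};
  return {WidenSizeBits / InSizeBits, false};
}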
if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { // Add undef vectors to widen to correct length. unsigned NumConcat = WidenVT.getVectorNumElements() / InVT.getVectorNumElements(); SDValue UndefVal = DAG.getUNDEF(InVT); SmallVector Ops(NumConcat); for (unsigned i=0; i < NumOperands; ++i) Ops[i] = N->getOperand(i); for (unsigned i = NumOperands; i != NumConcat; ++i) Ops[i] = UndefVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops); } } else { InputWidened = true; if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) { // The inputs and the result are widen to the same value. unsigned i; for (i=1; i < NumOperands; ++i) if (!N->getOperand(i).isUndef()) break; if (i == NumOperands) // Everything but the first operand is an UNDEF so just return the // widened first operand. return GetWidenedVector(N->getOperand(0)); if (NumOperands == 2) { // Replace concat of two operands with a shuffle. SmallVector MaskOps(WidenNumElts, -1); for (unsigned i = 0; i < NumInElts; ++i) { MaskOps[i] = i; MaskOps[i + NumInElts] = i + WidenNumElts; } return DAG.getVectorShuffle(WidenVT, dl, GetWidenedVector(N->getOperand(0)), GetWidenedVector(N->getOperand(1)), MaskOps); } } } // Fall back to use extracts and build vector. EVT EltVT = WidenVT.getVectorElementType(); SmallVector Ops(WidenNumElts); unsigned Idx = 0; for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); if (InputWidened) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) Ops[Idx++] = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) Ops[Idx] = UndefVal; return DAG.getBuildVector(WidenVT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { EVT VT = N->getValueType(0); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue InOp = N->getOperand(0); SDValue Idx = N->getOperand(1); SDLoc dl(N); if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); // Check if we can just return the input vector after widening. uint64_t IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && InVT == WidenVT) return InOp; // Check if we can extract from the vector. unsigned InNumElts = InVT.getVectorNumElements(); if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx); // We could try widening the input to the right length but for now, extract // the original elements, fill the rest with undefs and build a vector. 
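// Illustrative sketch (not part of the imported sources): the two-operand
// CONCAT_VECTORS case above is rewritten as a shuffle of the two already
// widened inputs.  The mask selects the first NumInElts lanes of operand 0,
// then the first NumInElts lanes of operand 1 (whose lanes start at index
// WidenNumElts in shuffle numbering), and leaves the tail undefined (-1).
// Stand-alone model with hypothetical names:
#include <vector>
static std::vector<int> concatAsShuffleMask(unsigned NumInElts,
                                            unsigned WidenNumElts) {
  std::vector<int> Mask(WidenNumElts, -1); // -1 marks an undef lane
  for (unsigned i = 0; i < NumInElts; ++i) {
    Mask[i] = (int)i;                              // lanes from operand 0
    Mask[i + NumInElts] = (int)(i + WidenNumElts); // lanes from operand 1
  }
  return Mask;
}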
SmallVector Ops(WidenNumElts); EVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); unsigned i; for (i=0; i < NumElts; ++i) Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(IdxVal + i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getBuildVector(WidenVT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), InOp.getValueType(), InOp, N->getOperand(1), N->getOperand(2)); } SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { LoadSDNode *LD = cast(N); ISD::LoadExtType ExtType = LD->getExtensionType(); SDValue Result; SmallVector LdChain; // Chain for the series of load if (ExtType != ISD::NON_EXTLOAD) Result = GenWidenVectorExtLoads(LdChain, LD, ExtType); else Result = GenWidenVectorLoads(LdChain, LD); // If we generate a single load, we can use that for the chain. Otherwise, // build a factor node to remember the multiple loads are independent and // chain to that. SDValue NewChain; if (LdChain.size() == 1) NewChain = LdChain[0]; else NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain); // Modified the chain - switch anything that used the old chain to use // the new one. ReplaceValueWith(SDValue(N, 1), NewChain); return Result; } SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); SDValue Mask = N->getMask(); EVT MaskVT = Mask.getValueType(); SDValue Src0 = GetWidenedVector(N->getSrc0()); ISD::LoadExtType ExtType = N->getExtensionType(); SDLoc dl(N); // The mask should be widened as well EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MaskVT.getVectorElementType(), WidenVT.getVectorNumElements()); Mask = ModifyToType(Mask, WideMaskVT, true); SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), ExtType, N->isExpandingLoad()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Mask = N->getMask(); EVT MaskVT = Mask.getValueType(); SDValue Src0 = GetWidenedVector(N->getValue()); SDValue Scale = N->getScale(); unsigned NumElts = WideVT.getVectorNumElements(); SDLoc dl(N); // The mask should be widened as well EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MaskVT.getVectorElementType(), WideVT.getVectorNumElements()); Mask = ModifyToType(Mask, WideMaskVT, true); // Widen the Index operand SDValue Index = N->getIndex(); EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), Index.getValueType().getScalarType(), NumElts); Index = ModifyToType(Index, WideIndexVT); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index, Scale }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), N->getMemoryVT(), dl, Ops, N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), WidenVT, N->getOperand(0)); } // Return true if this is a node that could have two SETCCs as operands. static inline bool isLogicalMaskOp(unsigned Opcode) { switch (Opcode) { case ISD::AND: case ISD::OR: case ISD::XOR: return true; } return false; } // This is used just for the assert in convertMask(). Check that this either // a SETCC or a previously handled SETCC by convertMask(). #ifndef NDEBUG static inline bool isSETCCorConvertedSETCC(SDValue N) { if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) N = N.getOperand(0); else if (N.getOpcode() == ISD::CONCAT_VECTORS) { for (unsigned i = 1; i < N->getNumOperands(); ++i) if (!N->getOperand(i)->isUndef()) return false; N = N.getOperand(0); } if (N.getOpcode() == ISD::TRUNCATE) N = N.getOperand(0); else if (N.getOpcode() == ISD::SIGN_EXTEND) N = N.getOperand(0); if (isLogicalMaskOp(N.getOpcode())) return isSETCCorConvertedSETCC(N.getOperand(0)) && isSETCCorConvertedSETCC(N.getOperand(1)); return (N.getOpcode() == ISD::SETCC || ISD::isBuildVectorOfConstantSDNodes(N.getNode())); } #endif // Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT // to ToMaskVT if needed with vector extension or truncation. SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT) { // Currently a SETCC or a AND/OR/XOR with two SETCCs are handled. // FIXME: This code seems to be too restrictive, we might consider // generalizing it or dropping it. assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument."); // Make a new Mask node, with a legal result VT. SmallVector Ops; for (unsigned i = 0, e = InMask->getNumOperands(); i < e; ++i) Ops.push_back(InMask->getOperand(i)); SDValue Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops); // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign // extend or truncate is needed. LLVMContext &Ctx = *DAG.getContext(); unsigned MaskScalarBits = MaskVT.getScalarSizeInBits(); unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits(); if (MaskScalarBits < ToMaskScalBits) { EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(), MaskVT.getVectorNumElements()); Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask); } else if (MaskScalarBits > ToMaskScalBits) { EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(), MaskVT.getVectorNumElements()); Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask); } assert(Mask->getValueType(0).getScalarSizeInBits() == ToMaskVT.getScalarSizeInBits() && "Mask should have the right element size by now."); // Adjust Mask to the right number of elements. 
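// Illustrative sketch (not part of the imported sources): convertMask only
// accepts a SETCC or a logical AND/OR/XOR of SETCCs, and the debug-only
// checker above verifies that shape after peeling the EXTRACT_SUBVECTOR,
// CONCAT_VECTORS, TRUNCATE and SIGN_EXTEND wrappers that earlier conversions
// may have added.  The core of that recursion, over a toy expression tree
// with hypothetical names:
struct ToyNode {
  enum Kind { SetCC, And, Or, Xor, Other } K = Other;
  const ToyNode *L = nullptr, *R = nullptr;
};
static bool isSetCCOrLogicOfSetCC(const ToyNode *N) {
  if (!N)
    return false;
  if (N->K == ToyNode::SetCC)
    return true;
  if (N->K == ToyNode::And || N->K == ToyNode::Or || N->K == ToyNode::Xor)
    return isSetCCOrLogicOfSetCC(N->L) && isSetCCOrLogicOfSetCC(N->R);
  return false;
}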
unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements(); if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) { MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy); Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask, ZeroIdx); } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) { unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls); EVT SubVT = Mask->getValueType(0); SmallVector SubOps(NumSubVecs, DAG.getUNDEF(SubVT)); SubOps[0] = Mask; Mask = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubOps); } assert((Mask->getValueType(0) == ToMaskVT) && "A mask of ToMaskVT should have been produced by now."); return Mask; } // Get the target mask VT, and widen if needed. EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) { assert(SetCC->getOpcode() == ISD::SETCC); LLVMContext &Ctx = *DAG.getContext(); EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType()); if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT); return MaskVT; } // This method tries to handle VSELECT and its mask by legalizing operands // (which may require widening) and if needed adjusting the mask vector type // to match that of the VSELECT. Without it, many cases end up with // scalarization of the SETCC, with many unnecessary instructions. SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) { LLVMContext &Ctx = *DAG.getContext(); SDValue Cond = N->getOperand(0); if (N->getOpcode() != ISD::VSELECT) return SDValue(); if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode())) return SDValue(); // If this is a splitted VSELECT that was previously already handled, do // nothing. EVT CondVT = Cond->getValueType(0); if (CondVT.getScalarSizeInBits() != 1) return SDValue(); EVT VSelVT = N->getValueType(0); // Only handle vector types which are a power of 2. if (!isPowerOf2_64(VSelVT.getSizeInBits())) return SDValue(); // Don't touch if this will be scalarized. EVT FinalVT = VSelVT; while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector) FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx); if (FinalVT.getVectorNumElements() == 1) return SDValue(); // If there is support for an i1 vector mask, don't touch. if (Cond.getOpcode() == ISD::SETCC) { EVT SetCCOpVT = Cond->getOperand(0).getValueType(); while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal) SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT); EVT SetCCResVT = getSetCCResultType(SetCCOpVT); if (SetCCResVT.getScalarSizeInBits() == 1) return SDValue(); } else if (CondVT.getScalarType() == MVT::i1) { // If there is support for an i1 vector mask (or only scalar i1 conditions), // don't touch. while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal) CondVT = TLI.getTypeToTransformTo(Ctx, CondVT); if (CondVT.getScalarType() == MVT::i1) return SDValue(); } // Get the VT and operands for VSELECT, and widen if needed. SDValue VSelOp1 = N->getOperand(1); SDValue VSelOp2 = N->getOperand(2); if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) { VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT); VSelOp1 = GetWidenedVector(VSelOp1); VSelOp2 = GetWidenedVector(VSelOp2); } // The mask of the VSELECT should have integer elements. 
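// Illustrative sketch (not part of the imported sources): convertMask above
// reconciles two independent mismatches between the mask it was given and the
// mask type the VSELECT wants.  Element *width* is fixed with a vector
// sign-extend or truncate; element *count* is fixed by extracting the low
// lanes or concatenating undef sub-vectors.  The decision table, modelled in
// plain C++ with hypothetical names:
enum class WidthFix { None, SignExtend, Truncate };
enum class CountFix { None, ExtractLowLanes, PadWithUndef };
static WidthFix pickWidthFix(unsigned FromBits, unsigned ToBits) {
  return FromBits < ToBits   ? WidthFix::SignExtend
         : FromBits > ToBits ? WidthFix::Truncate
                             : WidthFix::None;
}
static CountFix pickCountFix(unsigned FromElts, unsigned ToElts) {
  return FromElts > ToElts   ? CountFix::ExtractLowLanes
         : FromElts < ToElts ? CountFix::PadWithUndef
                             : CountFix::None;
}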
EVT ToMaskVT = VSelVT; if (!ToMaskVT.getScalarType().isInteger()) ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger(); SDValue Mask; if (Cond->getOpcode() == ISD::SETCC) { EVT MaskVT = getSETCCWidenedResultTy(Cond); Mask = convertMask(Cond, MaskVT, ToMaskVT); } else if (isLogicalMaskOp(Cond->getOpcode()) && Cond->getOperand(0).getOpcode() == ISD::SETCC && Cond->getOperand(1).getOpcode() == ISD::SETCC) { // Cond is (AND/OR/XOR (SETCC, SETCC)) SDValue SETCC0 = Cond->getOperand(0); SDValue SETCC1 = Cond->getOperand(1); EVT VT0 = getSETCCWidenedResultTy(SETCC0); EVT VT1 = getSETCCWidenedResultTy(SETCC1); unsigned ScalarBits0 = VT0.getScalarSizeInBits(); unsigned ScalarBits1 = VT1.getScalarSizeInBits(); unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits(); EVT MaskVT; // If the two SETCCs have different VTs, either extend/truncate one of // them to the other "towards" ToMaskVT, or truncate one and extend the // other to ToMaskVT. if (ScalarBits0 != ScalarBits1) { EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1); EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0); if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits()) MaskVT = WideVT; else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits()) MaskVT = NarrowVT; else MaskVT = ToMaskVT; } else // If the two SETCCs have the same VT, don't change it. MaskVT = VT0; // Make new SETCCs and logical nodes. SETCC0 = convertMask(SETCC0, VT0, MaskVT); SETCC1 = convertMask(SETCC1, VT1, MaskVT); Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1); // Convert the logical op for VSELECT if needed. Mask = convertMask(Cond, MaskVT, ToMaskVT); } else return SDValue(); return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2); } SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue Cond1 = N->getOperand(0); EVT CondVT = Cond1.getValueType(); if (CondVT.isVector()) { if (SDValue Res = WidenVSELECTAndMask(N)) return Res; EVT CondEltVT = CondVT.getVectorElementType(); EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenNumElts); if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector) Cond1 = GetWidenedVector(Cond1); // If we have to split the condition there is no point in widening the // select. This would result in an cycle of widening the select -> // widening the condition operand -> splitting the condition operand -> // splitting the select -> widening the select. Instead split this select // further and widen the resulting type. 
if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) { SDValue SplitSelect = SplitVecOp_VSELECT(N, 0); SDValue Res = ModifyToType(SplitSelect, WidenVT); return Res; } if (Cond1.getValueType() != CondWidenVT) Cond1 = ModifyToType(Cond1, CondWidenVT); } SDValue InOp1 = GetWidenedVector(N->getOperand(1)); SDValue InOp2 = GetWidenedVector(N->getOperand(2)); assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, Cond1, InOp1, InOp2); } SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) { SDValue InOp1 = GetWidenedVector(N->getOperand(2)); SDValue InOp2 = GetWidenedVector(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), InOp1.getValueType(), N->getOperand(0), N->getOperand(1), InOp1, InOp2, N->getOperand(4)); } SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getUNDEF(WidenVT); } SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) { EVT VT = N->getValueType(0); SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); unsigned NumElts = VT.getVectorNumElements(); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); // Adjust mask based on new input vector length. SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = N->getMaskElt(i); if (Idx < (int)NumElts) NewMask.push_back(Idx); else NewMask.push_back(Idx - NumElts + WidenNumElts); } for (unsigned i = NumElts; i != WidenNumElts; ++i) NewMask.push_back(-1); return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask); } SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operands must be vectors"); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SDValue InOp1 = N->getOperand(0); EVT InVT = InOp1.getValueType(); assert(InVT.isVector() && "can not widen non-vector type"); EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), WidenNumElts); // The input and output types often differ here, and it could be that while // we'd prefer to widen the result type, the input operands have been split. // In this case, we also need to split the result of this node as well. if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) { SDValue SplitVSetCC = SplitVecOp_VSETCC(N); SDValue Res = ModifyToType(SplitVSetCC, WidenVT); return Res; } InOp1 = GetWidenedVector(InOp1); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); // Assume that the input and output will be widen appropriately. If not, // we will have to unroll it at some point. assert(InOp1.getValueType() == WidenInVT && InOp2.getValueType() == WidenInVT && "Input not widened to expected type!"); (void)WidenInVT; return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT, InOp1, InOp2, N->getOperand(2)); } //===----------------------------------------------------------------------===// // Widen Vector Operand //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Widen node operand " << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); // See if the target wants to custom widen this node. 
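// Illustrative sketch (not part of the imported sources): when both shuffle
// operands are widened from NumElts to WidenNumElts lanes, mask entries that
// pointed into the second operand (indices >= NumElts) must be rebased past
// the first operand's new width, mirroring the loop above.  Stand-alone model
// with hypothetical names:
static int remapShuffleIndex(int Idx, unsigned NumElts, unsigned WidenNumElts) {
  if (Idx < 0)
    return -1;                                   // undef lanes stay undef
  if (Idx < (int)NumElts)
    return Idx;                                  // still inside operand 0
  return Idx - (int)NumElts + (int)WidenNumElts; // rebase into widened op 1
}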
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "WidenVectorOperand op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to widen this operator's operand!"); case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: Res = WidenVecOp_EXTEND(N); break; case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; } // If Res is null, the sub-method took care of registering the result. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue InOp = N->getOperand(0); assert(getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector && "Unexpected type action"); InOp = GetWidenedVector(InOp); assert(VT.getVectorNumElements() < InOp.getValueType().getVectorNumElements() && "Input wasn't widened!"); // We may need to further widen the operand until it has the same total // vector size as the result. EVT InVT = InOp.getValueType(); if (InVT.getSizeInBits() != VT.getSizeInBits()) { EVT InEltVT = InVT.getVectorElementType(); for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE; i < e; ++i) { EVT FixedVT = (MVT::SimpleValueType)i; EVT FixedEltVT = FixedVT.getVectorElementType(); if (TLI.isTypeLegal(FixedVT) && FixedVT.getSizeInBits() == VT.getSizeInBits() && FixedEltVT == InEltVT) { assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() && "Not enough elements in the fixed type for the operand!"); assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() && "We can't have the same type as we started with!"); if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements()) InOp = DAG.getNode( ISD::INSERT_SUBVECTOR, DL, FixedVT, DAG.getUNDEF(FixedVT), InOp, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); else InOp = DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); break; } } InVT = InOp.getValueType(); if (InVT.getSizeInBits() != VT.getSizeInBits()) // We couldn't find a legal vector type that was a widening of the input // and could be extended in-register to the result type, so we have to // scalarize. return WidenVecOp_Convert(N); } // Use special DAG nodes to represent the operation of extending the // low lanes. 
switch (N->getOpcode()) { default: llvm_unreachable("Extend legalization on extend operation!"); case ISD::ANY_EXTEND: return DAG.getAnyExtendVectorInReg(InOp, DL, VT); case ISD::SIGN_EXTEND: return DAG.getSignExtendVectorInReg(InOp, DL, VT); case ISD::ZERO_EXTEND: return DAG.getZeroExtendVectorInReg(InOp, DL, VT); } } SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) { // The result (and first input) is legal, but the second input is illegal. // We can't do much to fix that, so just unroll and let the extracts off of // the second input be widened as needed later. return DAG.UnrollVectorOp(N); } SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { // Since the result is legal and the input is illegal. EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); unsigned NumElts = VT.getVectorNumElements(); SDValue InOp = N->getOperand(0); assert(getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector && "Unexpected type action"); InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); unsigned Opcode = N->getOpcode(); // See if a widened result type would be legal, if so widen the node. EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, InVT.getVectorNumElements()); if (TLI.isTypeLegal(WideVT)) { SDValue Res = DAG.getNode(Opcode, dl, WideVT, InOp); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, DAG.getIntPtrConstant(0, dl)); } EVT InEltVT = InVT.getVectorElementType(); // Unroll the convert into some scalar code and create a nasty build vector. SmallVector Ops(NumElts); for (unsigned i=0; i < NumElts; ++i) Ops[i] = DAG.getNode( Opcode, dl, EltVT, DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { EVT VT = N->getValueType(0); SDValue InOp = GetWidenedVector(N->getOperand(0)); EVT InWidenVT = InOp.getValueType(); SDLoc dl(N); // Check if we can convert between two legal vector types and extract. unsigned InWidenSize = InWidenVT.getSizeInBits(); unsigned Size = VT.getSizeInBits(); // x86mmx is not an acceptable vector element type, so don't try. if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) { unsigned NewNumElts = InWidenSize / Size; EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts); if (TLI.isTypeLegal(NewVT)) { SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp); return DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp, DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } } return CreateStackStoreLoad(InOp, VT); } SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { // If the input vector is not legal, it is likely that we will not find a // legal vector of the same size. Replace the concatenate vector with a // nasty build vector. 
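// Illustrative sketch (not part of the imported sources): WidenVecOp_BITCAST
// above avoids the stack round-trip of CreateStackStoreLoad whenever the
// widened input's bit width is an exact multiple of the scalar result's, by
// reinterpreting the widened vector as a vector of result-sized chunks and
// extracting chunk 0.  The feasibility test, modelled stand-alone with
// hypothetical names:
static bool canBitcastAndExtract(unsigned InWidenSizeBits, unsigned ResultBits,
                                 bool ResultIsVector, bool ResultIsX86MMX) {
  // x86mmx is not a valid vector element type, so that case (and vector
  // results) always falls back to the stack-based path.
  return InWidenSizeBits % ResultBits == 0 && !ResultIsVector &&
         !ResultIsX86MMX;
}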
EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); unsigned NumElts = VT.getVectorNumElements(); SmallVector Ops(NumElts); EVT InVT = N->getOperand(0).getValueType(); unsigned NumInElts = InVT.getVectorNumElements(); unsigned Idx = 0; unsigned NumOperands = N->getNumOperands(); for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); assert(getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector && "Unexpected type action"); InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) Ops[Idx++] = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), InOp, N->getOperand(1)); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0), InOp, N->getOperand(1)); } SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { // We have to widen the value, but we want only to store the original // vector type. StoreSDNode *ST = cast(N); if (!ST->getMemoryVT().getScalarType().isByteSized()) return TLI.scalarizeVectorStore(ST, DAG); SmallVector StChain; if (ST->isTruncatingStore()) GenWidenVectorTruncStores(StChain, ST); else GenWidenVectorStores(StChain, ST); if (StChain.size() == 1) return StChain[0]; else return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); } SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { - assert(OpNo == 3 && "Can widen only data operand of mstore"); + assert((OpNo == 2 || OpNo == 3) && + "Can widen only data or mask operand of mstore"); MaskedStoreSDNode *MST = cast(N); SDValue Mask = MST->getMask(); EVT MaskVT = Mask.getValueType(); SDValue StVal = MST->getValue(); - // Widen the value - SDValue WideVal = GetWidenedVector(StVal); SDLoc dl(N); - // The mask should be widened as well. - EVT WideVT = WideVal.getValueType(); - EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), - MaskVT.getVectorElementType(), - WideVT.getVectorNumElements()); - Mask = ModifyToType(Mask, WideMaskVT, true); + if (OpNo == 3) { + // Widen the value + StVal = GetWidenedVector(StVal); + // The mask should be widened as well. 
+ EVT WideVT = StVal.getValueType(); + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), + MaskVT.getVectorElementType(), + WideVT.getVectorNumElements()); + Mask = ModifyToType(Mask, WideMaskVT, true); + } else { + EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), MaskVT); + Mask = ModifyToType(Mask, WideMaskVT, true); + + EVT ValueVT = StVal.getValueType(); + if (getTypeAction(ValueVT) == TargetLowering::TypeWidenVector) + StVal = GetWidenedVector(StVal); + else { + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), + ValueVT.getVectorElementType(), + WideMaskVT.getVectorNumElements()); + StVal = ModifyToType(StVal, WideVT); + } + } + assert(Mask.getValueType().getVectorNumElements() == - WideVal.getValueType().getVectorNumElements() && + StVal.getValueType().getVectorNumElements() && "Mask and data vectors should have the same number of elements"); - return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(), + return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(), Mask, MST->getMemoryVT(), MST->getMemOperand(), false, MST->isCompressingStore()); } SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { assert(OpNo == 1 && "Can widen only data operand of mscatter"); MaskedScatterSDNode *MSC = cast(N); SDValue DataOp = MSC->getValue(); SDValue Mask = MSC->getMask(); EVT MaskVT = Mask.getValueType(); SDValue Scale = MSC->getScale(); // Widen the value. SDValue WideVal = GetWidenedVector(DataOp); EVT WideVT = WideVal.getValueType(); unsigned NumElts = WideVT.getVectorNumElements(); SDLoc dl(N); // The mask should be widened as well. EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MaskVT.getVectorElementType(), NumElts); Mask = ModifyToType(Mask, WideMaskVT, true); // Widen index. SDValue Index = MSC->getIndex(); EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), Index.getValueType().getScalarType(), NumElts); Index = ModifyToType(Index, WideIndexVT); SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index, Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), dl, Ops, MSC->getMemOperand()); } SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue InOp0 = GetWidenedVector(N->getOperand(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(1)); SDLoc dl(N); EVT VT = N->getValueType(0); // WARNING: In this code we widen the compare instruction with garbage. // This garbage may contain denormal floats which may be slow. Is this a real // concern ? Should we zero the unused lanes if this is a float compare ? // Get a new SETCC node to compare the newly widened operands. // Only some of the compared elements are legal. EVT SVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), InOp0.getValueType()); // The result type is legal, if its vXi1, keep vXi1 for the new SETCC. if (VT.getScalarType() == MVT::i1) SVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, SVT.getVectorNumElements()); SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N), SVT, InOp0, InOp1, N->getOperand(2)); // Extract the needed results from the result vector. 
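// Illustrative sketch (not part of the imported sources): the masked-store
// change above keys off which operand forced the widening.  When the data
// operand did (OpNo == 3), the mask is rebuilt to match the widened value;
// when the mask operand did (OpNo == 2), the mask takes its own legal widened
// type and the value is stretched to that lane count.  Either way both
// operands must end up with the same number of lanes, which the assert
// checks.  A stand-alone model with hypothetical names:
static unsigned widenedMstoreLanes(unsigned OpNo, unsigned WidenedDataLanes,
                                   unsigned WidenedMaskLanes) {
  // OpNo == 3: the data operand drove the widening, so the mask follows it.
  // Otherwise (OpNo == 2): the mask drove it, so the data follows the mask.
  return OpNo == 3 ? WidenedDataLanes : WidenedMaskLanes;
}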
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), SVT.getVectorElementType(), VT.getVectorNumElements()); SDValue CC = DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); return PromoteTargetBoolean(CC, VT); } //===----------------------------------------------------------------------===// // Vector Widening Utilities //===----------------------------------------------------------------------===// // Utility function to find the type to chop up a widen vector for load/store // TLI: Target lowering used to determine legal types. // Width: Width left need to load/store. // WidenVT: The widen vector type to load to/store from // Align: If 0, don't allow use of a wider type // WidenEx: If Align is not 0, the amount additional we can load/store from. static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, unsigned Width, EVT WidenVT, unsigned Align = 0, unsigned WidenEx = 0) { EVT WidenEltVT = WidenVT.getVectorElementType(); unsigned WidenWidth = WidenVT.getSizeInBits(); unsigned WidenEltWidth = WidenEltVT.getSizeInBits(); unsigned AlignInBits = Align*8; // If we have one element to load/store, return it. EVT RetVT = WidenEltVT; if (Width == WidenEltWidth) return RetVT; // See if there is larger legal integer than the element type to load/store. unsigned VT; for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE; VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) { EVT MemVT((MVT::SimpleValueType) VT); unsigned MemVTWidth = MemVT.getSizeInBits(); if (MemVT.getSizeInBits() <= WidenEltWidth) break; auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT); if ((Action == TargetLowering::TypeLegal || Action == TargetLowering::TypePromoteInteger) && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { RetVT = MemVT; break; } } // See if there is a larger vector type to load/store that has the same vector // element type and is evenly divisible with the WidenVT. for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) { EVT MemVT = (MVT::SimpleValueType) VT; unsigned MemVTWidth = MemVT.getSizeInBits(); if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT) return MemVT; } } return RetVT; } // Builds a vector type from scalar loads // VecTy: Resulting Vector type // LDOps: Load operators to build a vector type // [Start,End) the list of loads to use. 
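// Illustrative sketch (not part of the imported sources): the load/store
// widening below repeatedly asks FindMemType for the largest piece that still
// fits the remaining width, emits one memory operation for it, and advances.
// A stand-alone model of that greedy chopping, assuming the candidate piece
// widths are simply the powers of two up to the widened width (the real
// routine also filters by type legality and alignment); names hypothetical:
#include <vector>
static std::vector<unsigned> chopWidth(unsigned RemainingBits,
                                       unsigned MaxPieceBits) {
  std::vector<unsigned> Pieces;
  unsigned Piece = MaxPieceBits;  // assumed to be a power of two >= 1
  while (RemainingBits > 0) {
    while (Piece > RemainingBits)
      Piece /= 2;                 // shrink to the largest piece that fits
    Pieces.push_back(Piece);
    RemainingBits -= Piece;
  }
  return Pieces;
}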
static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy, SmallVectorImpl &LdOps, unsigned Start, unsigned End) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(LdOps[Start]); EVT LdTy = LdOps[Start].getValueType(); unsigned Width = VecTy.getSizeInBits(); unsigned NumElts = Width / LdTy.getSizeInBits(); EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts); unsigned Idx = 1; SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]); for (unsigned i = Start + 1; i != End; ++i) { EVT NewLdTy = LdOps[i].getValueType(); if (NewLdTy != LdTy) { NumElts = Width / NewLdTy.getSizeInBits(); NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts); VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp); // Readjust position and vector position based on new load type. Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits(); LdTy = NewLdTy; } VecOp = DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i], DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp); } SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, LoadSDNode *LD) { // The strategy assumes that we can efficiently load power-of-two widths. // The routine chops the vector into the largest vector loads with the same // element type or scalar loads and then recombines it to the widen vector // type. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0)); unsigned WidenWidth = WidenVT.getSizeInBits(); EVT LdVT = LD->getMemoryVT(); SDLoc dl(LD); assert(LdVT.isVector() && WidenVT.isVector()); assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType()); // Load information SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); unsigned Align = LD->getAlignment(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); int LdWidth = LdVT.getSizeInBits(); int WidthDiff = WidenWidth - LdWidth; unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads. // Find the vector type that can load from. EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); int NewVTWidth = NewVT.getSizeInBits(); SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(), Align, MMOFlags, AAInfo); LdChain.push_back(LdOp.getValue(1)); // Check if we can load the element with one instruction. if (LdWidth <= NewVTWidth) { if (!NewVT.isVector()) { unsigned NumElts = WidenWidth / NewVTWidth; EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts); SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp); } if (NewVT == WidenVT) return LdOp; assert(WidenWidth % NewVTWidth == 0); unsigned NumConcat = WidenWidth / NewVTWidth; SmallVector ConcatOps(NumConcat); SDValue UndefVal = DAG.getUNDEF(NewVT); ConcatOps[0] = LdOp; for (unsigned i = 1; i != NumConcat; ++i) ConcatOps[i] = UndefVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps); } // Load vector by using multiple loads from largest vector to scalar. SmallVector LdOps; LdOps.push_back(LdOp); LdWidth -= NewVTWidth; unsigned Offset = 0; while (LdWidth > 0) { unsigned Increment = NewVTWidth / 8; Offset += Increment; BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment); SDValue L; if (LdWidth < NewVTWidth) { // The current type we are using is too large. Find a better size. 
NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); NewVTWidth = NewVT.getSizeInBits(); L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), MinAlign(Align, Increment), MMOFlags, AAInfo); LdChain.push_back(L.getValue(1)); if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) { // Later code assumes the vector loads produced will be mergeable, so we // must pad the final entry up to the previous width. Scalars are // combined separately. SmallVector Loads; Loads.push_back(L); unsigned size = L->getValueSizeInBits(0); while (size < LdOp->getValueSizeInBits(0)) { Loads.push_back(DAG.getUNDEF(L->getValueType(0))); size += L->getValueSizeInBits(0); } L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads); } } else { L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), MinAlign(Align, Increment), MMOFlags, AAInfo); LdChain.push_back(L.getValue(1)); } LdOps.push_back(L); LdOp = L; LdWidth -= NewVTWidth; } // Build the vector from the load operations. unsigned End = LdOps.size(); if (!LdOps[0].getValueType().isVector()) // All the loads are scalar loads. return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End); // If the load contains vectors, build the vector using concat vector. // All of the vectors used to load are power-of-2, and the scalar loads can be // combined to make a power-of-2 vector. SmallVector ConcatOps(End); int i = End - 1; int Idx = End; EVT LdTy = LdOps[i].getValueType(); // First, combine the scalar loads to a vector. if (!LdTy.isVector()) { for (--i; i >= 0; --i) { LdTy = LdOps[i].getValueType(); if (LdTy.isVector()) break; } ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End); } ConcatOps[--Idx] = LdOps[i]; for (--i; i >= 0; --i) { EVT NewLdTy = LdOps[i].getValueType(); if (NewLdTy != LdTy) { // Create a larger vector. ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, makeArrayRef(&ConcatOps[Idx], End - Idx)); Idx = End - 1; LdTy = NewLdTy; } ConcatOps[--Idx] = LdOps[i]; } if (WidenWidth == LdTy.getSizeInBits() * (End - Idx)) return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, makeArrayRef(&ConcatOps[Idx], End - Idx)); // We need to fill the rest with undefs to build the vector. unsigned NumOps = WidenWidth / LdTy.getSizeInBits(); SmallVector WidenOps(NumOps); SDValue UndefVal = DAG.getUNDEF(LdTy); { unsigned i = 0; for (; i != End-Idx; ++i) WidenOps[i] = ConcatOps[Idx+i]; for (; i != NumOps; ++i) WidenOps[i] = UndefVal; } return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps); } SDValue DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl &LdChain, LoadSDNode *LD, ISD::LoadExtType ExtType) { // For extension loads, it may not be more efficient to chop up the vector // and then extend it. Instead, we unroll the load and build a new vector. EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0)); EVT LdVT = LD->getMemoryVT(); SDLoc dl(LD); assert(LdVT.isVector() && WidenVT.isVector()); // Load information SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); unsigned Align = LD->getAlignment(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); EVT EltVT = WidenVT.getVectorElementType(); EVT LdEltVT = LdVT.getVectorElementType(); unsigned NumElts = LdVT.getVectorNumElements(); // Load each element and widen. 
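// Illustrative sketch (not part of the imported sources): when the final
// piece loaded above is a narrower vector than its predecessors, it is padded
// with UNDEF copies of its own type until it reaches the previous load's
// width, so the later CONCAT_VECTORS sees equally sized operands.  The number
// of UNDEF operands appended, modelled stand-alone (assumes LastLoadBits > 0);
// names hypothetical:
static unsigned undefPiecesNeeded(unsigned LastLoadBits, unsigned PrevLoadBits) {
  unsigned Pieces = 1;          // the real (narrow) load itself
  unsigned Bits = LastLoadBits;
  while (Bits < PrevLoadBits) { // mirrors the padding loop above
    ++Pieces;
    Bits += LastLoadBits;
  }
  return Pieces - 1;            // UNDEF operands added after the real load
}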
unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector Ops(WidenNumElts); unsigned Increment = LdEltVT.getSizeInBits() / 8; Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(), LdEltVT, Align, MMOFlags, AAInfo); LdChain.push_back(Ops[0].getValue(1)); unsigned i = 0, Offset = Increment; for (i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset); Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, LD->getPointerInfo().getWithOffset(Offset), LdEltVT, Align, MMOFlags, AAInfo); LdChain.push_back(Ops[i].getValue(1)); } // Fill the rest with undefs. SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i != WidenNumElts; ++i) Ops[i] = UndefVal; return DAG.getBuildVector(WidenVT, dl, Ops); } void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl &StChain, StoreSDNode *ST) { // The strategy assumes that we can efficiently store power-of-two widths. // The routine chops the vector into the largest vector stores with the same // element type or scalar stores. SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); unsigned Align = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue ValOp = GetWidenedVector(ST->getValue()); SDLoc dl(ST); EVT StVT = ST->getMemoryVT(); unsigned StWidth = StVT.getSizeInBits(); EVT ValVT = ValOp.getValueType(); unsigned ValWidth = ValVT.getSizeInBits(); EVT ValEltVT = ValVT.getVectorElementType(); unsigned ValEltWidth = ValEltVT.getSizeInBits(); assert(StVT.getVectorElementType() == ValEltVT); int Idx = 0; // current index to store unsigned Offset = 0; // offset from base to store while (StWidth != 0) { // Find the largest vector type we can store with. EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT); unsigned NewVTWidth = NewVT.getSizeInBits(); unsigned Increment = NewVTWidth / 8; if (NewVT.isVector()) { unsigned NumVTElts = NewVT.getVectorNumElements(); do { SDValue EOp = DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); StChain.push_back(DAG.getStore( Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset), MinAlign(Align, Offset), MMOFlags, AAInfo)); StWidth -= NewVTWidth; Offset += Increment; Idx += NumVTElts; BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment); } while (StWidth != 0 && StWidth >= NewVTWidth); } else { // Cast the vector to the scalar type we can store. unsigned NumElts = ValWidth / NewVTWidth; EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts); SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp); // Readjust index position based on new vector type. Idx = Idx * ValEltWidth / NewVTWidth; do { SDValue EOp = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp, DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); StChain.push_back(DAG.getStore( Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset), MinAlign(Align, Offset), MMOFlags, AAInfo)); StWidth -= NewVTWidth; Offset += Increment; BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment); } while (StWidth != 0 && StWidth >= NewVTWidth); // Restore index back to be relative to the original widen element type. Idx = Idx * NewVTWidth / ValEltWidth; } } } void DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl &StChain, StoreSDNode *ST) { // For extension loads, it may not be more efficient to truncate the vector // and then store it. 
Instead, we extract each element and then store it. SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); unsigned Align = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue ValOp = GetWidenedVector(ST->getValue()); SDLoc dl(ST); EVT StVT = ST->getMemoryVT(); EVT ValVT = ValOp.getValueType(); // It must be true that the wide vector type is bigger than where we need to // store. assert(StVT.isVector() && ValOp.getValueType().isVector()); assert(StVT.bitsLT(ValOp.getValueType())); // For truncating stores, we can not play the tricks of chopping legal vector // types and bitcast it to the right type. Instead, we unroll the store. EVT StEltVT = StVT.getVectorElementType(); EVT ValEltVT = ValVT.getVectorElementType(); unsigned Increment = ValEltVT.getSizeInBits() / 8; unsigned NumElts = StVT.getVectorNumElements(); SDValue EOp = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp, DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo(), StEltVT, Align, MMOFlags, AAInfo)); unsigned Offset = Increment; for (unsigned i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset); SDValue EOp = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp, DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); StChain.push_back(DAG.getTruncStore( Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset), StEltVT, MinAlign(Align, Offset), MMOFlags, AAInfo)); } } /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. /// FillWithZeroes specifies that the vector should be widened with zeroes. SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes) { // Note that InOp might have been widened so it might already have // the right width or it might need be narrowed. EVT InVT = InOp.getValueType(); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); SDLoc dl(InOp); // Check if InOp already has the right width. if (InVT == NVT) return InOp; unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) { unsigned NumConcat = WidenNumElts / InNumElts; SmallVector Ops(NumConcat); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) : DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = FillVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops); } if (WidenNumElts < InNumElts && InNumElts % WidenNumElts) return DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp, DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); // Fall back to extract and build. SmallVector Ops(WidenNumElts); EVT EltVT = NVT.getVectorElementType(); unsigned MinNumElts = std::min(WidenNumElts, InNumElts); unsigned Idx; for (Idx = 0; Idx < MinNumElts; ++Idx) Ops[Idx] = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
  for ( ; Idx < WidenNumElts; ++Idx)
    Ops[Idx] = FillVal;
  return DAG.getBuildVector(NVT, dl, Ops);
}
Index: vendor/llvm/dist-release_70/lib/MC/MCParser/ELFAsmParser.cpp
===================================================================
--- vendor/llvm/dist-release_70/lib/MC/MCParser/ELFAsmParser.cpp (revision 337630)
+++ vendor/llvm/dist-release_70/lib/MC/MCParser/ELFAsmParser.cpp (revision 337631)
@@ -1,913 +1,893 @@
//===- ELFAsmParser.cpp - ELF Assembly Parser -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

namespace {

class ELFAsmParser : public MCAsmParserExtension {
  template<bool (ELFAsmParser::*HandlerMethod)(StringRef, SMLoc)>
  void addDirectiveHandler(StringRef Directive) {
    MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair(
        this, HandleDirective<ELFAsmParser, HandlerMethod>);

    getParser().addDirectiveHandler(Directive, Handler);
  }

  bool ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags,
                          SectionKind Kind);

public:
  ELFAsmParser() { BracketExpressionsSupported = true; }

  void Initialize(MCAsmParser &Parser) override {
    // Call the base implementation.
this->MCAsmParserExtension::Initialize(Parser); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveData>(".data"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveText>(".text"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveBSS>(".bss"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss"); addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel"); addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro"); addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); addDirectiveHandler< &ELFAsmParser::ParseDirectivePushSection>(".pushsection"); addDirectiveHandler<&ELFAsmParser::ParseDirectivePopSection>(".popsection"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size"); addDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local"); addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".protected"); addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".internal"); addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".hidden"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSubsection>(".subsection"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveCGProfile>(".cg_profile"); } // FIXME: Part of this logic is duplicated in the MCELFStreamer. What is // the best way for us to get access to it? 
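// Illustrative sketch (not part of the imported sources): Initialize() above
// registers one handler per directive by pairing `this` with a member
// function pointer carried as a template argument, so dispatch later is a
// table lookup plus an indirect member call.  A minimal stand-alone model of
// the same pattern using only the standard library; all names below are
// hypothetical.
#include <map>
#include <string>
struct MiniDirectiveParser {
  using Handler = bool (MiniDirectiveParser::*)(const std::string &);
  std::map<std::string, Handler> Handlers;
  template <Handler H> void addDirectiveHandler(const std::string &Dir) {
    Handlers[Dir] = H; // record which member function services this directive
  }
  bool handleData(const std::string &) { return false; } // stand-in handler
  bool dispatch(const std::string &Dir) {
    auto It = Handlers.find(Dir);
    return It != Handlers.end() && (this->*It->second)(Dir);
  }
};
// Usage: MiniDirectiveParser P;
//        P.addDirectiveHandler<&MiniDirectiveParser::handleData>(".data");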
bool ParseSectionDirectiveData(StringRef, SMLoc) { return ParseSectionSwitch(".data", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getData()); } bool ParseSectionDirectiveText(StringRef, SMLoc) { return ParseSectionSwitch(".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, SectionKind::getText()); } bool ParseSectionDirectiveBSS(StringRef, SMLoc) { return ParseSectionSwitch(".bss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getBSS()); } bool ParseSectionDirectiveRoData(StringRef, SMLoc) { return ParseSectionSwitch(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC, SectionKind::getReadOnly()); } bool ParseSectionDirectiveTData(StringRef, SMLoc) { return ParseSectionSwitch(".tdata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE, SectionKind::getThreadData()); } bool ParseSectionDirectiveTBSS(StringRef, SMLoc) { return ParseSectionSwitch(".tbss", ELF::SHT_NOBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE, SectionKind::getThreadBSS()); } bool ParseSectionDirectiveDataRel(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE, SectionKind::getData()); } bool ParseSectionDirectiveDataRelRo(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel.ro", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE, SectionKind::getReadOnlyWithRel()); } bool ParseSectionDirectiveEhFrame(StringRef, SMLoc) { return ParseSectionSwitch(".eh_frame", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE, SectionKind::getData()); } bool ParseDirectivePushSection(StringRef, SMLoc); bool ParseDirectivePopSection(StringRef, SMLoc); bool ParseDirectiveSection(StringRef, SMLoc); bool ParseDirectiveSize(StringRef, SMLoc); bool ParseDirectivePrevious(StringRef, SMLoc); bool ParseDirectiveType(StringRef, SMLoc); bool ParseDirectiveIdent(StringRef, SMLoc); bool ParseDirectiveSymver(StringRef, SMLoc); bool ParseDirectiveVersion(StringRef, SMLoc); bool ParseDirectiveWeakref(StringRef, SMLoc); bool ParseDirectiveSymbolAttribute(StringRef, SMLoc); bool ParseDirectiveSubsection(StringRef, SMLoc); bool ParseDirectiveCGProfile(StringRef, SMLoc); private: bool ParseSectionName(StringRef &SectionName); bool ParseSectionArguments(bool IsPush, SMLoc loc); unsigned parseSunStyleSectionFlags(); bool maybeParseSectionType(StringRef &TypeName); bool parseMergeSize(int64_t &Size); bool parseGroup(StringRef &GroupName); bool parseMetadataSym(MCSymbolELF *&Associated); bool maybeParseUniqueID(int64_t &UniqueID); }; } // end anonymous namespace /// ParseDirectiveSymbolAttribute /// ::= { ".local", ".weak", ... 
} [ identifier ( , identifier )* ]
bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
  MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Directive)
    .Case(".weak", MCSA_Weak)
    .Case(".local", MCSA_Local)
    .Case(".hidden", MCSA_Hidden)
    .Case(".internal", MCSA_Internal)
    .Case(".protected", MCSA_Protected)
    .Default(MCSA_Invalid);
  assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
  if (getLexer().isNot(AsmToken::EndOfStatement)) {
    while (true) {
      StringRef Name;

      if (getParser().parseIdentifier(Name))
        return TokError("expected identifier in directive");

      MCSymbol *Sym = getContext().getOrCreateSymbol(Name);

      getStreamer().EmitSymbolAttribute(Sym, Attr);

      if (getLexer().is(AsmToken::EndOfStatement))
        break;

      if (getLexer().isNot(AsmToken::Comma))
        return TokError("unexpected token in directive");
      Lex();
    }
  }

  Lex();
  return false;
}

bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
                                      unsigned Flags, SectionKind Kind) {
  const MCExpr *Subsection = nullptr;
  if (getLexer().isNot(AsmToken::EndOfStatement)) {
    if (getParser().parseExpression(Subsection))
      return true;
  }
  Lex();

  getStreamer().SwitchSection(getContext().getELFSection(Section, Type, Flags),
                              Subsection);

  return false;
}

bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) {
  StringRef Name;
  if (getParser().parseIdentifier(Name))
    return TokError("expected identifier in directive");
  MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));

  if (getLexer().isNot(AsmToken::Comma))
    return TokError("unexpected token in directive");
  Lex();

  const MCExpr *Expr;
  if (getParser().parseExpression(Expr))
    return true;

  if (getLexer().isNot(AsmToken::EndOfStatement))
    return TokError("unexpected token in directive");
  Lex();

  getStreamer().emitELFSize(Sym, Expr);
  return false;
}

bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
  // A section name can contain -, so we cannot just use
  // parseIdentifier.
  SMLoc FirstLoc = getLexer().getLoc();
  unsigned Size = 0;

  if (getLexer().is(AsmToken::String)) {
    SectionName = getTok().getIdentifier();
    Lex();
    return false;
  }

  while (!getParser().hasPendingError()) {
    SMLoc PrevLoc = getLexer().getLoc();
    if (getLexer().is(AsmToken::Comma) ||
        getLexer().is(AsmToken::EndOfStatement))
      break;

    unsigned CurSize;
    if (getLexer().is(AsmToken::String)) {
      CurSize = getTok().getIdentifier().size() + 2;
      Lex();
    } else if (getLexer().is(AsmToken::Identifier)) {
      CurSize = getTok().getIdentifier().size();
      Lex();
    } else {
      CurSize = getTok().getString().size();
      Lex();
    }
    Size += CurSize;
    SectionName = StringRef(FirstLoc.getPointer(), Size);

    // Make sure the following token is adjacent.
if (PrevLoc.getPointer() + CurSize != getTok().getLoc().getPointer()) break; } if (Size == 0) return true; return false; } static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) { unsigned flags = 0; // If a valid numerical value is set for the section flag, use it verbatim if (!flagsStr.getAsInteger(0, flags)) return flags; for (char i : flagsStr) { switch (i) { case 'a': flags |= ELF::SHF_ALLOC; break; case 'e': flags |= ELF::SHF_EXCLUDE; break; case 'x': flags |= ELF::SHF_EXECINSTR; break; case 'w': flags |= ELF::SHF_WRITE; break; case 'o': flags |= ELF::SHF_LINK_ORDER; break; case 'M': flags |= ELF::SHF_MERGE; break; case 'S': flags |= ELF::SHF_STRINGS; break; case 'T': flags |= ELF::SHF_TLS; break; case 'c': flags |= ELF::XCORE_SHF_CP_SECTION; break; case 'd': flags |= ELF::XCORE_SHF_DP_SECTION; break; case 'y': flags |= ELF::SHF_ARM_PURECODE; break; case 'G': flags |= ELF::SHF_GROUP; break; case '?': *UseLastGroup = true; break; default: return -1U; } } return flags; } unsigned ELFAsmParser::parseSunStyleSectionFlags() { unsigned flags = 0; while (getLexer().is(AsmToken::Hash)) { Lex(); // Eat the #. if (!getLexer().is(AsmToken::Identifier)) return -1U; StringRef flagId = getTok().getIdentifier(); if (flagId == "alloc") flags |= ELF::SHF_ALLOC; else if (flagId == "execinstr") flags |= ELF::SHF_EXECINSTR; else if (flagId == "write") flags |= ELF::SHF_WRITE; else if (flagId == "tls") flags |= ELF::SHF_TLS; else return -1U; Lex(); // Eat the flag. if (!getLexer().is(AsmToken::Comma)) break; Lex(); // Eat the comma. } return flags; } bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) { getStreamer().PushSection(); if (ParseSectionArguments(/*IsPush=*/true, loc)) { getStreamer().PopSection(); return true; } return false; } bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) { if (!getStreamer().PopSection()) return TokError(".popsection without corresponding .pushsection"); return false; } bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) { return ParseSectionArguments(/*IsPush=*/false, loc); } bool ELFAsmParser::maybeParseSectionType(StringRef &TypeName) { MCAsmLexer &L = getLexer(); if (L.isNot(AsmToken::Comma)) return false; Lex(); if (L.isNot(AsmToken::At) && L.isNot(AsmToken::Percent) && L.isNot(AsmToken::String)) { if (L.getAllowAtInIdentifier()) return TokError("expected '@', '%' or \"\""); else return TokError("expected '%' or \"\""); } if (!L.is(AsmToken::String)) Lex(); if (L.is(AsmToken::Integer)) { TypeName = getTok().getString(); Lex(); } else if (getParser().parseIdentifier(TypeName)) return TokError("expected identifier in directive"); return false; } bool ELFAsmParser::parseMergeSize(int64_t &Size) { if (getLexer().isNot(AsmToken::Comma)) return TokError("expected the entry size"); Lex(); if (getParser().parseAbsoluteExpression(Size)) return true; if (Size <= 0) return TokError("entry size must be positive"); return false; } bool ELFAsmParser::parseGroup(StringRef &GroupName) { MCAsmLexer &L = getLexer(); if (L.isNot(AsmToken::Comma)) return TokError("expected group name"); Lex(); if (L.is(AsmToken::Integer)) { GroupName = getTok().getString(); Lex(); } else if (getParser().parseIdentifier(GroupName)) { return TokError("invalid group name"); } if (L.is(AsmToken::Comma)) { Lex(); StringRef Linkage; if (getParser().parseIdentifier(Linkage)) return TokError("invalid linkage"); if (Linkage != "comdat") return TokError("Linkage must be 'comdat'"); } return false; } bool ELFAsmParser::parseMetadataSym(MCSymbolELF 
*&Associated) {
  MCAsmLexer &L = getLexer();
  if (L.isNot(AsmToken::Comma))
    return TokError("expected metadata symbol");

  Lex();
  StringRef Name;
  if (getParser().parseIdentifier(Name))
    return TokError("invalid metadata symbol");

  Associated = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
  if (!Associated || !Associated->isInSection())
    return TokError("symbol is not in a section: " + Name);

  return false;
}

bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) {
  MCAsmLexer &L = getLexer();
  if (L.isNot(AsmToken::Comma))
    return false;
  Lex();
  StringRef UniqueStr;
  if (getParser().parseIdentifier(UniqueStr))
    return TokError("expected identifier in directive");
  if (UniqueStr != "unique")
    return TokError("expected 'unique'");
  if (L.isNot(AsmToken::Comma))
    return TokError("expected commma");
  Lex();
  if (getParser().parseAbsoluteExpression(UniqueID))
    return true;
  if (UniqueID < 0)
    return TokError("unique id must be positive");
  if (!isUInt<32>(UniqueID) || UniqueID == ~0U)
    return TokError("unique id is too large");
  return false;
}

static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
  return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
}

-// Return a set of section flags based on the section name that can then
-// be augmented later, otherwise return 0 if we don't have any reasonable
-// defaults.
-static unsigned defaultSectionFlags(StringRef SectionName) {
-
-  if (hasPrefix(SectionName, ".rodata.cst"))
-    return ELF::SHF_ALLOC | ELF::SHF_MERGE;
-
-  if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
-    return ELF::SHF_ALLOC;
-
-  if (SectionName == ".fini" || SectionName == ".init" ||
-      hasPrefix(SectionName, ".text."))
-    return ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
-
-  if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
-      hasPrefix(SectionName, ".bss.") ||
-      hasPrefix(SectionName, ".init_array.") ||
-      hasPrefix(SectionName, ".fini_array.") ||
-      hasPrefix(SectionName, ".preinit_array."))
-    return ELF::SHF_ALLOC | ELF::SHF_WRITE;
-
-  if (hasPrefix(SectionName, ".tdata.") || hasPrefix(SectionName, ".tbss."))
-    return ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
-
-  return 0;
-}
-
bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
  StringRef SectionName;

  if (ParseSectionName(SectionName))
    return TokError("expected identifier in directive");

  StringRef TypeName;
  int64_t Size = 0;
  StringRef GroupName;
+  unsigned Flags = 0;
  const MCExpr *Subsection = nullptr;
  bool UseLastGroup = false;
  MCSymbolELF *Associated = nullptr;
  int64_t UniqueID = ~0;

-  // Set the default section flags first in case no others are given.
-  unsigned Flags = defaultSectionFlags(SectionName);
+  // Set the defaults first.
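+  // Illustrative examples (comment sketch only, assuming typical inputs):
+  // with these defaults a bare ".section .text.unlikely" starts out as
+  // SHF_ALLOC|SHF_EXECINSTR and ".section .tbss.foo" as
+  // SHF_ALLOC|SHF_WRITE|SHF_TLS; any explicit flag string such as "aw" is
+  // then OR-ed into Flags further down.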
+ if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1") + Flags |= ELF::SHF_ALLOC; + else if (SectionName == ".fini" || SectionName == ".init" || + hasPrefix(SectionName, ".text.")) + Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; + else if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" || + hasPrefix(SectionName, ".bss.") || + hasPrefix(SectionName, ".init_array.") || + hasPrefix(SectionName, ".fini_array.") || + hasPrefix(SectionName, ".preinit_array.")) + Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE; + else if (hasPrefix(SectionName, ".tdata.") || + hasPrefix(SectionName, ".tbss.")) + Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS; if (getLexer().is(AsmToken::Comma)) { Lex(); if (IsPush && getLexer().isNot(AsmToken::String)) { if (getParser().parseExpression(Subsection)) return true; if (getLexer().isNot(AsmToken::Comma)) goto EndStmt; Lex(); } unsigned extraFlags; if (getLexer().isNot(AsmToken::String)) { if (!getContext().getAsmInfo()->usesSunStyleELFSectionSwitchSyntax() || getLexer().isNot(AsmToken::Hash)) return TokError("expected string in directive"); extraFlags = parseSunStyleSectionFlags(); } else { StringRef FlagsStr = getTok().getStringContents(); Lex(); extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup); } if (extraFlags == -1U) return TokError("unknown flag"); - - // If we found additional section flags on a known section then give a - // warning. - if (Flags && Flags != extraFlags) - Warning(loc, "setting incorrect section attributes for " + SectionName); - Flags |= extraFlags; bool Mergeable = Flags & ELF::SHF_MERGE; bool Group = Flags & ELF::SHF_GROUP; if (Group && UseLastGroup) return TokError("Section cannot specifiy a group name while also acting " "as a member of the last group"); if (maybeParseSectionType(TypeName)) return true; MCAsmLexer &L = getLexer(); if (TypeName.empty()) { if (Mergeable) return TokError("Mergeable section must specify the type"); if (Group) return TokError("Group section must specify the type"); if (L.isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); } if (Mergeable) if (parseMergeSize(Size)) return true; if (Group) if (parseGroup(GroupName)) return true; if (Flags & ELF::SHF_LINK_ORDER) if (parseMetadataSym(Associated)) return true; if (maybeParseUniqueID(UniqueID)) return true; } EndStmt: if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); Lex(); unsigned Type = ELF::SHT_PROGBITS; if (TypeName.empty()) { if (SectionName.startswith(".note")) Type = ELF::SHT_NOTE; else if (hasPrefix(SectionName, ".init_array.")) Type = ELF::SHT_INIT_ARRAY; else if (hasPrefix(SectionName, ".bss.")) Type = ELF::SHT_NOBITS; else if (hasPrefix(SectionName, ".tbss.")) Type = ELF::SHT_NOBITS; else if (hasPrefix(SectionName, ".fini_array.")) Type = ELF::SHT_FINI_ARRAY; else if (hasPrefix(SectionName, ".preinit_array.")) Type = ELF::SHT_PREINIT_ARRAY; } else { if (TypeName == "init_array") Type = ELF::SHT_INIT_ARRAY; else if (TypeName == "fini_array") Type = ELF::SHT_FINI_ARRAY; else if (TypeName == "preinit_array") Type = ELF::SHT_PREINIT_ARRAY; else if (TypeName == "nobits") Type = ELF::SHT_NOBITS; else if (TypeName == "progbits") Type = ELF::SHT_PROGBITS; else if (TypeName == "note") Type = ELF::SHT_NOTE; else if (TypeName == "unwind") Type = ELF::SHT_X86_64_UNWIND; else if (TypeName == "llvm_odrtab") Type = ELF::SHT_LLVM_ODRTAB; else if (TypeName == "llvm_linker_options") Type = ELF::SHT_LLVM_LINKER_OPTIONS; else if (TypeName == 
"llvm_call_graph_profile") Type = ELF::SHT_LLVM_CALL_GRAPH_PROFILE; else if (TypeName.getAsInteger(0, Type)) return TokError("unknown section type"); } if (UseLastGroup) { MCSectionSubPair CurrentSection = getStreamer().getCurrentSection(); if (const MCSectionELF *Section = cast_or_null(CurrentSection.first)) if (const MCSymbol *Group = Section->getGroup()) { GroupName = Group->getName(); Flags |= ELF::SHF_GROUP; } } MCSection *ELFSection = getContext().getELFSection(SectionName, Type, Flags, Size, GroupName, UniqueID, Associated); getStreamer().SwitchSection(ELFSection, Subsection); if (getContext().getGenDwarfForAssembly()) { bool InsertResult = getContext().addGenDwarfSection(ELFSection); if (InsertResult) { if (getContext().getDwarfVersion() <= 2) Warning(loc, "DWARF2 only supports one section per compilation unit"); if (!ELFSection->getBeginSymbol()) { MCSymbol *SectionStartSymbol = getContext().createTempSymbol(); getStreamer().EmitLabel(SectionStartSymbol); ELFSection->setBeginSymbol(SectionStartSymbol); } } } return false; } bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { MCSectionSubPair PreviousSection = getStreamer().getPreviousSection(); if (PreviousSection.first == nullptr) return TokError(".previous without corresponding .section"); getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second); return false; } static MCSymbolAttr MCAttrForString(StringRef Type) { return StringSwitch(Type) .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction) .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject) .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS) .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon) .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType) .Cases("STT_GNU_IFUNC", "gnu_indirect_function", MCSA_ELF_TypeIndFunction) .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) .Default(MCSA_Invalid); } /// ParseDirectiveELFType /// ::= .type identifier , STT_ /// ::= .type identifier , #attribute /// ::= .type identifier , @attribute /// ::= .type identifier , %attribute /// ::= .type identifier , "attribute" bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); // NOTE the comma is optional in all cases. It is only documented as being // optional in the first case, however, GAS will silently treat the comma as // optional in all cases. Furthermore, although the documentation states that // the first form only accepts STT_, in reality, GAS // accepts both the upper case name as well as the lower case aliases. 
if (getLexer().is(AsmToken::Comma)) Lex(); if (getLexer().isNot(AsmToken::Identifier) && getLexer().isNot(AsmToken::Hash) && getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::String)) { if (!getLexer().getAllowAtInIdentifier()) return TokError("expected STT_, '#', " "'%' or \"\""); else if (getLexer().isNot(AsmToken::At)) return TokError("expected STT_, '#', '@', " "'%' or \"\""); } if (getLexer().isNot(AsmToken::String) && getLexer().isNot(AsmToken::Identifier)) Lex(); SMLoc TypeLoc = getLexer().getLoc(); StringRef Type; if (getParser().parseIdentifier(Type)) return TokError("expected symbol type in directive"); MCSymbolAttr Attr = MCAttrForString(Type); if (Attr == MCSA_Invalid) return Error(TypeLoc, "unsupported attribute in '.type' directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.type' directive"); Lex(); getStreamer().EmitSymbolAttribute(Sym, Attr); return false; } /// ParseDirectiveIdent /// ::= .ident string bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::String)) return TokError("unexpected token in '.ident' directive"); StringRef Data = getTok().getIdentifier(); Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.ident' directive"); Lex(); getStreamer().EmitIdent(Data); return false; } /// ParseDirectiveSymver /// ::= .symver foo, bar2@zed bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected a comma"); // ARM assembly uses @ for a comment... // except when parsing the second parameter of the .symver directive. // Force the next symbol to allow @ in the identifier, which is // required for this directive and then reset it to its initial state. const bool AllowAtInIdentifier = getLexer().getAllowAtInIdentifier(); getLexer().setAllowAtInIdentifier(true); Lex(); getLexer().setAllowAtInIdentifier(AllowAtInIdentifier); StringRef AliasName; if (getParser().parseIdentifier(AliasName)) return TokError("expected identifier in directive"); if (AliasName.find('@') == StringRef::npos) return TokError("expected a '@' in the name"); MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitELFSymverDirective(AliasName, Sym); return false; } /// ParseDirectiveVersion /// ::= .version string bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::String)) return TokError("unexpected token in '.version' directive"); StringRef Data = getTok().getIdentifier(); Lex(); MCSection *Note = getContext().getELFSection(".note", ELF::SHT_NOTE, 0); getStreamer().PushSection(); getStreamer().SwitchSection(Note); getStreamer().EmitIntValue(Data.size()+1, 4); // namesz. getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description). getStreamer().EmitIntValue(1, 4); // type = NT_VERSION. getStreamer().EmitBytes(Data); // name. getStreamer().EmitIntValue(0, 1); // terminate the string. getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment. getStreamer().PopSection(); return false; } /// ParseDirectiveWeakref /// ::= .weakref foo, bar bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { // FIXME: Share code with the other alias building directives. 
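  // Illustrative usage (hypothetical input): ".weakref bar, foo" creates the
  // alias "bar" and emits a weak reference from it to "foo" via
  // EmitWeakReference() below.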
StringRef AliasName; if (getParser().parseIdentifier(AliasName)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected a comma"); Lex(); StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Alias = getContext().getOrCreateSymbol(AliasName); MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().EmitWeakReference(Alias, Sym); return false; } bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) { const MCExpr *Subsection = nullptr; if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getParser().parseExpression(Subsection)) return true; } if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); Lex(); getStreamer().SubSection(Subsection); return false; } /// ParseDirectiveCGProfile /// ::= .cg_profile identifier, identifier, bool ELFAsmParser::ParseDirectiveCGProfile(StringRef, SMLoc) { StringRef From; SMLoc FromLoc = getLexer().getLoc(); if (getParser().parseIdentifier(From)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected a comma"); Lex(); StringRef To; SMLoc ToLoc = getLexer().getLoc(); if (getParser().parseIdentifier(To)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected a comma"); Lex(); int64_t Count; if (getParser().parseIntToken( Count, "expected integer count in '.cg_profile' directive")) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); MCSymbol *FromSym = getContext().getOrCreateSymbol(From); MCSymbol *ToSym = getContext().getOrCreateSymbol(To); getStreamer().emitCGProfileEntry( MCSymbolRefExpr::create(FromSym, MCSymbolRefExpr::VK_None, getContext(), FromLoc), MCSymbolRefExpr::create(ToSym, MCSymbolRefExpr::VK_None, getContext(), ToLoc), Count); return false; } namespace llvm { MCAsmParserExtension *createELFAsmParser() { return new ELFAsmParser; } } // end namespace llvm Index: vendor/llvm/dist-release_70/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- vendor/llvm/dist-release_70/lib/Target/AMDGPU/AMDGPU.td (revision 337630) +++ vendor/llvm/dist-release_70/lib/Target/AMDGPU/AMDGPU.td (revision 337631) @@ -1,746 +1,737 @@ //===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===------------------------------------------------------------===// include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" include "AMDGPUFeatures.td" //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "FastFMAF32", "true", "Assuming f32 fma is at least as fast as mul + add" >; def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128", "MIMG_R128", "true", "Support 128-bit texture resources" >; def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", "HalfRate64Ops", "true", "Most fp64 instructions are half rate instead of quarter" >; def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", "Support flat address space" >; def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", "FlatInstOffsets", "true", "Flat instructions have immediate offset addressing mode" >; def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", "FlatGlobalInsts", "true", "Have global_* flat memory instructions" >; def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "FlatScratchInsts", "true", "Have scratch_* flat memory instructions" >; def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", "Have VALU add/sub instructions without carry out" >; def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", "Support unaligned global loads and stores" >; def FeatureTrapHandler: SubtargetFeature<"trap-handler", "TrapHandler", "true", "Trap handler support" >; def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", "UnalignedScratchAccess", "true", "Support unaligned scratch loads and stores" >; def FeatureApertureRegs : SubtargetFeature<"aperture-regs", "HasApertureRegs", "true", "Has Memory Aperture Base and Size Registers" >; def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "HasMadMixInsts", "true", "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" >; def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "HasFmaMixInsts", "true", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. The current default kernel driver setting is: // - graphics ring: XNACK disabled // - compute ring: XNACK enabled // // If XNACK is enabled, the VMEM latency can be worse. // If XNACK is disabled, the 2 SGPRs can be used for general purposes. def FeatureXNACK : SubtargetFeature<"xnack", "EnableXNACK", "true", "Enable XNACK support" >; def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", !cast(Value), "The number of LDS banks per compute unit." 
>; def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", "GCN3Encoding", "true", "Encoding format for VI" >; def FeatureCIInsts : SubtargetFeature<"ci-insts", "CIInsts", "true", "Additional instructions for CI+" >; def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "GFX9Insts", "true", "Additional instructions for GFX9+" >; def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", "HasSMemRealTime", "true", "Has s_memrealtime instruction" >; def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm", "HasInv2PiInlineImm", "true", "Has 1 / (2 * pi) as inline immediate" >; def Feature16BitInsts : SubtargetFeature<"16-bit-insts", "Has16BitInsts", "true", "Has i16/f16 instructions" >; def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", "Has VOP3P packed instructions" >; def FeatureMovrel : SubtargetFeature<"movrel", "HasMovrel", "true", "Has v_movrel*_b32 instructions" >; def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode", "HasVGPRIndexMode", "true", "Has VGPR mode register indexing" >; def FeatureScalarStores : SubtargetFeature<"scalar-stores", "HasScalarStores", "true", "Has store scalar memory instructions" >; def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics", "HasScalarAtomics", "true", "Has atomic scalar memory instructions" >; def FeatureSDWA : SubtargetFeature<"sdwa", "HasSDWA", "true", "Support SDWA (Sub-DWORD Addressing) extension" >; def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", "HasSDWAOmod", "true", "Support OMod with SDWA (Sub-DWORD Addressing) extension" >; def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", "HasSDWAScalar", "true", "Support scalar register with SDWA (Sub-DWORD Addressing) extension" >; def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", "HasSDWASdst", "true", "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" >; def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", "HasSDWAMac", "true", "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" >; def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", "HasSDWAOutModsVOPC", "true", "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" >; def FeatureDPP : SubtargetFeature<"dpp", "HasDPP", "true", "Support DPP (Data Parallel Primitives) extension" >; def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", "Support clamp for integer destination" >; def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", "HasUnpackedD16VMem", "true", "Has unpacked d16 vmem instructions" >; def FeatureDLInsts : SubtargetFeature<"dl-insts", "HasDLInsts", "true", "Has deep learning instructions" >; def FeatureD16PreservesUnusedBits : SubtargetFeature< "d16-preserves-unused-bits", "D16PreservesUnusedBits", "true", "If present, then instructions defined by HasD16LoadStore predicate preserve " "unused bits. Otherwise instructions defined by HasD16LoadStore predicate " "zero unused bits." >; //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. 
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling" ->; - // Denormal handling for fp64 and fp16 is controlled by the same // config register when fp16 supported. // TODO: Do we need a separate f16 setting when not legal? def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals", "FP64FP16Denormals", "true", "Enable double and half precision denormal handling", [FeatureFP64] >; def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", "FP64FP16Denormals", "true", "Enable double and half precision denormal handling", [FeatureFP64, FeatureFP64FP16Denormals] >; def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", "FP64FP16Denormals", "true", "Enable half precision denormal handling", [FeatureFP64FP16Denormals] >; def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", "Enable floating point exceptions" >; class FeatureMaxPrivateElementSize : SubtargetFeature< "max-private-element-size-"#size, "MaxPrivateElementSize", !cast(size), "Maximum private access size may be "#size >; def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; def FeatureEnableHugePrivateBuffer : SubtargetFeature< "huge-private-buffer", "EnableHugePrivateBuffer", "true", "Enable private/scratch buffer sizes greater than 128 GB" >; def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", "Enable spilling of VGPRs to scratch memory" >; def FeatureDumpCode : SubtargetFeature <"DumpCode", "DumpCode", "true", "Dump MachineInstrs in the CodeEmitter" >; def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", "DumpCode", "true", "Dump MachineInstrs in the CodeEmitter" >; // XXX - This should probably be removed once enabled by default def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", "EnableLoadStoreOpt", "true", "Enable SI load/store optimizer pass" >; // Performance debugging feature. Allow using DS instruction immediate // offsets even if the base pointer can't be proven to be base. On SI, // base pointer values that won't give the same result as a 16-bit add // are not safe to fold, but this will override the conservative test // for the base pointer. def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature < "unsafe-ds-offset-folding", "EnableUnsafeDSOffsetFolding", "true", "Force using DS instruction immediate offsets on SI" >; def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", "EnableSIScheduler", "true", "Enable SI Machine Scheduler" >; def FeatureEnableDS128 : SubtargetFeature<"enable-ds128", "EnableDS128", "true", "Use ds_{read|write}_b128" >; // Unless +-flat-for-global is specified, turn on FlatForGlobal for // all OS-es on VI and newer hardware to avoid assertion failures due // to missing ADDR64 variants of MUBUF instructions. // FIXME: moveToVALU should be able to handle converting addr64 MUBUF // instructions. 
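// Illustrative note (assumed typical driver usage, not part of this file):
// the feature string below is what -mattr consumes, e.g.
//   llc -march=amdgcn -mcpu=gfx803 -mattr=-flat-for-global foo.ll
// explicitly turns the behaviour back off on a VI target.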
def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "FlatForGlobal", "true", "Force to generate flat instruction for global" >; def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < "auto-waitcnt-before-barrier", "AutoWaitcntBeforeBarrier", "true", "Hardware automatically inserts waitcnt before barrier" >; def FeatureCodeObjectV3 : SubtargetFeature < "code-object-v3", "CodeObjectV3", "true", "Generate code object version 3" >; // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", "Dummy feature to disable assembler instructions" >; def FeatureGCN : SubtargetFeature<"gcn", "IsGCN", "true", "GCN or newer GPU" >; class GCNSubtargetFeatureGeneration Implies> : SubtargetFeatureGeneration ; def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureLDSBankCount32, FeatureMovrel] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, FeatureIntClamp ] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureScalarAtomics ] >; class SubtargetFeatureISAVersion Implies> : SubtargetFeature < "isaver"#Major#"."#Minor#"."#Stepping, "IsaVersion", "ISAVersion"#Major#"_"#Minor#"_"#Stepping, "Instruction set version number", Implies >; def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, [FeatureSouthernIslands, FeatureLDSBankCount32]>; def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, FeatureLDSBankCount32]>; def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, FeatureFastFMAF32]>; def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, [FeatureSeaIslands, FeatureLDSBankCount16, FeatureFastFMAF32]>; def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, FeatureLDSBankCount16]>; def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, [FeatureSeaIslands, FeatureLDSBankCount32]>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, FeatureXNACK, FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : 
SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, FeatureLDSBankCount16, FeatureXNACK]>; def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureD16PreservesUnusedBits]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, FeatureD16PreservesUnusedBits]>; def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, FeatureD16PreservesUnusedBits]>; def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, [FeatureGFX9, HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, FeatureDLInsts]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. //===----------------------------------------------------------------------===// def FeatureDebuggerInsertNops : SubtargetFeature< "amdgpu-debugger-insert-nops", "DebuggerInsertNops", "true", "Insert one nop instruction for each high level source statement" >; def FeatureDebuggerEmitPrologue : SubtargetFeature< "amdgpu-debugger-emit-prologue", "DebuggerEmitPrologue", "true", "Emit debugger prologue" >; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { let guessInstructionProperties = 1; let noNamedPositionallyEncodedOperands = 1; } def AMDGPUAsmParser : AsmParser { // Some of the R600 registers have the same name, so this crashes. // For example T0_XYZW and T0_XY both have the asm name T0. 
let ShouldEmitMatchRegisterName = 0; } def AMDGPUAsmWriter : AsmWriter { int PassSubtarget = 1; } def AMDGPUAsmVariants { string Default = "Default"; int Default_ID = 0; string VOP3 = "VOP3"; int VOP3_ID = 1; string SDWA = "SDWA"; int SDWA_ID = 2; string SDWA9 = "SDWA9"; int SDWA9_ID = 3; string DPP = "DPP"; int DPP_ID = 4; string Disable = "Disable"; int Disable_ID = 5; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.Default_ID; let Name = AMDGPUAsmVariants.Default; } def VOP3AsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.VOP3_ID; let Name = AMDGPUAsmVariants.VOP3; } def SDWAAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.SDWA_ID; let Name = AMDGPUAsmVariants.SDWA; } def SDWA9AsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.SDWA9_ID; let Name = AMDGPUAsmVariants.SDWA9; } def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; } def AMDGPU : Target { // Pull in Instruction Info: let InstructionSet = AMDGPUInstrInfo; let AssemblyParsers = [AMDGPUAsmParser]; let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, VOP3AsmParserVariant, SDWAAsmParserVariant, SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; let AllowRegisterRenaming = 1; } // Dummy Instruction itineraries for pseudo instructions def ALU_NULL : FuncUnit; def NullALU : InstrItinClass; //===----------------------------------------------------------------------===// // Predicate helper class //===----------------------------------------------------------------------===// def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" >, AssemblerPredicate<"!FeatureGCN3Encoding">; def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate<"FeatureGCN3Encoding">; def isGFX9 : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX9Insts">; // TODO: Either the name to be changed or we simply use IsCI! 
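// Illustrative note (assumed usage, not from this file): predicates such as
// the ones below are consumed by instruction definitions elsewhere, typically
// written as
//   let Predicates = [isGFX9] in { ... }
// which gates both instruction selection and the assembler match tables.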
def isCIVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<"FeatureFlatAddressSpace">; def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<"FeatureFlatGlobalInsts">; def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, AssemblerPredicate<"FeatureFlatScratchInsts">; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<"FeatureUnpackedD16VMem">; def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<"!FeatureUnpackedD16VMem">; def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, AssemblerPredicate<"FeatureD16PreservesUnusedBits">; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX9Insts">; def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">, AssemblerPredicate<"FeatureAddNoCarryInsts">; def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">, AssemblerPredicate<"!FeatureAddNoCarryInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"!FeatureVOP3P">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, AssemblerPredicate<"FeatureIntClamp">; def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, AssemblerPredicate<"FeatureScalarAtomics">; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, AssemblerPredicate<"FeatureVGPRIndexMode">; def HasMovrel : Predicate<"Subtarget->hasMovrel()">, AssemblerPredicate<"FeatureMovrel">; def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, AssemblerPredicate<"FeatureFmaMixInsts">; def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<"FeatureDLInsts">; def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" include "SIInstrInfo.td" include "AMDGPUCallingConv.td" include "AMDGPUSearchableTables.td" Index: vendor/llvm/dist-release_70/lib/Target/AMDGPU/AMDGPUFeatures.td =================================================================== --- vendor/llvm/dist-release_70/lib/Target/AMDGPU/AMDGPUFeatures.td (revision 337630) +++ 
vendor/llvm/dist-release_70/lib/Target/AMDGPU/AMDGPUFeatures.td (revision 337631) @@ -1,60 +1,69 @@ //===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// def FeatureFP64 : SubtargetFeature<"fp64", "FP64", "true", "Enable double precision operations" >; def FeatureFMA : SubtargetFeature<"fmaf", "FMA", "true", "Enable single precision FMA (not as fast as mul+add, but fused)" >; +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. +def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling" +>; + class SubtargetFeatureLocalMemorySize : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", !cast(Value), "The size of local memory in bytes" >; def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; class SubtargetFeatureWavefrontSize : SubtargetFeature< "wavefrontsize"#Value, "WavefrontSize", !cast(Value), "The number of threads per wavefront" >; def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; class SubtargetFeatureGeneration Implies> : SubtargetFeature ; def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", "DX10Clamp", "true", "clamp modifier clamps NaNs to 0.0" >; def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", "EnablePromoteAlloca", "true", "Enable promote alloca pass" >; Index: vendor/llvm/dist-release_70/lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Target/AMDGPU/R600ISelLowering.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/Target/AMDGPU/R600ISelLowering.cpp (revision 337631) @@ -1,2265 +1,2305 @@ //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// // /// \file /// Custom DAG lowering for R600 // //===----------------------------------------------------------------------===// #include "R600ISelLowering.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600FrameLowering.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include #include #include #include #include using namespace llvm; #include "R600GenCallingConv.inc" R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass); addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass); addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); } // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. 
setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); // We need to include these since trunc STORES to PRIVATE need // special handling to accommodate RMW setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom); setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom); setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom); // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); // Set condition code actions setCondCodeAction(ISD::SETO, MVT::f32, Expand); setCondCodeAction(ISD::SETUO, MVT::f32, Expand); setCondCodeAction(ISD::SETLT, MVT::f32, Expand); setCondCodeAction(ISD::SETLE, MVT::f32, Expand); setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULE, MVT::f32, Expand); setCondCodeAction(ISD::SETLE, MVT::i32, Expand); setCondCodeAction(ISD::SETLT, MVT::i32, Expand); setCondCodeAction(ISD::SETULE, MVT::i32, Expand); setCondCodeAction(ISD::SETULT, MVT::i32, Expand); setOperationAction(ISD::FCOS, MVT::f32, Custom); setOperationAction(ISD::FSIN, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::v4i32, Expand); setOperationAction(ISD::SETCC, MVT::v2i32, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::FSUB, MVT::f32, Expand); setOperationAction(ISD::FCEIL, MVT::f64, Custom); setOperationAction(ISD::FTRUNC, MVT::f64, Custom); setOperationAction(ISD::FRINT, MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::v2i32, Expand); setOperationAction(ISD::SELECT, MVT::v4i32, 
Expand); // ADD, SUB overflow. // TODO: turn these into Legal? if (Subtarget->hasCARRY()) setOperationAction(ISD::UADDO, MVT::i32, Custom); if (Subtarget->hasBORROW()) setOperationAction(ISD::USUBO, MVT::i32, Custom); // Expand sign extension of vectors if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); if (!Subtarget->hasFMA()) { setOperationAction(ISD::FMA, MVT::f32, Expand); setOperationAction(ISD::FMA, MVT::f64, Expand); } // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we // need it for R600. if (!Subtarget->hasFP32Denormals()) setOperationAction(ISD::FMAD, MVT::f32, Legal); if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); } if (!Subtarget->hasBCNT(32)) setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasBCNT(64)) setOperationAction(ISD::CTPOP, MVT::i64, Expand); if (Subtarget->hasFFBH()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); if (Subtarget->hasFFBL()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we // need it for R600. if (Subtarget->hasBFE()) setHasExtractBitsInsn(true); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { setOperationAction(ISD::ADDC, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); setOperationAction(ISD::SUBE, VT, Expand); } // LLVM will expand these to atomic_cmp_swap(0) // and atomic_swap, respectively. 
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setSchedulingPreference(Sched::Source); setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::LOAD); } static inline bool isEOP(MachineBasicBlock::iterator I) { if (std::next(I) == I->getParent()->end()) return false; return std::next(I)->getOpcode() == R600::RETURN; } MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = MI; const R600InstrInfo *TII = Subtarget->getInstrInfo(); switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. if (TII->isLDSRetInstr(MI.getOpcode())) { int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || MI.getOpcode() == R600::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { NewMI.add(MI.getOperand(i)); } } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; case R600::FABS_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } case R600::FNEG_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_NEG); break; } case R600::MASK_WRITE: { unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); TII->addFlag(*defInstr, 0, MO_FLAG_MASK); break; } case R600::MOV_IMM_F32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) .getFPImm() ->getValueAPF() .bitcastToAPInt() .getZExtValue()); break; case R600::MOV_IMM_I32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); break; case R600::MOV_IMM_GLOBAL_ADDR: { //TODO: Perhaps combine this instruction with the next if possible auto MIB = TII->buildDefaultInstruction( *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X); int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal); //TODO: Ugh this is rather ugly MIB->getOperand(Idx) = MI.getOperand(1); break; } case R600::CONST_COPY: { MachineInstr *NewMI = TII->buildDefaultInstruction( *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST); TII->setImmOperand(*NewMI, R600::OpName::src0_sel, MI.getOperand(1).getImm()); break; } case R600::RAT_WRITE_CACHELESS_32_eg: case R600::RAT_WRITE_CACHELESS_64_eg: case R600::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .addImm(isEOP(I)); 
// Set End of program bit break; case R600::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)) .addImm(isEOP(I)); // Set End of program bit break; case R600::BRANCH: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP)) .add(MI.getOperand(0)); break; case R600::BRANCH_COND_f32: { MachineInstr *NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), R600::PREDICATE_BIT) .add(MI.getOperand(1)) .addImm(R600::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } case R600::BRANCH_COND_i32: { MachineInstr *NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), R600::PREDICATE_BIT) .add(MI.getOperand(1)) .addImm(R600::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } case R600::EG_ExportSwz: case R600::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock = BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { if (NextExportInst->getOpcode() == R600::EG_ExportSwz || NextExportInst->getOpcode() == R600::R600_ExportSwz) { unsigned CurrentInstExportType = NextExportInst->getOperand(1) .getImm(); if (CurrentInstExportType == InstExportType) { isLastInstructionOfItsType = false; break; } } } bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 
84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)) .add(MI.getOperand(3)) .add(MI.getOperand(4)) .add(MI.getOperand(5)) .add(MI.getOperand(6)) .addImm(CfInst) .addImm(EOP); break; } case R600::RETURN: { return BB; } } MI.eraseFromParent(); return BB; } //===----------------------------------------------------------------------===// // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); R600MachineFunctionInfo *MFI = MF.getInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) && "Load should return a value and a chain"); return Result; } case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { case Intrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, Op.getOperand(2), // Export Value Op.getOperand(3), // ArrayBase Op.getOperand(4), // Type DAG.getConstant(0, DL, MVT::i32), // SWZ_X DAG.getConstant(1, DL, MVT::i32), // SWZ_Y DAG.getConstant(2, DL, MVT::i32), // SWZ_Z DAG.getConstant(3, DL, MVT::i32) // SWZ_W }; return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args); } // default for switch(IntrinsicID) default: break; } // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) break; } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); switch (IntrinsicID) { case Intrinsic::r600_tex: case Intrinsic::r600_texc: { unsigned TextureOp; switch (IntrinsicID) { case Intrinsic::r600_tex: TextureOp = 0; break; case Intrinsic::r600_texc: TextureOp = 1; break; default: llvm_unreachable("unhandled texture operation"); } SDValue TexArgs[19] = { DAG.getConstant(TextureOp, DL, MVT::i32), Op.getOperand(1), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(1, DL, MVT::i32), DAG.getConstant(2, DL, MVT::i32), DAG.getConstant(3, DL, MVT::i32), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(1, DL, MVT::i32), DAG.getConstant(2, DL, MVT::i32), DAG.getConstant(3, DL, MVT::i32), Op.getOperand(5), Op.getOperand(6), Op.getOperand(7), Op.getOperand(8), Op.getOperand(9), Op.getOperand(10) }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } case Intrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, 
Op.getOperand(1), DAG.getConstant(0, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(0, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(1, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(1, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(2, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(2, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(3, DL, MVT::i32)), DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(3, DL, MVT::i32)) }; return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } case Intrinsic::r600_implicitarg_ptr: { MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS); uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT); return DAG.getConstant(ByteOffset, DL, PtrVT); } case Intrinsic::r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); case Intrinsic::r600_read_ngroups_y: return LowerImplicitParameter(DAG, VT, DL, 1); case Intrinsic::r600_read_ngroups_z: return LowerImplicitParameter(DAG, VT, DL, 2); case Intrinsic::r600_read_global_size_x: return LowerImplicitParameter(DAG, VT, DL, 3); case Intrinsic::r600_read_global_size_y: return LowerImplicitParameter(DAG, VT, DL, 4); case Intrinsic::r600_read_global_size_z: return LowerImplicitParameter(DAG, VT, DL, 5); case Intrinsic::r600_read_local_size_x: return LowerImplicitParameter(DAG, VT, DL, 6); case Intrinsic::r600_read_local_size_y: return LowerImplicitParameter(DAG, VT, DL, 7); case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T1_X, VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T1_Y, VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T1_Z, VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T0_X, VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T0_Y, VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); case Intrinsic::r600_recipsqrt_clamped: return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); default: return Op; } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; } } // end switch(Op.getOpcode()) return SDValue(); } void R600TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); return; case ISD::FP_TO_UINT: if (N->getValueType(0) == MVT::i1) { Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); return; } // Since we don't care about out of bounds values we can use FP_TO_SINT for // uints too. The DAGLegalizer code for uint considers some extra cases // which are not necessary here. 
LLVM_FALLTHROUGH; case ISD::FP_TO_SINT: { if (N->getValueType(0) == MVT::i1) { Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); return; } SDValue Result; if (expandFP_TO_SINT(N, Result, DAG)) Results.push_back(Result); return; } case ISD::SDIVREM: { SDValue Op = SDValue(N, 1); SDValue RES = LowerSDIVREM(Op, DAG); Results.push_back(RES); Results.push_back(RES.getValue(1)); break; } case ISD::UDIVREM: { SDValue Op = SDValue(N, 0); LowerUDIVREM64(Op, DAG, Results); break; } } } SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const { SDLoc DL(Vector); EVT VecVT = Vector.getValueType(); EVT EltVT = VecVT.getVectorElementType(); SmallVector Args; for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) { Args.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); } return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); } SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Vector = Op.getOperand(0); SDValue Index = Op.getOperand(1); if (isa(Index) || Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) return Op; Vector = vectorToVerticalVector(DAG, Vector); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), Vector, Index); } SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Vector = Op.getOperand(0); SDValue Value = Op.getOperand(1); SDValue Index = Op.getOperand(2); if (isa(Index) || Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) return Op; Vector = vectorToVerticalVector(DAG, Vector); SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), Vector, Value, Index); return vectorToVerticalVector(DAG, Insert); } SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast(Op); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); const DataLayout &DL = DAG.getDataLayout(); const GlobalValue *GV = GSD->getGlobal(); MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); } SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // On hw >= R700, COS/SIN input must be between -1. and 1. // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); SDLoc DL(Op); // TODO: Should this propagate fast-math-flags? SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FADD, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, DAG.getConstantFP(0.15915494309, DL, MVT::f32)), DAG.getConstantFP(0.5, DL, MVT::f32))); unsigned TrigNode; switch (Op.getOpcode()) { case ISD::FCOS: TrigNode = AMDGPUISD::COS_HW; break; case ISD::FSIN: TrigNode = AMDGPUISD::SIN_HW; break; default: llvm_unreachable("Wrong trig opcode"); } SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, DAG.getNode(ISD::FADD, DL, VT, FractPart, DAG.getConstantFP(-0.5, DL, MVT::f32))); if (Gen >= AMDGPUSubtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
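  // (Editorial note: 0.15915494309 above is 1/(2*Pi), so TrigVal holds the
  // angle as a fraction of a full period offset into [-0.5, 0.5); the multiply
  // by Pi below rescales it into the range the pre-R700 hardware expects.)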
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, DAG.getConstantFP(3.14159265359, DL, MVT::f32)); } SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Shift = Op.getOperand(2); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); // The dance around Width1 is necessary for 0 special case. // Without it the CompShift might be 32, producing incorrect results in // Overflow. So we do the shift in two steps, the alternative is to // add a conditional to filter the special case. SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); SDValue LoBig = Zero; Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); } SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Shift = Op.getOperand(2); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); // The dance around Width1 is necessary for 0 special case. // Without it the CompShift might be 32, producing incorrect results in // Overflow. So we do the shift in two steps, the alternative is to // add a conditional to filter the special case. SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); } SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, unsigned mainop, unsigned ovf) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); // Extend sign. 
OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, DAG.getValueType(MVT::i1)); SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); } SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode( ISD::SETCC, DL, MVT::i1, Op, DAG.getConstantFP(1.0f, DL, MVT::f32), DAG.getCondCode(ISD::SETEQ)); } SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode( ISD::SETCC, DL, MVT::i1, Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), DAG.getCondCode(ISD::SETEQ)); } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUASI.CONSTANT_BUFFER_0); + AMDGPUASI.PARAM_I_ADDRESS); // We shouldn't be using an offset wider than 16-bits for implicit parameters. assert(isInt<16>(ByteOffset)); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR MachinePointerInfo(ConstantPointerNull::get(PtrType))); } bool R600TargetLowering::isZero(SDValue Op) const { if(ConstantSDNode *Cst = dyn_cast(Op)) { return Cst->isNullValue(); } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ return CstFP->isZero(); } else { return false; } } bool R600TargetLowering::isHWTrueValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast(Op)) { return CFP->isExactlyValue(1.0); } return isAllOnesConstant(Op); } bool R600TargetLowering::isHWFalseValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast(Op)) { return CFP->getValueAPF().isZero(); } return isNullConstant(Op); } SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue True = Op.getOperand(2); SDValue False = Op.getOperand(3); SDValue CC = Op.getOperand(4); SDValue Temp; if (VT == MVT::f32) { DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); if (MinMax) return MinMax; } // LHS and RHS are guaranteed to be the same value type EVT CompareVT = LHS.getValueType(); // Check if we can lower this to a native operation. // Try to lower to a SET* instruction: // // SET* can match the following patterns: // // select_cc f32, f32, -1, 0, cc_supported // select_cc f32, f32, 1.0f, 0.0f, cc_supported // select_cc i32, i32, -1, 0, cc_supported // // Move hardware True/False values to the correct operand. ISD::CondCode CCOpcode = cast(CC)->get(); ISD::CondCode InverseCC = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); if (isHWTrueValue(False) && isHWFalseValue(True)) { if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { std::swap(False, True); CC = DAG.getCondCode(InverseCC); } else { ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { std::swap(False, True); std::swap(LHS, RHS); CC = DAG.getCondCode(SwapInvCC); } } } if (isHWTrueValue(True) && isHWFalseValue(False) && (CompareVT == VT || VT == MVT::i32)) { // This can be matched by a SET* instruction. 
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero, True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse, True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
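/// For example, with \p StackWidth 1 the byte address is shifted right by 2,
/// so each index covers a single 32-bit channel; StackWidth 2 shifts by 3
/// (two channels per index); StackWidth 4 shifts by 4, i.e. one full 128-bit
/// register per index.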
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  SDLoc DL(Ptr);
  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, DL, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);
  //TODO: Who creates the i8 stores?
  assert(Store->isTruncatingStore() ||
         Store->getValue().getValueType() == MVT::i8);
  assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);

  SDValue Mask;
  if (Store->getMemoryVT() == MVT::i8) {
    assert(Store->getAlignment() >= 1);
    Mask = DAG.getConstant(0xff, DL, MVT::i32);
  } else if (Store->getMemoryVT() == MVT::i16) {
    assert(Store->getAlignment() >= 2);
    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
  } else {
    llvm_unreachable("Unsupported private trunc store");
  }

  SDValue OldChain = Store->getChain();
  bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
  // Skip dummy
  SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
  SDValue BasePtr = Store->getBasePtr();
  SDValue Offset = Store->getOffset();
  EVT MemVT = Store->getMemoryVT();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location
  // TODO: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(UndefValue::get(
      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);

  Chain = Dst.getValue(1);

  // Get offset in dword
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                DAG.getConstant(0x3, DL, MVT::i32));

  // Convert byte offset to bit shift
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // TODO: Contrary to the name of the function,
  // it also handles sub i32 non-truncating stores (like i1)
  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                  Store->getValue());

  // Mask the value to the right type
  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

  // Shift the value in place
  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                     MaskedValue, ShiftAmt);

  // Shift the mask in place
  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);

  // Invert the mask. NOTE: if we had native ROL instructions we could
  // use the inverted mask directly
  DstMask = DAG.getNOT(DL, DstMask, MVT::i32);

  // Clean up the target bits
  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

  // Add the new bits
  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);

  // Store dword
  // TODO: Can we be smarter about MachinePointerInfo?
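  // Value now holds the original dword with the truncated value merged in,
  // so it can be written back as a single 32-bit store.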
SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo); // If we are part of expanded vector, make our neighbors depend on this store if (VectorTrunc) { // Make all other vector elements depend on this store Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore); DAG.ReplaceAllUsesOfValueWith(OldChain, Chain); } return NewStore; } SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *StoreNode = cast(Op); unsigned AS = StoreNode->getAddressSpace(); SDValue Chain = StoreNode->getChain(); SDValue Ptr = StoreNode->getBasePtr(); SDValue Value = StoreNode->getValue(); EVT VT = Value.getValueType(); EVT MemVT = StoreNode->getMemoryVT(); EVT PtrVT = Ptr.getValueType(); SDLoc DL(Op); // Neither LOCAL nor PRIVATE can do vectors at the moment if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) && VT.isVector()) { if ((AS == AMDGPUASI.PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? SDValue NewStore = DAG.getTruncStore( NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, StoreNode->getAlignment(), StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); StoreNode = cast(NewStore); } return scalarizeVectorStore(StoreNode, DAG); } unsigned Align = StoreNode->getAlignment(); if (Align < MemVT.getStoreSize() && !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { return expandUnalignedStore(StoreNode, DAG); } SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, DAG.getConstant(2, DL, PtrVT)); if (AS == AMDGPUASI.GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { assert(VT.bitsLE(MVT::i32)); SDValue MaskConstant; if (MemVT == MVT::i8) { MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); } else { assert(MemVT == MVT::i16); assert(StoreNode->getAlignment() >= 2); MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); } SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr, DAG.getConstant(0x00000003, DL, PtrVT)); SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, DAG.getConstant(3, DL, VT)); // Put the mask in correct place SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); // Put the value bits in correct place SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 // vector instead. SDValue Src[4] = { ShiftedValue, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), Mask }; SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. 
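      // (DWordAddr was computed above as Ptr >> 2; wrapping it in DWORDADDR
      // tells the store patterns that the address has already been shifted.)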
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { llvm_unreachable("Truncated and indexed stores not supported yet"); } else { Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } return Chain; } } // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes if (AS != AMDGPUASI.PRIVATE_ADDRESS) return SDValue(); if (MemVT.bitsLT(MVT::i32)) return lowerPrivateTruncStore(StoreNode, DAG); // Standard i32+ store, tag it with DWORDADDR to note that the address // has been shifted if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } // Tagged i32+ stores will be matched by patterns return SDValue(); } // return (512 + (kc_bank << 12) static int ConstantAddressBlock(unsigned AddressSpace) { switch (AddressSpace) { case AMDGPUAS::CONSTANT_BUFFER_0: return 512; case AMDGPUAS::CONSTANT_BUFFER_1: return 512 + 4096; case AMDGPUAS::CONSTANT_BUFFER_2: return 512 + 4096 * 2; case AMDGPUAS::CONSTANT_BUFFER_3: return 512 + 4096 * 3; case AMDGPUAS::CONSTANT_BUFFER_4: return 512 + 4096 * 4; case AMDGPUAS::CONSTANT_BUFFER_5: return 512 + 4096 * 5; case AMDGPUAS::CONSTANT_BUFFER_6: return 512 + 4096 * 6; case AMDGPUAS::CONSTANT_BUFFER_7: return 512 + 4096 * 7; case AMDGPUAS::CONSTANT_BUFFER_8: return 512 + 4096 * 8; case AMDGPUAS::CONSTANT_BUFFER_9: return 512 + 4096 * 9; case AMDGPUAS::CONSTANT_BUFFER_10: return 512 + 4096 * 10; case AMDGPUAS::CONSTANT_BUFFER_11: return 512 + 4096 * 11; case AMDGPUAS::CONSTANT_BUFFER_12: return 512 + 4096 * 12; case AMDGPUAS::CONSTANT_BUFFER_13: return 512 + 4096 * 13; case AMDGPUAS::CONSTANT_BUFFER_14: return 512 + 4096 * 14; case AMDGPUAS::CONSTANT_BUFFER_15: return 512 + 4096 * 15; default: return -1; } } SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast(Op); ISD::LoadExtType ExtType = Load->getExtensionType(); EVT MemVT = Load->getMemoryVT(); assert(Load->getAlignment() >= MemVT.getStoreSize()); SDValue BasePtr = Load->getBasePtr(); SDValue Chain = Load->getChain(); SDValue Offset = Load->getOffset(); SDValue LoadPtr = BasePtr; if (!Offset.isUndef()) { LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); } // Get dword location // NOTE: this should be eliminated by the future SHR ptr, 2 SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, DAG.getConstant(0xfffffffc, DL, MVT::i32)); // Load dword // TODO: can we be smarter about machine pointer info? MachinePointerInfo PtrInfo(UndefValue::get( Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))); SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); // Get offset within the register. SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); // Bit offset of target byte (byteIdx * 8). SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); // Shift to the right. SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt); // Eliminate the upper bits by setting them to ... EVT MemEltVT = MemVT.getScalarType(); if (ExtType == ISD::SEXTLOAD) { // ... ones. SDValue MemEltVTNode = DAG.getValueType(MemEltVT); Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); } else { // ... or zeros. 
Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT); } SDValue Ops[] = { Ret, Read.getValue(1) // This should be our output chain }; return DAG.getMergeValues(Ops, DL); } SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *LoadNode = cast(Op); unsigned AS = LoadNode->getAddressSpace(); EVT MemVT = LoadNode->getMemoryVT(); ISD::LoadExtType ExtType = LoadNode->getExtensionType(); if (AS == AMDGPUASI.PRIVATE_ADDRESS && ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { return lowerPrivateExtLoad(Op, DAG); } SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) && VT.isVector()) { return scalarizeVectorLoad(LoadNode, DAG); } + // This is still used for explicit load from addrspace(8) int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); if (ConstantBlock > -1 && ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { SDValue Result; - if (isa(LoadNode->getMemOperand()->getValue()) || - isa(LoadNode->getMemOperand()->getValue()) || + if (isa(LoadNode->getMemOperand()->getValue()) || isa(Ptr)) { - SDValue Slots[4]; - for (unsigned i = 0; i < 4; i++) { - // We want Const position encoded with the following formula : - // (((512 + (kc_bank << 12) + const_index) << 2) + chan) - // const_index is Ptr computed by llvm using an alignment of 16. - // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and - // then div by 4 at the ISel step - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); - Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); - } - EVT NewVT = MVT::v4i32; - unsigned NumElements = 4; - if (VT.isVector()) { - NewVT = VT; - NumElements = VT.getVectorNumElements(); - } - Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); + return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG); } else { + //TODO: Does this even work? // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, DL, MVT::i32)), DAG.getConstant(LoadNode->getAddressSpace() - AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32) ); } if (!VT.isVector()) { Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, DAG.getConstant(0, DL, MVT::i32)); } SDValue MergedValues[2] = { Result, Chain }; return DAG.getMergeValues(MergedValues, DL); } // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for // compute shaders, since the data is sign extended when it is uploaded to the // buffer. However SEXT loads from other address spaces are not supported, so // we need to expand them here. 
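  // The expansion below does an any-extending load of the memory type and
  // then sign-extends the result in-register to recover the SEXT semantics.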
if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { EVT MemVT = LoadNode->getMemoryVT(); assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); SDValue NewLoad = DAG.getExtLoad( ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, DAG.getValueType(MemVT)); SDValue MergedValues[2] = { Res, Chain }; return DAG.getMergeValues(MergedValues, DL); } if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) { return SDValue(); } // DWORDADDR ISD marks already shifted address if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { assert(VT == MVT::i32); Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32)); Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr); return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand()); } return SDValue(); } SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Jump = Op.getOperand(2); return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), Chain, Jump, Cond); } SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const R600FrameLowering *TFL = Subtarget->getFrameLowering(); FrameIndexSDNode *FIN = cast(Op); unsigned FrameIndex = FIN->getIndex(); unsigned IgnoredFrameReg; unsigned Offset = TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_LS: return CC_R600; default: report_fatal_error("Unsupported calling convention."); } } /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. SDValue R600TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); MachineFunction &MF = DAG.getMachineFunction(); SmallVector LocalIns; if (AMDGPU::isShader(CallConv)) { CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); } else { analyzeFormalArgumentsCompute(CCInfo, Ins); } for (unsigned i = 0, e = Ins.size(); i < e; ++i) { CCValAssign &VA = ArgLocs[i]; const ISD::InputArg &In = Ins[i]; EVT VT = In.VT; EVT MemVT = VA.getLocVT(); if (!VT.isVector() && MemVT.isVector()) { // Get load source type if scalarized. 
MemVT = MemVT.getVectorElementType(); } if (AMDGPU::isShader(CallConv)) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); continue; } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUASI.CONSTANT_BUFFER_0); + AMDGPUASI.PARAM_I_ADDRESS); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. It attempts to create this sextload, but it ends up // being invalid. Somehow this seems to work with i64 arguments, but breaks // for <1 x i64>. // The first 36 bytes of the input buffer contains information about // thread group and global sizes. ISD::LoadExtType Ext = ISD::NON_EXTLOAD; if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { // FIXME: This should really check the extload type, but the handling of // extload vector parameters seems to be broken. // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; Ext = ISD::SEXTLOAD; } // Compute the offset from the value. // XXX - I think PartOffset should give you this, but it seems to give the // size of the register which isn't useful. unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); + unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, - MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | + MemVT, Alignment, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); } return Chain; } EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); } bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const { // Local and Private addresses do not handle vectors. Limit to i32 if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { return (MemVT.getSizeInBits() <= 32); } return true; } bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, bool *IsFast) const { if (IsFast) *IsFast = false; if (!VT.isSimple() || VT == MVT::Other) return false; if (VT.bitsLT(MVT::i32)) return false; // TODO: This is a rough estimate. if (IsFast) *IsFast = true; return VT.bitsGT(MVT::i32) && Align % 4 == 0; } static SDValue CompactSwizzlableVector( SelectionDAG &DAG, SDValue VectorEntry, DenseMap &RemapSwizzle) { assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); SDValue NewBldVec[4] = { VectorEntry.getOperand(0), VectorEntry.getOperand(1), VectorEntry.getOperand(2), VectorEntry.getOperand(3) }; for (unsigned i = 0; i < 4; i++) { if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, // break false dependencies and additionnaly make assembly easier to read. 
RemapSwizzle[i] = 7; // SEL_MASK_WRITE if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { if (C->isZero()) { RemapSwizzle[i] = 4; // SEL_0 NewBldVec[i] = DAG.getUNDEF(MVT::f32); } else if (C->isExactlyValue(1.0)) { RemapSwizzle[i] = 5; // SEL_1 NewBldVec[i] = DAG.getUNDEF(MVT::f32); } } if (NewBldVec[i].isUndef()) continue; for (unsigned j = 0; j < i; j++) { if (NewBldVec[i] == NewBldVec[j]) { NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); RemapSwizzle[i] = j; break; } } } return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, DenseMap &RemapSwizzle) { assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); SDValue NewBldVec[4] = { VectorEntry.getOperand(0), VectorEntry.getOperand(1), VectorEntry.getOperand(2), VectorEntry.getOperand(3) }; bool isUnmovable[4] = { false, false, false, false }; for (unsigned i = 0; i < 4; i++) { RemapSwizzle[i] = i; if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) ->getZExtValue(); if (i == Idx) isUnmovable[Idx] = true; } } for (unsigned i = 0; i < 4; i++) { if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) ->getZExtValue(); if (isUnmovable[Idx]) continue; // Swap i and Idx std::swap(NewBldVec[Idx], NewBldVec[i]); std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); break; } } return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), NewBldVec); } SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], SelectionDAG &DAG, const SDLoc &DL) const { assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); // Old -> New swizzle values DenseMap SwizzleRemap; BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = cast(Swz[i])->getZExtValue(); if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } SwizzleRemap.clear(); BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = cast(Swz[i])->getZExtValue(); if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } return BuildVector; } +SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block, + SelectionDAG &DAG) const { + SDLoc DL(LoadNode); + EVT VT = LoadNode->getValueType(0); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = LoadNode->getBasePtr(); + assert (isa(Ptr)); + + //TODO: Support smaller loads + if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode)) + return SDValue(); + + if (LoadNode->getAlignment() < 4) + return SDValue(); + + int ConstantBlock = ConstantAddressBlock(Block); + + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. 
+ // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + EVT NewVT = MVT::v4i32; + unsigned NumElements = 4; + if (VT.isVector()) { + NewVT = VT; + NumElements = VT.getVectorNumElements(); + } + SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i32)); + } + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, DL); +} + //===----------------------------------------------------------------------===// // Custom DAG Optimizations //===----------------------------------------------------------------------===// SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); switch (N->getOpcode()) { // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) case ISD::FP_ROUND: { SDValue Arg = N->getOperand(0); if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0), Arg.getOperand(0)); } break; } // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> // (i32 select_cc f32, f32, -1, 0 cc) // // Mesa's GLSL frontend generates the above pattern a lot and we can lower // this to one of the SET*_DX10 instructions. case ISD::FP_TO_SINT: { SDValue FNeg = N->getOperand(0); if (FNeg.getOpcode() != ISD::FNEG) { return SDValue(); } SDValue SelectCC = FNeg.getOperand(0); if (SelectCC.getOpcode() != ISD::SELECT_CC || SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS SelectCC.getOperand(2).getValueType() != MVT::f32 || // True !isHWTrueValue(SelectCC.getOperand(2)) || !isHWFalseValue(SelectCC.getOperand(3))) { return SDValue(); } return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0), SelectCC.getOperand(0), // LHS SelectCC.getOperand(1), // RHS DAG.getConstant(-1, DL, MVT::i32), // True DAG.getConstant(0, DL, MVT::i32), // False SelectCC.getOperand(4)); // CC break; } // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx // => build_vector elt0, ... , NewEltIdx, ... , eltN case ISD::INSERT_VECTOR_ELT: { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); SDValue EltNo = N->getOperand(2); // If the inserted element is an UNDEF, just use the input vector. if (InVal.isUndef()) return InVec; EVT VT = InVec.getValueType(); // If we can't generate a legal BUILD_VECTOR, exit if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) return SDValue(); // Check that we know which element is being inserted if (!isa(EltNo)) return SDValue(); unsigned Elt = cast(EltNo)->getZExtValue(); // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. SmallVector Ops; if (InVec.getOpcode() == ISD::BUILD_VECTOR) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); } else if (InVec.isUndef()) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { return SDValue(); } // Insert the element if (Elt < Ops.size()) { // All the operands of BUILD_VECTOR must have the same type; // we enforce that here. 
EVT OpVT = Ops[0].getValueType(); if (InVal.getValueType() != OpVT) InVal = OpVT.bitsGT(InVal.getValueType()) ? DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) : DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal); Ops[Elt] = InVal; } // Return the new vector return DAG.getBuildVector(VT, DL, Ops); } // Extract_vec (Build_vector) generated by custom lowering // also needs to be customly combined case ISD::EXTRACT_VECTOR_ELT: { SDValue Arg = N->getOperand(0); if (Arg.getOpcode() == ISD::BUILD_VECTOR) { if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { unsigned Element = Const->getZExtValue(); return Arg->getOperand(Element); } } if (Arg.getOpcode() == ISD::BITCAST && Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && (Arg.getOperand(0).getValueType().getVectorNumElements() == Arg.getValueType().getVectorNumElements())) { if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { unsigned Element = Const->getZExtValue(); return DAG.getNode(ISD::BITCAST, DL, N->getVTList(), Arg->getOperand(0).getOperand(Element)); } } break; } case ISD::SELECT_CC: { // Try common optimizations if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI)) return Ret; // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> // selectcc x, y, a, b, inv(cc) // // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> // selectcc x, y, a, b, cc SDValue LHS = N->getOperand(0); if (LHS.getOpcode() != ISD::SELECT_CC) { return SDValue(); } SDValue RHS = N->getOperand(1); SDValue True = N->getOperand(2); SDValue False = N->getOperand(3); ISD::CondCode NCC = cast(N->getOperand(4))->get(); if (LHS.getOperand(2).getNode() != True.getNode() || LHS.getOperand(3).getNode() != False.getNode() || RHS.getNode() != False.getNode()) { return SDValue(); } switch (NCC) { default: return SDValue(); case ISD::SETNE: return LHS; case ISD::SETEQ: { ISD::CondCode LHSCC = cast(LHS.getOperand(4))->get(); LHSCC = ISD::getSetCCInverse(LHSCC, LHS.getOperand(0).getValueType().isInteger()); if (DCI.isBeforeLegalizeOps() || isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) return DAG.getSelectCC(DL, LHS.getOperand(0), LHS.getOperand(1), LHS.getOperand(2), LHS.getOperand(3), LHSCC); break; } } return SDValue(); } case AMDGPUISD::R600_EXPORT: { SDValue Arg = N->getOperand(1); if (Arg.getOpcode() != ISD::BUILD_VECTOR) break; SDValue NewArgs[8] = { N->getOperand(0), // Chain SDValue(), N->getOperand(2), // ArrayBase N->getOperand(3), // Type N->getOperand(4), // SWZ_X N->getOperand(5), // SWZ_Y N->getOperand(6), // SWZ_Z N->getOperand(7) // SWZ_W }; NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { SDValue Arg = N->getOperand(1); if (Arg.getOpcode() != ISD::BUILD_VECTOR) break; SDValue NewArgs[19] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4), N->getOperand(5), N->getOperand(6), N->getOperand(7), N->getOperand(8), N->getOperand(9), N->getOperand(10), N->getOperand(11), N->getOperand(12), N->getOperand(13), N->getOperand(14), N->getOperand(15), N->getOperand(16), N->getOperand(17), N->getOperand(18), }; NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); } + + case ISD::LOAD: { + LoadSDNode *LoadNode = cast(N); + SDValue Ptr = LoadNode->getBasePtr(); + if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS && + isa(Ptr)) + return 
constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG); + break; + } + default: break; } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) const { const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Src.isMachineOpcode()) return false; switch (Src.getMachineOpcode()) { case R600::FNEG_R600: if (!Neg.getNode()) return false; Src = Src.getOperand(0); Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; case R600::FABS_R600: if (!Abs.getNode()) return false; Src = Src.getOperand(0); Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; case R600::CONST_COPY: { unsigned Opcode = ParentNode->getMachineOpcode(); bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; if (!Sel.getNode()) return false; SDValue CstOffset = Src.getOperand(0); if (ParentNode->getValueType(0).isVector()) return false; // Gather constants values int SrcIndices[] = { TII->getOperandIdx(Opcode, R600::OpName::src0), TII->getOperandIdx(Opcode, R600::OpName::src1), TII->getOperandIdx(Opcode, R600::OpName::src2), TII->getOperandIdx(Opcode, R600::OpName::src0_X), TII->getOperandIdx(Opcode, R600::OpName::src0_Y), TII->getOperandIdx(Opcode, R600::OpName::src0_Z), TII->getOperandIdx(Opcode, R600::OpName::src0_W), TII->getOperandIdx(Opcode, R600::OpName::src1_X), TII->getOperandIdx(Opcode, R600::OpName::src1_Y), TII->getOperandIdx(Opcode, R600::OpName::src1_Z), TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; std::vector Consts; for (int OtherSrcIdx : SrcIndices) { int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); if (OtherSrcIdx < 0 || OtherSelIdx < 0) continue; if (HasDst) { OtherSrcIdx--; OtherSelIdx--; } if (RegisterSDNode *Reg = dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { if (Reg->getReg() == R600::ALU_CONST) { ConstantSDNode *Cst = cast(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); } } } ConstantSDNode *Cst = cast(CstOffset); Consts.push_back(Cst->getZExtValue()); if (!TII->fitsConstReadLimitations(Consts)) { return false; } Sel = CstOffset; Src = DAG.getRegister(R600::ALU_CONST, MVT::f32); return true; } case R600::MOV_IMM_GLOBAL_ADDR: // Check if the Imm slot is used. Taken from below. if (cast(Imm)->getZExtValue()) return false; Imm = Src.getOperand(0); Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); return true; case R600::MOV_IMM_I32: case R600::MOV_IMM_F32: { unsigned ImmReg = R600::ALU_LITERAL_X; uint64_t ImmValue = 0; if (Src.getMachineOpcode() == R600::MOV_IMM_F32) { ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); float FloatValue = FPC->getValueAPF().convertToFloat(); if (FloatValue == 0.0) { ImmReg = R600::ZERO; } else if (FloatValue == 0.5) { ImmReg = R600::HALF; } else if (FloatValue == 1.0) { ImmReg = R600::ONE; } else { ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); } } else { ConstantSDNode *C = dyn_cast(Src.getOperand(0)); uint64_t Value = C->getZExtValue(); if (Value == 0) { ImmReg = R600::ZERO; } else if (Value == 1) { ImmReg = R600::ONE_INT; } else { ImmValue = Value; } } // Check that we aren't already using an immediate. // XXX: It's possible for an instruction to have more than one // immediate operand, but this is not supported yet. 
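      // The instruction must actually have a literal operand slot, and that
      // slot must not already be claimed by another immediate.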
if (ImmReg == R600::ALU_LITERAL_X) { if (!Imm.getNode()) return false; ConstantSDNode *C = dyn_cast(Imm); assert(C); if (C->getZExtValue()) return false; Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); } Src = DAG.getRegister(ImmReg, MVT::i32); return true; } default: return false; } } /// Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; unsigned Opcode = Node->getMachineOpcode(); SDValue FakeOp; std::vector Ops(Node->op_begin(), Node->op_end()); if (Opcode == R600::DOT_4) { int OperandIdx[] = { TII->getOperandIdx(Opcode, R600::OpName::src0_X), TII->getOperandIdx(Opcode, R600::OpName::src0_Y), TII->getOperandIdx(Opcode, R600::OpName::src0_Z), TII->getOperandIdx(Opcode, R600::OpName::src0_W), TII->getOperandIdx(Opcode, R600::OpName::src1_X), TII->getOperandIdx(Opcode, R600::OpName::src1_Y), TII->getOperandIdx(Opcode, R600::OpName::src1_Z), TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; int NegIdx[] = { TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X), TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y), TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z), TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W), TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X), TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y), TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z), TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W) }; int AbsIdx[] = { TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X), TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y), TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z), TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W), TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X), TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y), TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z), TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W) }; for (unsigned i = 0; i < 8; i++) { if (OperandIdx[i] < 0) return Node; SDValue &Src = Ops[OperandIdx[i] - 1]; SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue &Abs = Ops[AbsIdx[i] - 1]; bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); if (HasDst) SelIdx--; SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } } else if (Opcode == R600::REG_SEQUENCE) { for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { SDValue &Src = Ops[i]; if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } } else { if (!TII->hasInstrModifiers(Opcode)) return Node; int OperandIdx[] = { TII->getOperandIdx(Opcode, R600::OpName::src0), TII->getOperandIdx(Opcode, R600::OpName::src1), TII->getOperandIdx(Opcode, R600::OpName::src2) }; int NegIdx[] = { TII->getOperandIdx(Opcode, R600::OpName::src0_neg), TII->getOperandIdx(Opcode, R600::OpName::src1_neg), TII->getOperandIdx(Opcode, R600::OpName::src2_neg) }; int AbsIdx[] = { TII->getOperandIdx(Opcode, R600::OpName::src0_abs), TII->getOperandIdx(Opcode, R600::OpName::src1_abs), -1 }; for (unsigned i = 0; i < 3; i++) { if (OperandIdx[i] < 0) return Node; SDValue &Src = Ops[OperandIdx[i] - 1]; SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue FakeAbs; SDValue &Abs = (AbsIdx[i] > -1) ? 
Ops[AbsIdx[i] - 1] : FakeAbs; bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal); if (HasDst) { SelIdx--; ImmIdx--; } SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; SDValue &Imm = Ops[ImmIdx]; if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } } return Node; } Index: vendor/llvm/dist-release_70/lib/Target/AMDGPU/R600ISelLowering.h =================================================================== --- vendor/llvm/dist-release_70/lib/Target/AMDGPU/R600ISelLowering.h (revision 337630) +++ vendor/llvm/dist-release_70/lib/Target/AMDGPU/R600ISelLowering.h (revision 337631) @@ -1,110 +1,112 @@ //===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file /// R600 DAG Lowering interface definition // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H #include "AMDGPUISelLowering.h" namespace llvm { class R600InstrInfo; class R600Subtarget; class R600TargetLowering final : public AMDGPUTargetLowering { const R600Subtarget *Subtarget; public: R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI); const R600Subtarget *getSubtarget() const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the /// first nine dwords of a Vertex Buffer. These implicit parameters are /// lowered to load instructions which retrieve the values from the Vertex /// Buffer. 
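Since the nine implicit parameters occupy fixed dword slots at the start of the vertex buffer, lowering one reduces to turning its slot index into a byte offset for the generated load. A minimal sketch, assuming the usual 4-byte dwords; implicitParamByteOffset is a hypothetical helper, not an API of this file:

#include <cstdint>

// Byte offset of an implicit OpenCL kernel parameter, given its dword slot
// (0..8) in the vertex buffer described above.
constexpr uint64_t implicitParamByteOffset(unsigned DwordSlot) {
  return static_cast<uint64_t>(DwordSlot) * 4; // one dword = 4 bytes
}

static_assert(implicitParamByteOffset(8) == 32, "ninth parameter starts at byte 32");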
SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL, unsigned DwordOffset) const; void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, const SDLoc &DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, unsigned mainop, unsigned ovf) const; SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, SelectionDAG &DAG) const; void getStackAddress(unsigned StackWidth, unsigned ElemIdx, unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; - bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, - SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, - SelectionDAG &DAG) const; + bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, + SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const; + SDValue constBufferLoad(LoadSDNode *LoadNode, int Block, + SelectionDAG &DAG) const; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; } // End namespace llvm; #endif Index: vendor/llvm/dist-release_70/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- vendor/llvm/dist-release_70/lib/Target/AMDGPU/VOP3Instructions.td (revision 337630) +++ vendor/llvm/dist-release_70/lib/Target/AMDGPU/VOP3Instructions.td (revision 337631) @@ -1,861 +1,850 @@ //===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // VOP3 Classes //===----------------------------------------------------------------------===// class getVOP3ModPat { dag src0 = !if(P.HasOMod, (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); list ret3 = [(set P.DstVT:$vdst, (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, (node (P.Src0VT src0)))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class getVOP3PModPat { list ret3 = [(set P.DstVT:$vdst, (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class getVOP3OpSelPat { list ret3 = [(set P.DstVT:$vdst, (node (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, (node !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class getVOP3OpSelModPat { list ret3 = [(set P.DstVT:$vdst, (node (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, (node !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class getVOP3Pat { list ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; list ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; list ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, 
ret1)); } class getVOP3ClampPat { list ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))]; list ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))]; list ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class VOP3Inst : VOP3_Pseudo.ret, getVOP3OpSelPat.ret), !if(P.HasModifiers, getVOP3ModPat.ret, !if(P.HasIntClamp, getVOP3ClampPat.ret, getVOP3Pat.ret))), VOP3Only, 0, P.HasOpSel> { let IntClamp = P.HasIntClamp; let AsmMatchConverter = !if(P.HasOpSel, "cvtVOP3OpSel", !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)), "cvtVOP3", "")); } // Special case for v_div_fmas_{f32|f64}, since it seems to be the // only VOP instruction that implicitly reads VCC. let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in { def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> { let Outs64 = (outs DstRC.RegClass:$vdst); } def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { let Outs64 = (outs DstRC.RegClass:$vdst); } } class getVOP3VCC { list ret = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), (i1 VCC)))]; } class VOP3Features { bit HasClamp = Clamp; bit HasOpSel = OpSel; bit IsPacked = Packed; } def VOP3_REGULAR : VOP3Features<0, 0, 0>; def VOP3_CLAMP : VOP3Features<1, 0, 0>; def VOP3_OPSEL : VOP3Features<1, 1, 0>; def VOP3_PACKED : VOP3Features<1, 1, 1>; class VOP3_Profile : VOPProfile { let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers); // FIXME: Hack to stop printing _e64 let Outs64 = (outs DstRC.RegClass:$vdst); let Asm64 = " " # !if(Features.HasOpSel, getAsmVOP3OpSel.ret, !if(Features.HasClamp, getAsm64.ret, P.Asm64)); } class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
let HasModifiers = 0; let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile { // FIXME: Hack to stop printing _e64 let DstRC = RegisterOperand; } def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile { // FIXME: Hack to stop printing _e64 let DstRC = RegisterOperand; } def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; // FIXME: Hack to stop printing _e64 let DstRC = RegisterOperand; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp"; } //===----------------------------------------------------------------------===// // VOP3 INTERP //===----------------------------------------------------------------------===// class VOP3Interp : VOP3_Pseudo { let AsmMatchConverter = "cvtVOP3Interp"; } def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, Attr:$attr, AttrChan:$attrchan, clampmod:$clamp, omod:$omod); let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod"; } def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> { let Ins64 = (ins InterpSlot:$src0, Attr:$attr, AttrChan:$attrchan, clampmod:$clamp, omod:$omod); let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod"; let HasClamp = 1; } class getInterp16Asm { string src2 = !if(HasSrc2, ", $src2_modifiers", ""); string omod = !if(HasOMod, "$omod", ""); string ret = " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod; } class getInterp16Ins { dag ret = !if(HasSrc2, !if(HasOMod, (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, Attr:$attr, AttrChan:$attrchan, Src2Mod:$src2_modifiers, VRegSrc_32:$src2, highmod:$high, clampmod:$clamp, omod:$omod), (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, Attr:$attr, AttrChan:$attrchan, Src2Mod:$src2_modifiers, VRegSrc_32:$src2, highmod:$high, clampmod:$clamp) ), (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, Attr:$attr, AttrChan:$attrchan, highmod:$high, clampmod:$clamp, omod:$omod) ); } class VOP3_INTERP16 ArgVT> : VOPProfile { let HasOMod = !if(!eq(DstVT.Value, f16.Value), 0, 1); let HasHigh = 1; let Outs64 = (outs VGPR_32:$vdst); let Ins64 = getInterp16Ins.ret; let Asm64 = getInterp16Asm.ret; } //===----------------------------------------------------------------------===// // VOP3 Instructions //===----------------------------------------------------------------------===// let isCommutable = 1 in { def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile>; def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile, fmad>; def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile>; def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile>; def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, fmul, 1>; def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum, 1>; def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum, 1>; } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteQuarterRate32] in { def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile>; def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile, mulhu>; def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile>; def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile, 
mulhs>; } // End SchedRW = [WriteQuarterRate32] let Uses = [VCC, EXEC] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) // result *= 2^32 // def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, getVOP3VCC.ret> { let SchedRW = [WriteFloatFMA]; } // v_div_fmas_f64: // result = src0 * src1 + src2 // if (vcc) // result *= 2^64 // def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, getVOP3VCC.ret> { let SchedRW = [WriteDouble]; } } // End Uses = [VCC, EXEC] } // End isCommutable = 1 def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile, int_amdgcn_cubeid>; def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile, int_amdgcn_cubesc>; def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile, int_amdgcn_cubetc>; def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile, int_amdgcn_cubema>; def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile, AMDGPUbfe_u32>; def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile, AMDGPUbfe_i32>; def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile, AMDGPUbfi>; def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile, int_amdgcn_alignbit>; def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile, int_amdgcn_alignbyte>; def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile, AMDGPUfmin3>; def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile, AMDGPUsmin3>; def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile, AMDGPUumin3>; def V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile, AMDGPUfmax3>; def V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile, AMDGPUsmax3>; def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile, AMDGPUumax3>; def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile, AMDGPUfmed3>; def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile, AMDGPUsmed3>; def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile, AMDGPUumed3>; def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile>; def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile>; def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile>; def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile>; def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile, AMDGPUdiv_fixup>; let SchedRW = [WriteDoubleAdd] in { def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>; def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPUldexp, 1>; } // End SchedRW = [WriteDoubleAdd] def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; let AsmMatchConverter = ""; } // Double precision division pre-scale. 
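Per the v_div_fmas comments above, the instruction is a fused multiply-add whose result is scaled by 2^32 (2^64 for the f64 form) when VCC is set. A minimal reference sketch of that semantics for the f32 case, using std::fma and std::ldexp; divFmasF32 is only an illustration of the behaviour, not the instruction definition:

#include <cmath>

// result = fma(a, b, c); if (vcc) result *= 2^32
static float divFmasF32(float A, float B, float C, bool VCC) {
  float R = std::fma(A, B, C);
  return VCC ? std::ldexp(R, 32) : R;
}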
def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; let AsmMatchConverter = ""; } def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; let Constraints = "@earlyclobber $vdst" in { def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile>; } // End Constraints = "@earlyclobber $vdst" def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile, AMDGPUtrig_preop> { let SchedRW = [WriteDouble]; } let SchedRW = [Write64Bit] in { // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile>; def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile>; def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile>; def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile>; } // End SubtargetPredicate = isSICI let SubtargetPredicate = isVI in { def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile>; def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile>; def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile>; } // End SubtargetPredicate = isVI } // End SchedRW = [Write64Bit] let SubtargetPredicate = isCIVI in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] let isCommutable = 1 in { let SchedRW = [WriteQuarterRate32, WriteSALU] in { def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SchedRW = [WriteDouble, WriteSALU] } // End isCommutable = 1 } // End SubtargetPredicate = isCIVI def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup> { let Predicates = [Has16BitInsts, isVIOnly]; } def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile, AMDGPUdiv_fixup> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; } let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma>; def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; } let SubtargetPredicate = isGFX9 in { def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile>; def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile>; def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile>; def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile>; def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 let SubtargetPredicate = isVI in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile, AMDGPUperm>; } // End SubtargetPredicate = 
isVI let Predicates = [Has16BitInsts] in { multiclass Ternary_i16_Pats { def : GCNPat < (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) >; -def : GCNPat< - (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) ->; - -def : GCNPat< - (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (REG_SEQUENCE VReg_64, - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; } defm: Ternary_i16_Pats; defm: Ternary_i16_Pats; } // End Predicates = [Has16BitInsts] let SubtargetPredicate = isGFX9 in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile>; def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile>; def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile>; def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile>; def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile>; def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile>; def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile>; def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile>; def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile, AMDGPUfmed3>; def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile, AMDGPUsmed3>; def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile, AMDGPUumed3>; def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile, AMDGPUfmin3>; def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile, AMDGPUsmin3>; def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile, AMDGPUumin3>; def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile, AMDGPUfmax3>; def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile, AMDGPUsmax3>; def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile, AMDGPUumax3>; def V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile>; def V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile>; def V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile>; def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile>; def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile>; def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile>; def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile>; def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile>; } // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// class getClampPat { dag ret3 = (P.DstVT (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2)); dag ret2 = (P.DstVT (node P.Src0VT:$src0, P.Src1VT:$src1)); dag ret1 = (P.DstVT (node P.Src0VT:$src0)); dag ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class getClampRes { dag ret3 = (inst P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, (i1 0)); dag ret2 = (inst P.Src0VT:$src0, P.Src1VT:$src1, (i1 0)); dag ret1 = (inst P.Src0VT:$src0, (i1 0)); dag ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); } class IntClampPat : GCNPat< getClampPat.ret, getClampRes.ret >; def : IntClampPat; def : IntClampPat; def : IntClampPat; def : IntClampPat; def : IntClampPat; def : IntClampPat; def : IntClampPat; def : IntClampPat; def : IntClampPat; //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // SI 
//===----------------------------------------------------------------------===// let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { multiclass VOP3_Real_si op> { def _si : VOP3_Real(NAME), SIEncodingFamily.SI>, VOP3e_si (NAME).Pfl>; } multiclass VOP3be_Real_si op> { def _si : VOP3_Real(NAME), SIEncodingFamily.SI>, VOP3be_si (NAME).Pfl>; } } // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>; defm V_MAD_F32 : VOP3_Real_si <0x141>; defm V_MAD_I32_I24 : VOP3_Real_si <0x142>; defm V_MAD_U32_U24 : VOP3_Real_si <0x143>; defm V_CUBEID_F32 : VOP3_Real_si <0x144>; defm V_CUBESC_F32 : VOP3_Real_si <0x145>; defm V_CUBETC_F32 : VOP3_Real_si <0x146>; defm V_CUBEMA_F32 : VOP3_Real_si <0x147>; defm V_BFE_U32 : VOP3_Real_si <0x148>; defm V_BFE_I32 : VOP3_Real_si <0x149>; defm V_BFI_B32 : VOP3_Real_si <0x14a>; defm V_FMA_F32 : VOP3_Real_si <0x14b>; defm V_FMA_F64 : VOP3_Real_si <0x14c>; defm V_LERP_U8 : VOP3_Real_si <0x14d>; defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>; defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>; defm V_MULLIT_F32 : VOP3_Real_si <0x150>; defm V_MIN3_F32 : VOP3_Real_si <0x151>; defm V_MIN3_I32 : VOP3_Real_si <0x152>; defm V_MIN3_U32 : VOP3_Real_si <0x153>; defm V_MAX3_F32 : VOP3_Real_si <0x154>; defm V_MAX3_I32 : VOP3_Real_si <0x155>; defm V_MAX3_U32 : VOP3_Real_si <0x156>; defm V_MED3_F32 : VOP3_Real_si <0x157>; defm V_MED3_I32 : VOP3_Real_si <0x158>; defm V_MED3_U32 : VOP3_Real_si <0x159>; defm V_SAD_U8 : VOP3_Real_si <0x15a>; defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>; defm V_SAD_U16 : VOP3_Real_si <0x15c>; defm V_SAD_U32 : VOP3_Real_si <0x15d>; defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>; defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>; defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>; defm V_LSHL_B64 : VOP3_Real_si <0x161>; defm V_LSHR_B64 : VOP3_Real_si <0x162>; defm V_ASHR_I64 : VOP3_Real_si <0x163>; defm V_ADD_F64 : VOP3_Real_si <0x164>; defm V_MUL_F64 : VOP3_Real_si <0x165>; defm V_MIN_F64 : VOP3_Real_si <0x166>; defm V_MAX_F64 : VOP3_Real_si <0x167>; defm V_LDEXP_F64 : VOP3_Real_si <0x168>; defm V_MUL_LO_U32 : VOP3_Real_si <0x169>; defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>; defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>; defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>; defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>; defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>; defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>; defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>; defm V_MSAD_U8 : VOP3_Real_si <0x171>; defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>; defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>; //===----------------------------------------------------------------------===// // CI //===----------------------------------------------------------------------===// multiclass VOP3_Real_ci op> { def _ci : VOP3_Real(NAME), SIEncodingFamily.SI>, VOP3e_si (NAME).Pfl> { let AssemblerPredicates = [isCIOnly]; let DecoderNamespace = "CI"; } } multiclass VOP3be_Real_ci op> { def _ci : VOP3_Real(NAME), SIEncodingFamily.SI>, VOP3be_si (NAME).Pfl> { let AssemblerPredicates = [isCIOnly]; let DecoderNamespace = "CI"; } } defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>; //===----------------------------------------------------------------------===// // VI //===----------------------------------------------------------------------===// let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { multiclass VOP3_Real_vi op> { def _vi : 
VOP3_Real(NAME), SIEncodingFamily.VI>, VOP3e_vi (NAME).Pfl>; } multiclass VOP3be_Real_vi op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, VOP3be_vi (NAME).Pfl>; } multiclass VOP3OpSel_Real_gfx9 op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, VOP3OpSel_gfx9 (NAME).Pfl>; } multiclass VOP3Interp_Real_vi op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, VOP3Interp_vi (NAME).Pfl>; } } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in { multiclass VOP3_F16_Real_vi op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, VOP3e_vi (NAME).Pfl>; } multiclass VOP3Interp_F16_Real_vi op> { def _vi : VOP3_Real(NAME), SIEncodingFamily.VI>, VOP3Interp_vi (NAME).Pfl>; } } // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { multiclass VOP3_F16_Real_gfx9 op, string OpName, string AsmName> { def _gfx9 : VOP3_Real(OpName), SIEncodingFamily.GFX9>, VOP3e_vi (OpName).Pfl> { VOP3_Pseudo ps = !cast(OpName); let AsmString = AsmName # ps.AsmOperands; } } multiclass VOP3OpSel_F16_Real_gfx9 op, string AsmName> { def _gfx9 : VOP3_Real(NAME), SIEncodingFamily.GFX9>, VOP3OpSel_gfx9 (NAME).Pfl> { VOP3_Pseudo ps = !cast(NAME); let AsmString = AsmName # ps.AsmOperands; } } multiclass VOP3Interp_F16_Real_gfx9 op, string OpName, string AsmName> { def _gfx9 : VOP3_Real(OpName), SIEncodingFamily.GFX9>, VOP3Interp_vi (OpName).Pfl> { VOP3_Pseudo ps = !cast(OpName); let AsmString = AsmName # ps.AsmOperands; } } multiclass VOP3_Real_gfx9 op, string AsmName> { def _gfx9 : VOP3_Real(NAME), SIEncodingFamily.GFX9>, VOP3e_vi (NAME).Pfl> { VOP_Pseudo ps = !cast(NAME); let AsmString = AsmName # ps.AsmOperands; } } } // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>; defm V_MAD_F32 : VOP3_Real_vi <0x1c1>; defm V_MAD_I32_I24 : VOP3_Real_vi <0x1c2>; defm V_MAD_U32_U24 : VOP3_Real_vi <0x1c3>; defm V_CUBEID_F32 : VOP3_Real_vi <0x1c4>; defm V_CUBESC_F32 : VOP3_Real_vi <0x1c5>; defm V_CUBETC_F32 : VOP3_Real_vi <0x1c6>; defm V_CUBEMA_F32 : VOP3_Real_vi <0x1c7>; defm V_BFE_U32 : VOP3_Real_vi <0x1c8>; defm V_BFE_I32 : VOP3_Real_vi <0x1c9>; defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; defm V_MAX3_F32 : VOP3_Real_vi <0x1d3>; defm V_MAX3_I32 : VOP3_Real_vi <0x1d4>; defm V_MAX3_U32 : VOP3_Real_vi <0x1d5>; defm V_MED3_F32 : VOP3_Real_vi <0x1d6>; defm V_MED3_I32 : VOP3_Real_vi <0x1d7>; defm V_MED3_U32 : VOP3_Real_vi <0x1d8>; defm V_SAD_U8 : VOP3_Real_vi <0x1d9>; defm V_SAD_HI_U8 : VOP3_Real_vi <0x1da>; defm V_SAD_U16 : VOP3_Real_vi <0x1db>; defm V_SAD_U32 : VOP3_Real_vi <0x1dc>; defm V_CVT_PK_U8_F32 : VOP3_Real_vi <0x1dd>; defm V_DIV_FIXUP_F32 : VOP3_Real_vi <0x1de>; defm V_DIV_FIXUP_F64 : VOP3_Real_vi <0x1df>; defm V_DIV_SCALE_F32 : VOP3be_Real_vi <0x1e0>; defm V_DIV_SCALE_F64 : VOP3be_Real_vi <0x1e1>; defm V_DIV_FMAS_F32 : VOP3_Real_vi <0x1e2>; defm V_DIV_FMAS_F64 : VOP3_Real_vi <0x1e3>; defm V_MSAD_U8 : VOP3_Real_vi <0x1e4>; defm V_QSAD_PK_U16_U8 : VOP3_Real_vi <0x1e5>; defm V_MQSAD_PK_U16_U8 : VOP3_Real_vi <0x1e6>; defm 
V_MQSAD_U32_U8 : VOP3_Real_vi <0x1e7>; defm V_PERM_B32 : VOP3_Real_vi <0x1ed>; defm V_MAD_F16 : VOP3_F16_Real_vi <0x1ea>; defm V_MAD_U16 : VOP3_F16_Real_vi <0x1eb>; defm V_MAD_I16 : VOP3_F16_Real_vi <0x1ec>; defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>; defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>; defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>; defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">; defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">; defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">; defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">; defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">; defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">; defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">; defm V_ADD_I32_gfx9 : VOP3_Real_gfx9 <0x29c, "v_add_i32">; defm V_SUB_I32_gfx9 : VOP3_Real_gfx9 <0x29d, "v_sub_i32">; defm V_INTERP_P1_F32_e64 : VOP3Interp_Real_vi <0x270>; defm V_INTERP_P2_F32_e64 : VOP3Interp_Real_vi <0x271>; defm V_INTERP_MOV_F32_e64 : VOP3Interp_Real_vi <0x272>; defm V_INTERP_P1LL_F16 : VOP3Interp_Real_vi <0x274>; defm V_INTERP_P1LV_F16 : VOP3Interp_Real_vi <0x275>; defm V_ADD_F64 : VOP3_Real_vi <0x280>; defm V_MUL_F64 : VOP3_Real_vi <0x281>; defm V_MIN_F64 : VOP3_Real_vi <0x282>; defm V_MAX_F64 : VOP3_Real_vi <0x283>; defm V_LDEXP_F64 : VOP3_Real_vi <0x284>; defm V_MUL_LO_U32 : VOP3_Real_vi <0x285>; // removed from VI as identical to V_MUL_LO_U32 let isAsmParserOnly = 1 in { defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>; } defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>; defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>; defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>; defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>; defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>; defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>; defm V_LSHL_ADD_U32 : VOP3_Real_vi <0x1fd>; defm V_ADD_LSHL_U32 : VOP3_Real_vi <0x1fe>; defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>; defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>; defm V_AND_OR_B32 : VOP3_Real_vi <0x201>; defm V_OR3_B32 : VOP3_Real_vi <0x202>; defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx9 <0x2a0>; defm V_XAD_U32 : VOP3_Real_vi <0x1f3>; defm V_MIN3_F16 : VOP3OpSel_Real_gfx9 <0x1f4>; defm V_MIN3_I16 : VOP3OpSel_Real_gfx9 <0x1f5>; defm V_MIN3_U16 : VOP3OpSel_Real_gfx9 <0x1f6>; defm V_MAX3_F16 : VOP3OpSel_Real_gfx9 <0x1f7>; defm V_MAX3_I16 : VOP3OpSel_Real_gfx9 <0x1f8>; defm V_MAX3_U16 : VOP3OpSel_Real_gfx9 <0x1f9>; defm V_MED3_F16 : VOP3OpSel_Real_gfx9 <0x1fa>; defm V_MED3_I16 : VOP3OpSel_Real_gfx9 <0x1fb>; defm V_MED3_U16 : VOP3OpSel_Real_gfx9 <0x1fc>; defm V_ADD_I16 : VOP3OpSel_Real_gfx9 <0x29e>; defm V_SUB_I16 : VOP3OpSel_Real_gfx9 <0x29f>; defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx9 <0x1f1>; defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>; defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>; defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; Index: vendor/llvm/dist-release_70/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp 
=================================================================== --- vendor/llvm/dist-release_70/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (revision 337631) @@ -1,3724 +1,3772 @@ //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines an instruction selector for the NVPTX target. // //===----------------------------------------------------------------------===// #include "NVPTXISelDAGToDAG.h" #include "NVPTXUtilities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" using namespace llvm; #define DEBUG_TYPE "nvptx-isel" /// createNVPTXISelDag - This pass converts a legalized DAG into a /// NVPTX-specific DAG, ready for instruction scheduling. FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel) { return new NVPTXDAGToDAGISel(TM, OptLevel); } NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), TM(tm) { doMulWide = (OptLevel > 0); } bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &static_cast(MF.getSubtarget()); return SelectionDAGISel::runOnMachineFunction(MF); } int NVPTXDAGToDAGISel::getDivF32Level() const { return Subtarget->getTargetLowering()->getDivF32Level(); } bool NVPTXDAGToDAGISel::usePrecSqrtF32() const { return Subtarget->getTargetLowering()->usePrecSqrtF32(); } bool NVPTXDAGToDAGISel::useF32FTZ() const { return Subtarget->getTargetLowering()->useF32FTZ(*MF); } bool NVPTXDAGToDAGISel::allowFMA() const { const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); return TL->allowFMA(*MF, OptLevel); } bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); return TL->allowUnsafeFPMath(*MF); } bool NVPTXDAGToDAGISel::useShortPointers() const { return TM.useShortPointers(); } /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. void NVPTXDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) { N->setNodeId(-1); return; // Already selected. 
} switch (N->getOpcode()) { case ISD::LOAD: + case ISD::ATOMIC_LOAD: if (tryLoad(N)) return; break; case ISD::STORE: + case ISD::ATOMIC_STORE: if (tryStore(N)) return; break; case ISD::EXTRACT_VECTOR_ELT: if (tryEXTRACT_VECTOR_ELEMENT(N)) return; break; case NVPTXISD::SETP_F16X2: SelectSETP_F16X2(N); return; case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: if (tryLoadVector(N)) return; break; case NVPTXISD::LDGV2: case NVPTXISD::LDGV4: case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: if (tryLDGLDU(N)) return; break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: if (tryStoreVector(N)) return; break; case NVPTXISD::LoadParam: case NVPTXISD::LoadParamV2: case NVPTXISD::LoadParamV4: if (tryLoadParam(N)) return; break; case NVPTXISD::StoreRetval: case NVPTXISD::StoreRetvalV2: case NVPTXISD::StoreRetvalV4: if (tryStoreRetval(N)) return; break; case NVPTXISD::StoreParam: case NVPTXISD::StoreParamV2: case NVPTXISD::StoreParamV4: case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParamU32: if (tryStoreParam(N)) return; break; case ISD::INTRINSIC_WO_CHAIN: if (tryIntrinsicNoChain(N)) return; break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; break; case NVPTXISD::Tex1DFloatS32: case NVPTXISD::Tex1DFloatFloat: case NVPTXISD::Tex1DFloatFloatLevel: case NVPTXISD::Tex1DFloatFloatGrad: case NVPTXISD::Tex1DS32S32: case NVPTXISD::Tex1DS32Float: case NVPTXISD::Tex1DS32FloatLevel: case NVPTXISD::Tex1DS32FloatGrad: case NVPTXISD::Tex1DU32S32: case NVPTXISD::Tex1DU32Float: case NVPTXISD::Tex1DU32FloatLevel: case NVPTXISD::Tex1DU32FloatGrad: case NVPTXISD::Tex1DArrayFloatS32: case NVPTXISD::Tex1DArrayFloatFloat: case NVPTXISD::Tex1DArrayFloatFloatLevel: case NVPTXISD::Tex1DArrayFloatFloatGrad: case NVPTXISD::Tex1DArrayS32S32: case NVPTXISD::Tex1DArrayS32Float: case NVPTXISD::Tex1DArrayS32FloatLevel: case NVPTXISD::Tex1DArrayS32FloatGrad: case NVPTXISD::Tex1DArrayU32S32: case NVPTXISD::Tex1DArrayU32Float: case NVPTXISD::Tex1DArrayU32FloatLevel: case NVPTXISD::Tex1DArrayU32FloatGrad: case NVPTXISD::Tex2DFloatS32: case NVPTXISD::Tex2DFloatFloat: case NVPTXISD::Tex2DFloatFloatLevel: case NVPTXISD::Tex2DFloatFloatGrad: case NVPTXISD::Tex2DS32S32: case NVPTXISD::Tex2DS32Float: case NVPTXISD::Tex2DS32FloatLevel: case NVPTXISD::Tex2DS32FloatGrad: case NVPTXISD::Tex2DU32S32: case NVPTXISD::Tex2DU32Float: case NVPTXISD::Tex2DU32FloatLevel: case NVPTXISD::Tex2DU32FloatGrad: case NVPTXISD::Tex2DArrayFloatS32: case NVPTXISD::Tex2DArrayFloatFloat: case NVPTXISD::Tex2DArrayFloatFloatLevel: case NVPTXISD::Tex2DArrayFloatFloatGrad: case NVPTXISD::Tex2DArrayS32S32: case NVPTXISD::Tex2DArrayS32Float: case NVPTXISD::Tex2DArrayS32FloatLevel: case NVPTXISD::Tex2DArrayS32FloatGrad: case NVPTXISD::Tex2DArrayU32S32: case NVPTXISD::Tex2DArrayU32Float: case NVPTXISD::Tex2DArrayU32FloatLevel: case NVPTXISD::Tex2DArrayU32FloatGrad: case NVPTXISD::Tex3DFloatS32: case NVPTXISD::Tex3DFloatFloat: case NVPTXISD::Tex3DFloatFloatLevel: case NVPTXISD::Tex3DFloatFloatGrad: case NVPTXISD::Tex3DS32S32: case NVPTXISD::Tex3DS32Float: case NVPTXISD::Tex3DS32FloatLevel: case NVPTXISD::Tex3DS32FloatGrad: case NVPTXISD::Tex3DU32S32: case NVPTXISD::Tex3DU32Float: case NVPTXISD::Tex3DU32FloatLevel: case NVPTXISD::Tex3DU32FloatGrad: case NVPTXISD::TexCubeFloatFloat: case NVPTXISD::TexCubeFloatFloatLevel: case NVPTXISD::TexCubeS32Float: case NVPTXISD::TexCubeS32FloatLevel: case NVPTXISD::TexCubeU32Float: case NVPTXISD::TexCubeU32FloatLevel: case NVPTXISD::TexCubeArrayFloatFloat: case NVPTXISD::TexCubeArrayFloatFloatLevel: case 
NVPTXISD::TexCubeArrayS32Float: case NVPTXISD::TexCubeArrayS32FloatLevel: case NVPTXISD::TexCubeArrayU32Float: case NVPTXISD::TexCubeArrayU32FloatLevel: case NVPTXISD::Tld4R2DFloatFloat: case NVPTXISD::Tld4G2DFloatFloat: case NVPTXISD::Tld4B2DFloatFloat: case NVPTXISD::Tld4A2DFloatFloat: case NVPTXISD::Tld4R2DS64Float: case NVPTXISD::Tld4G2DS64Float: case NVPTXISD::Tld4B2DS64Float: case NVPTXISD::Tld4A2DS64Float: case NVPTXISD::Tld4R2DU64Float: case NVPTXISD::Tld4G2DU64Float: case NVPTXISD::Tld4B2DU64Float: case NVPTXISD::Tld4A2DU64Float: case NVPTXISD::TexUnified1DFloatS32: case NVPTXISD::TexUnified1DFloatFloat: case NVPTXISD::TexUnified1DFloatFloatLevel: case NVPTXISD::TexUnified1DFloatFloatGrad: case NVPTXISD::TexUnified1DS32S32: case NVPTXISD::TexUnified1DS32Float: case NVPTXISD::TexUnified1DS32FloatLevel: case NVPTXISD::TexUnified1DS32FloatGrad: case NVPTXISD::TexUnified1DU32S32: case NVPTXISD::TexUnified1DU32Float: case NVPTXISD::TexUnified1DU32FloatLevel: case NVPTXISD::TexUnified1DU32FloatGrad: case NVPTXISD::TexUnified1DArrayFloatS32: case NVPTXISD::TexUnified1DArrayFloatFloat: case NVPTXISD::TexUnified1DArrayFloatFloatLevel: case NVPTXISD::TexUnified1DArrayFloatFloatGrad: case NVPTXISD::TexUnified1DArrayS32S32: case NVPTXISD::TexUnified1DArrayS32Float: case NVPTXISD::TexUnified1DArrayS32FloatLevel: case NVPTXISD::TexUnified1DArrayS32FloatGrad: case NVPTXISD::TexUnified1DArrayU32S32: case NVPTXISD::TexUnified1DArrayU32Float: case NVPTXISD::TexUnified1DArrayU32FloatLevel: case NVPTXISD::TexUnified1DArrayU32FloatGrad: case NVPTXISD::TexUnified2DFloatS32: case NVPTXISD::TexUnified2DFloatFloat: case NVPTXISD::TexUnified2DFloatFloatLevel: case NVPTXISD::TexUnified2DFloatFloatGrad: case NVPTXISD::TexUnified2DS32S32: case NVPTXISD::TexUnified2DS32Float: case NVPTXISD::TexUnified2DS32FloatLevel: case NVPTXISD::TexUnified2DS32FloatGrad: case NVPTXISD::TexUnified2DU32S32: case NVPTXISD::TexUnified2DU32Float: case NVPTXISD::TexUnified2DU32FloatLevel: case NVPTXISD::TexUnified2DU32FloatGrad: case NVPTXISD::TexUnified2DArrayFloatS32: case NVPTXISD::TexUnified2DArrayFloatFloat: case NVPTXISD::TexUnified2DArrayFloatFloatLevel: case NVPTXISD::TexUnified2DArrayFloatFloatGrad: case NVPTXISD::TexUnified2DArrayS32S32: case NVPTXISD::TexUnified2DArrayS32Float: case NVPTXISD::TexUnified2DArrayS32FloatLevel: case NVPTXISD::TexUnified2DArrayS32FloatGrad: case NVPTXISD::TexUnified2DArrayU32S32: case NVPTXISD::TexUnified2DArrayU32Float: case NVPTXISD::TexUnified2DArrayU32FloatLevel: case NVPTXISD::TexUnified2DArrayU32FloatGrad: case NVPTXISD::TexUnified3DFloatS32: case NVPTXISD::TexUnified3DFloatFloat: case NVPTXISD::TexUnified3DFloatFloatLevel: case NVPTXISD::TexUnified3DFloatFloatGrad: case NVPTXISD::TexUnified3DS32S32: case NVPTXISD::TexUnified3DS32Float: case NVPTXISD::TexUnified3DS32FloatLevel: case NVPTXISD::TexUnified3DS32FloatGrad: case NVPTXISD::TexUnified3DU32S32: case NVPTXISD::TexUnified3DU32Float: case NVPTXISD::TexUnified3DU32FloatLevel: case NVPTXISD::TexUnified3DU32FloatGrad: case NVPTXISD::TexUnifiedCubeFloatFloat: case NVPTXISD::TexUnifiedCubeFloatFloatLevel: case NVPTXISD::TexUnifiedCubeS32Float: case NVPTXISD::TexUnifiedCubeS32FloatLevel: case NVPTXISD::TexUnifiedCubeU32Float: case NVPTXISD::TexUnifiedCubeU32FloatLevel: case NVPTXISD::TexUnifiedCubeArrayFloatFloat: case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: case NVPTXISD::TexUnifiedCubeArrayS32Float: case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: case NVPTXISD::TexUnifiedCubeArrayU32Float: case 
NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: case NVPTXISD::Tld4UnifiedR2DFloatFloat: case NVPTXISD::Tld4UnifiedG2DFloatFloat: case NVPTXISD::Tld4UnifiedB2DFloatFloat: case NVPTXISD::Tld4UnifiedA2DFloatFloat: case NVPTXISD::Tld4UnifiedR2DS64Float: case NVPTXISD::Tld4UnifiedG2DS64Float: case NVPTXISD::Tld4UnifiedB2DS64Float: case NVPTXISD::Tld4UnifiedA2DS64Float: case NVPTXISD::Tld4UnifiedR2DU64Float: case NVPTXISD::Tld4UnifiedG2DU64Float: case NVPTXISD::Tld4UnifiedB2DU64Float: case NVPTXISD::Tld4UnifiedA2DU64Float: if (tryTextureIntrinsic(N)) return; break; case NVPTXISD::Suld1DI8Clamp: case NVPTXISD::Suld1DI16Clamp: case NVPTXISD::Suld1DI32Clamp: case NVPTXISD::Suld1DI64Clamp: case NVPTXISD::Suld1DV2I8Clamp: case NVPTXISD::Suld1DV2I16Clamp: case NVPTXISD::Suld1DV2I32Clamp: case NVPTXISD::Suld1DV2I64Clamp: case NVPTXISD::Suld1DV4I8Clamp: case NVPTXISD::Suld1DV4I16Clamp: case NVPTXISD::Suld1DV4I32Clamp: case NVPTXISD::Suld1DArrayI8Clamp: case NVPTXISD::Suld1DArrayI16Clamp: case NVPTXISD::Suld1DArrayI32Clamp: case NVPTXISD::Suld1DArrayI64Clamp: case NVPTXISD::Suld1DArrayV2I8Clamp: case NVPTXISD::Suld1DArrayV2I16Clamp: case NVPTXISD::Suld1DArrayV2I32Clamp: case NVPTXISD::Suld1DArrayV2I64Clamp: case NVPTXISD::Suld1DArrayV4I8Clamp: case NVPTXISD::Suld1DArrayV4I16Clamp: case NVPTXISD::Suld1DArrayV4I32Clamp: case NVPTXISD::Suld2DI8Clamp: case NVPTXISD::Suld2DI16Clamp: case NVPTXISD::Suld2DI32Clamp: case NVPTXISD::Suld2DI64Clamp: case NVPTXISD::Suld2DV2I8Clamp: case NVPTXISD::Suld2DV2I16Clamp: case NVPTXISD::Suld2DV2I32Clamp: case NVPTXISD::Suld2DV2I64Clamp: case NVPTXISD::Suld2DV4I8Clamp: case NVPTXISD::Suld2DV4I16Clamp: case NVPTXISD::Suld2DV4I32Clamp: case NVPTXISD::Suld2DArrayI8Clamp: case NVPTXISD::Suld2DArrayI16Clamp: case NVPTXISD::Suld2DArrayI32Clamp: case NVPTXISD::Suld2DArrayI64Clamp: case NVPTXISD::Suld2DArrayV2I8Clamp: case NVPTXISD::Suld2DArrayV2I16Clamp: case NVPTXISD::Suld2DArrayV2I32Clamp: case NVPTXISD::Suld2DArrayV2I64Clamp: case NVPTXISD::Suld2DArrayV4I8Clamp: case NVPTXISD::Suld2DArrayV4I16Clamp: case NVPTXISD::Suld2DArrayV4I32Clamp: case NVPTXISD::Suld3DI8Clamp: case NVPTXISD::Suld3DI16Clamp: case NVPTXISD::Suld3DI32Clamp: case NVPTXISD::Suld3DI64Clamp: case NVPTXISD::Suld3DV2I8Clamp: case NVPTXISD::Suld3DV2I16Clamp: case NVPTXISD::Suld3DV2I32Clamp: case NVPTXISD::Suld3DV2I64Clamp: case NVPTXISD::Suld3DV4I8Clamp: case NVPTXISD::Suld3DV4I16Clamp: case NVPTXISD::Suld3DV4I32Clamp: case NVPTXISD::Suld1DI8Trap: case NVPTXISD::Suld1DI16Trap: case NVPTXISD::Suld1DI32Trap: case NVPTXISD::Suld1DI64Trap: case NVPTXISD::Suld1DV2I8Trap: case NVPTXISD::Suld1DV2I16Trap: case NVPTXISD::Suld1DV2I32Trap: case NVPTXISD::Suld1DV2I64Trap: case NVPTXISD::Suld1DV4I8Trap: case NVPTXISD::Suld1DV4I16Trap: case NVPTXISD::Suld1DV4I32Trap: case NVPTXISD::Suld1DArrayI8Trap: case NVPTXISD::Suld1DArrayI16Trap: case NVPTXISD::Suld1DArrayI32Trap: case NVPTXISD::Suld1DArrayI64Trap: case NVPTXISD::Suld1DArrayV2I8Trap: case NVPTXISD::Suld1DArrayV2I16Trap: case NVPTXISD::Suld1DArrayV2I32Trap: case NVPTXISD::Suld1DArrayV2I64Trap: case NVPTXISD::Suld1DArrayV4I8Trap: case NVPTXISD::Suld1DArrayV4I16Trap: case NVPTXISD::Suld1DArrayV4I32Trap: case NVPTXISD::Suld2DI8Trap: case NVPTXISD::Suld2DI16Trap: case NVPTXISD::Suld2DI32Trap: case NVPTXISD::Suld2DI64Trap: case NVPTXISD::Suld2DV2I8Trap: case NVPTXISD::Suld2DV2I16Trap: case NVPTXISD::Suld2DV2I32Trap: case NVPTXISD::Suld2DV2I64Trap: case NVPTXISD::Suld2DV4I8Trap: case NVPTXISD::Suld2DV4I16Trap: case NVPTXISD::Suld2DV4I32Trap: case NVPTXISD::Suld2DArrayI8Trap: case 
NVPTXISD::Suld2DArrayI16Trap: case NVPTXISD::Suld2DArrayI32Trap: case NVPTXISD::Suld2DArrayI64Trap: case NVPTXISD::Suld2DArrayV2I8Trap: case NVPTXISD::Suld2DArrayV2I16Trap: case NVPTXISD::Suld2DArrayV2I32Trap: case NVPTXISD::Suld2DArrayV2I64Trap: case NVPTXISD::Suld2DArrayV4I8Trap: case NVPTXISD::Suld2DArrayV4I16Trap: case NVPTXISD::Suld2DArrayV4I32Trap: case NVPTXISD::Suld3DI8Trap: case NVPTXISD::Suld3DI16Trap: case NVPTXISD::Suld3DI32Trap: case NVPTXISD::Suld3DI64Trap: case NVPTXISD::Suld3DV2I8Trap: case NVPTXISD::Suld3DV2I16Trap: case NVPTXISD::Suld3DV2I32Trap: case NVPTXISD::Suld3DV2I64Trap: case NVPTXISD::Suld3DV4I8Trap: case NVPTXISD::Suld3DV4I16Trap: case NVPTXISD::Suld3DV4I32Trap: case NVPTXISD::Suld1DI8Zero: case NVPTXISD::Suld1DI16Zero: case NVPTXISD::Suld1DI32Zero: case NVPTXISD::Suld1DI64Zero: case NVPTXISD::Suld1DV2I8Zero: case NVPTXISD::Suld1DV2I16Zero: case NVPTXISD::Suld1DV2I32Zero: case NVPTXISD::Suld1DV2I64Zero: case NVPTXISD::Suld1DV4I8Zero: case NVPTXISD::Suld1DV4I16Zero: case NVPTXISD::Suld1DV4I32Zero: case NVPTXISD::Suld1DArrayI8Zero: case NVPTXISD::Suld1DArrayI16Zero: case NVPTXISD::Suld1DArrayI32Zero: case NVPTXISD::Suld1DArrayI64Zero: case NVPTXISD::Suld1DArrayV2I8Zero: case NVPTXISD::Suld1DArrayV2I16Zero: case NVPTXISD::Suld1DArrayV2I32Zero: case NVPTXISD::Suld1DArrayV2I64Zero: case NVPTXISD::Suld1DArrayV4I8Zero: case NVPTXISD::Suld1DArrayV4I16Zero: case NVPTXISD::Suld1DArrayV4I32Zero: case NVPTXISD::Suld2DI8Zero: case NVPTXISD::Suld2DI16Zero: case NVPTXISD::Suld2DI32Zero: case NVPTXISD::Suld2DI64Zero: case NVPTXISD::Suld2DV2I8Zero: case NVPTXISD::Suld2DV2I16Zero: case NVPTXISD::Suld2DV2I32Zero: case NVPTXISD::Suld2DV2I64Zero: case NVPTXISD::Suld2DV4I8Zero: case NVPTXISD::Suld2DV4I16Zero: case NVPTXISD::Suld2DV4I32Zero: case NVPTXISD::Suld2DArrayI8Zero: case NVPTXISD::Suld2DArrayI16Zero: case NVPTXISD::Suld2DArrayI32Zero: case NVPTXISD::Suld2DArrayI64Zero: case NVPTXISD::Suld2DArrayV2I8Zero: case NVPTXISD::Suld2DArrayV2I16Zero: case NVPTXISD::Suld2DArrayV2I32Zero: case NVPTXISD::Suld2DArrayV2I64Zero: case NVPTXISD::Suld2DArrayV4I8Zero: case NVPTXISD::Suld2DArrayV4I16Zero: case NVPTXISD::Suld2DArrayV4I32Zero: case NVPTXISD::Suld3DI8Zero: case NVPTXISD::Suld3DI16Zero: case NVPTXISD::Suld3DI32Zero: case NVPTXISD::Suld3DI64Zero: case NVPTXISD::Suld3DV2I8Zero: case NVPTXISD::Suld3DV2I16Zero: case NVPTXISD::Suld3DV2I32Zero: case NVPTXISD::Suld3DV2I64Zero: case NVPTXISD::Suld3DV4I8Zero: case NVPTXISD::Suld3DV4I16Zero: case NVPTXISD::Suld3DV4I32Zero: if (trySurfaceIntrinsic(N)) return; break; case ISD::AND: case ISD::SRA: case ISD::SRL: // Try to select BFE if (tryBFE(N)) return; break; case ISD::ADDRSPACECAST: SelectAddrSpaceCast(N); return; case ISD::ConstantFP: if (tryConstantFP16(N)) return; break; default: break; } SelectCode(N); } bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { unsigned IID = cast(N->getOperand(1))->getZExtValue(); switch (IID) { default: return false; case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_p: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_p: return tryLDGLDU(N); } } // There's no way to specify FP16 immediates in .f16 ops, so we have to // load them into an .f16 register first. 
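Because .f16 operations accept no inline immediates, the selector below materializes every f16 constant through a register load of its raw encoding. A small sketch of that encoding at the bit level, assuming IEEE-754 binary16; halfBitsOfOne is purely illustrative, not an NVPTX API:

#include <cstdint>

// 1.0 in binary16: sign 0, biased exponent 01111 (15), mantissa 0 -> 0x3C00.
constexpr uint16_t halfBitsOfOne() {
  return (0u << 15) | (0x0Fu << 10) | 0x000u;
}

static_assert(halfBitsOfOne() == 0x3C00, "1.0 encodes as 0x3C00 in binary16");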
bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) { if (N->getValueType(0) != MVT::f16) return false; SDValue Val = CurDAG->getTargetConstantFP( cast(N)->getValueAPF(), SDLoc(N), MVT::f16); SDNode *LoadConstF16 = CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val); ReplaceNode(N, LoadConstF16); return true; } // Map ISD:CONDCODE value to appropriate CmpMode expected by // NVPTXInstPrinter::printCmpMode() static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) { using NVPTX::PTXCmpMode::CmpMode; unsigned PTXCmpMode = [](ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unexpected condition code."); case ISD::SETOEQ: return CmpMode::EQ; case ISD::SETOGT: return CmpMode::GT; case ISD::SETOGE: return CmpMode::GE; case ISD::SETOLT: return CmpMode::LT; case ISD::SETOLE: return CmpMode::LE; case ISD::SETONE: return CmpMode::NE; case ISD::SETO: return CmpMode::NUM; case ISD::SETUO: return CmpMode::NotANumber; case ISD::SETUEQ: return CmpMode::EQU; case ISD::SETUGT: return CmpMode::GTU; case ISD::SETUGE: return CmpMode::GEU; case ISD::SETULT: return CmpMode::LTU; case ISD::SETULE: return CmpMode::LEU; case ISD::SETUNE: return CmpMode::NEU; case ISD::SETEQ: return CmpMode::EQ; case ISD::SETGT: return CmpMode::GT; case ISD::SETGE: return CmpMode::GE; case ISD::SETLT: return CmpMode::LT; case ISD::SETLE: return CmpMode::LE; case ISD::SETNE: return CmpMode::NE; } }(CondCode.get()); if (FTZ) PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG; return PTXCmpMode; } bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) { unsigned PTXCmpMode = getPTXCmpMode(*cast(N->getOperand(2)), useF32FTZ()); SDLoc DL(N); SDNode *SetP = CurDAG->getMachineNode( NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0), N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32)); ReplaceNode(N, SetP); return true; } // Find all instances of extract_vector_elt that use this v2f16 vector // and coalesce them into a scattering move instruction. bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { SDValue Vector = N->getOperand(0); // We only care about f16x2 as it's the only real vector type we // need to deal with. if (Vector.getSimpleValueType() != MVT::v2f16) return false; // Find and record all uses of this vector that extract element 0 or 1. SmallVector E0, E1; for (const auto &U : Vector.getNode()->uses()) { if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT) continue; if (U->getOperand(0) != Vector) continue; if (const ConstantSDNode *IdxConst = dyn_cast(U->getOperand(1))) { if (IdxConst->getZExtValue() == 0) E0.push_back(U); else if (IdxConst->getZExtValue() == 1) E1.push_back(U); else llvm_unreachable("Invalid vector index."); } } // There's no point scattering f16x2 if we only ever access one // element of it. if (E0.empty() || E1.empty()) return false; unsigned Op = NVPTX::SplitF16x2; // If the vector has been BITCAST'ed from i32, we can use original // value directly and avoid register-to-register move. 
SDValue Source = Vector; if (Vector->getOpcode() == ISD::BITCAST) { Op = NVPTX::SplitI32toF16x2; Source = Vector->getOperand(0); } // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) // into f16,f16 SplitF16x2(V) SDNode *ScatterOp = CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source); for (auto *Node : E0) ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0)); for (auto *Node : E1) ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1)); return true; } static unsigned int getCodeAddrSpace(MemSDNode *N) { const Value *Src = N->getMemOperand()->getValue(); if (!Src) return NVPTX::PTXLdStInstCode::GENERIC; if (auto *PT = dyn_cast(Src->getType())) { switch (PT->getAddressSpace()) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT; default: break; } } return NVPTX::PTXLdStInstCode::GENERIC; } static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address // space. // // We have two ways of identifying invariant loads: Loads may be explicitly // marked as invariant, or we may infer them to be invariant. // // We currently infer invariance for loads from // - constant global variables, and // - kernel function pointer params that are noalias (i.e. __restrict) and // never written to. // // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally // not during the SelectionDAG phase). // // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for // explicitly invariant loads because these are how clang tells us to use ldg // when the user uses a builtin. if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) return false; if (N->isInvariant()) return true; bool IsKernelFn = isKernelFunction(F->getFunction()); // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly // because the former looks through phi nodes while the latter does not. We // need to look through phi nodes to handle pointer induction variables. 
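The invariance check above walks through phi nodes because a loop's pointer induction variable is itself a phi. A minimal source-level illustration of the pattern being handled; the function and names are hypothetical:

// The pointer 'q' becomes a phi node in the IR, so proving these loads
// invariant means walking through that phi back to the noalias (__restrict)
// parameter 'p', which is only ever read.
static float sumInvariant(const float *__restrict p, int n) {
  float s = 0.0f;
  for (const float *q = p; q != p + n; ++q)
    s += *q;
  return s;
}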
SmallVector Objs; GetUnderlyingObjects(const_cast(N->getMemOperand()->getValue()), Objs, F->getDataLayout()); return all_of(Objs, [&](Value *V) { if (auto *A = dyn_cast(V)) return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr(); if (auto *GV = dyn_cast(V)) return GV->isConstant(); return false; }); } bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { unsigned IID = cast(N->getOperand(0))->getZExtValue(); switch (IID) { default: return false; case Intrinsic::nvvm_texsurf_handle_internal: SelectTexSurfHandle(N); return true; } } void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { // Op 0 is the intrinsic ID SDValue Wrapper = N->getOperand(1); SDValue GlobalVal = Wrapper.getOperand(0); ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64, GlobalVal)); } void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { SDValue Src = N->getOperand(0); AddrSpaceCastSDNode *CastN = cast(N); unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); unsigned DstAddrSpace = CastN->getDestAddressSpace(); assert(SrcAddrSpace != DstAddrSpace && "addrspacecast must be between different address spaces"); if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { // Specific to generic unsigned Opc; switch (SrcAddrSpace) { default: report_fatal_error("Bad address space in addrspacecast"); case ADDRESS_SPACE_GLOBAL: Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes; break; case ADDRESS_SPACE_SHARED: Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432 : NVPTX::cvta_shared_yes_64) : NVPTX::cvta_shared_yes; break; case ADDRESS_SPACE_CONST: Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432 : NVPTX::cvta_const_yes_64) : NVPTX::cvta_const_yes; break; case ADDRESS_SPACE_LOCAL: Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432 : NVPTX::cvta_local_yes_64) : NVPTX::cvta_local_yes; break; } ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src)); return; } else { // Generic to specific if (SrcAddrSpace != 0) report_fatal_error("Cannot cast between two non-generic address spaces"); unsigned Opc; switch (DstAddrSpace) { default: report_fatal_error("Bad address space in addrspacecast"); case ADDRESS_SPACE_GLOBAL: Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64 : NVPTX::cvta_to_global_yes; break; case ADDRESS_SPACE_SHARED: Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264 : NVPTX::cvta_to_shared_yes_64) : NVPTX::cvta_to_shared_yes; break; case ADDRESS_SPACE_CONST: Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264 : NVPTX::cvta_to_const_yes_64) : NVPTX::cvta_to_const_yes; break; case ADDRESS_SPACE_LOCAL: Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264 : NVPTX::cvta_to_local_yes_64) : NVPTX::cvta_to_local_yes; break; case ADDRESS_SPACE_PARAM: Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 : NVPTX::nvvm_ptr_gen_to_param; break; } ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src)); return; } } // Helper function template to reduce amount of boilerplate code for // opcode selection. 
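// Usage sketch mirroring the first call site in tryLoad below: None is
// returned for type/opcode combinations that have no instruction.
//   Optional<unsigned> Opc = pickOpcodeForVT(
//       TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
//       NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
//       NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
//   if (!Opc)
//     return false;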
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32,
    Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:    return Opcode_i8;
  case MVT::i16:   return Opcode_i16;
  case MVT::i32:   return Opcode_i32;
  case MVT::i64:   return Opcode_i64;
  case MVT::f16:   return Opcode_f16;
  case MVT::v2f16: return Opcode_f16x2;
  case MVT::f32:   return Opcode_f32;
  case MVT::f64:   return Opcode_f64;
  default:         return None;
  }
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
-  LoadSDNode *LD = cast<LoadSDNode>(N);
+  MemSDNode *LD = cast<MemSDNode>(N);
+  assert(LD->readMem() && "Expected load");
+  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
-  if (LD->isIndexed())
+  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

+  AtomicOrdering Ordering = LD->getOrdering();
+  // In order to lower atomic loads with stronger guarantees we would need to
+  // use load.acquire or insert fences. However these features were only added
+  // with PTX ISA 6.0 / sm_70.
+  // TODO: Check if we can actually use the new instructions and implement them.
+  if (isStrongerThanMonotonic(Ordering))
+    return false;
+
  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool isVolatile = LD->isVolatile();
+  // - .volatile is only available for .global and .shared
+  // - .volatile has the same memory synchronization semantics as .relaxed.sys
+  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

-  if ((LD->getExtensionType() == ISD::SEXTLOAD))
+  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ?
NVPTX::PTXLdStInstCode::Untyped : NVPTX::PTXLdStInstCode::Float; else fromType = NVPTX::PTXLdStInstCode::Unsigned; // Create the machine instruction DAG SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue Addr; SDValue Offset, Base; Optional Opcode; MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; if (SelectDirectAddr(N1, Addr)) { Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar, NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar, NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Addr, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, NVPTX::LD_i32_asi, NVPTX::LD_i64_asi, NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi, NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { if (PointerSize == 64) Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64, NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64, NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64); else Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari, NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari, NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); } else { if (PointerSize == 64) Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64, NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64, NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64); else Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg, NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg, NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), N1, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); } if (!NVPTXLD) return false; MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); ReplaceNode(N, NVPTXLD); return true; } bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; Optional Opcode; SDLoc DL(N); SDNode *LD; MemSDNode *MemSD = cast(N); EVT LoadedVT = MemSD->getMemoryVT(); if (!LoadedVT.isSimple()) return 
false; // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Volatile Setting // - .volatile is only availalble for .global and .shared bool IsVolatile = MemSD->isVolatile(); if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) IsVolatile = false; // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the // type is integer // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float MVT ScalarVT = SimpleVT.getScalarType(); // Read at least 8 bits (predicates are stored as 8-bit values) unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits()); unsigned int FromType; // The last operand holds the original LoadSDNode::getExtensionType() value unsigned ExtensionType = cast( N->getOperand(N->getNumOperands() - 1))->getZExtValue(); if (ExtensionType == ISD::SEXTLOAD) FromType = NVPTX::PTXLdStInstCode::Signed; else if (ScalarVT.isFloatingPoint()) FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped : NVPTX::PTXLdStInstCode::Float; else FromType = NVPTX::PTXLdStInstCode::Unsigned; unsigned VecType; switch (N->getOpcode()) { case NVPTXISD::LoadV2: VecType = NVPTX::PTXLdStInstCode::V2; break; case NVPTXISD::LoadV4: VecType = NVPTX::PTXLdStInstCode::V4; break; default: return false; } EVT EltVT = N->getValueType(0); // v8f16 is a special case. PTX doesn't have ld.v8.f16 // instruction. Instead, we split the vector into v2f16 chunks and // load them with ld.v4.b32. if (EltVT == MVT::v2f16) { assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode."); EltVT = MVT::i32; FromType = NVPTX::PTXLdStInstCode::Untyped; FromTypeWidth = 32; } if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: return false; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar, NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar, NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar, NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar); break; case NVPTXISD::LoadV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar, NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None, NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar, NVPTX::LDV_f32_v4_avar, None); break; } if (!Opcode) return false; SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Addr, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? 
SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { switch (N->getOpcode()) { default: return false; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi, NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi, NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi, NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi); break; case NVPTXISD::LoadV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi, NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None, NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi, NVPTX::LDV_f32_v4_asi, None); break; } if (!Opcode) return false; SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64, NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64); break; case NVPTXISD::LoadV4: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64, NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None, NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64, NVPTX::LDV_f32_v4_ari_64, None); break; } } else { switch (N->getOpcode()) { default: return false; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari, NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari, NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari, NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari); break; case NVPTXISD::LoadV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari, NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None, NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari, NVPTX::LDV_f32_v4_ari, None); break; } } if (!Opcode) return false; SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); } else { if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64, NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64, NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64, NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64, NVPTX::LDV_f64_v2_areg_64); break; case NVPTXISD::LoadV4: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64, NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None, NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64, NVPTX::LDV_f32_v4_areg_64, None); break; } } else { switch (N->getOpcode()) { default: return false; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg, NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg, NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg, NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg, NVPTX::LDV_f64_v2_areg); break; case NVPTXISD::LoadV4: 
Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg, NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None, NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg, NVPTX::LDV_f32_v4_areg, None); break; } } if (!Opcode) return false; SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Op1, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); ReplaceNode(N, LD); return true; } bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Op1; MemSDNode *Mem; bool IsLDG = true; // If this is an LDG intrinsic, the address is the third operand. If its an // LDG/LDU SD node (from custom vector handling), then its the second operand if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { Op1 = N->getOperand(2); Mem = cast(N); unsigned IID = cast(N->getOperand(1))->getZExtValue(); switch (IID) { default: return false; case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_p: IsLDG = true; break; case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_p: IsLDG = false; break; } } else { Op1 = N->getOperand(1); Mem = cast(N); } Optional Opcode; SDLoc DL(N); SDNode *LD; SDValue Base, Offset, Addr; EVT EltVT = Mem->getMemoryVT(); unsigned NumElts = 1; if (EltVT.isVector()) { NumElts = EltVT.getVectorNumElements(); EltVT = EltVT.getVectorElementType(); // vectors of f16 are loaded/stored as multiples of v2f16 elements. if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) { assert(NumElts % 2 == 0 && "Vector must have even number of elements"); EltVT = MVT::v2f16; NumElts /= 2; } } // Build the "promoted" result VTList for the load. If we are really loading // i8s, then the return type will be promoted to i16 since we do not expose // 8-bit registers in NVPTX. EVT NodeVT = (EltVT == MVT::i8) ? 
MVT::i16 : EltVT; SmallVector InstVTs; for (unsigned i = 0; i != NumElts; ++i) { InstVTs.push_back(NodeVT); } InstVTs.push_back(MVT::Other); SDVTList InstVTList = CurDAG->getVTList(InstVTs); if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar, NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar, NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f16avar, NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar, NVPTX::INT_PTX_LDG_GLOBAL_f64avar); else Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar, NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar, NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f16avar, NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar, NVPTX::INT_PTX_LDU_GLOBAL_f64avar); break; case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar, NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar, NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar, NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar, NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar, NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar, NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar, NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar); break; case NVPTXISD::LDUV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar, NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar, NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar, NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar, NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar, NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar, NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar, NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar); break; case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar, NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar, NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None, NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar, NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar, NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None); break; case NVPTXISD::LDUV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar, NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar, NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None, NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar, NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar, NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None); break; } if (!Opcode) return false; SDValue Ops[] = { Addr, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops); } else if (TM.is64Bit() ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (TM.is64Bit()) { switch (N->getOpcode()) { default: return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8ari64, NVPTX::INT_PTX_LDG_GLOBAL_i16ari64, NVPTX::INT_PTX_LDG_GLOBAL_i32ari64, NVPTX::INT_PTX_LDG_GLOBAL_i64ari64, NVPTX::INT_PTX_LDG_GLOBAL_f16ari64, NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64, NVPTX::INT_PTX_LDG_GLOBAL_f32ari64, NVPTX::INT_PTX_LDG_GLOBAL_f64ari64); else Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8ari64, NVPTX::INT_PTX_LDU_GLOBAL_i16ari64, NVPTX::INT_PTX_LDU_GLOBAL_i32ari64, NVPTX::INT_PTX_LDU_GLOBAL_i64ari64, NVPTX::INT_PTX_LDU_GLOBAL_f16ari64, NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64, NVPTX::INT_PTX_LDU_GLOBAL_f32ari64, NVPTX::INT_PTX_LDU_GLOBAL_f64ari64); break; case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64); break; case NVPTXISD::LDUV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64); break; case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64, NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64, NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None, NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64, NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64, NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None); break; case NVPTXISD::LDUV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64, NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64, NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None, NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64, NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64, NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None); break; } } else { switch (N->getOpcode()) { default: return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8ari, NVPTX::INT_PTX_LDG_GLOBAL_i16ari, NVPTX::INT_PTX_LDG_GLOBAL_i32ari, NVPTX::INT_PTX_LDG_GLOBAL_i64ari, NVPTX::INT_PTX_LDG_GLOBAL_f16ari, NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari, NVPTX::INT_PTX_LDG_GLOBAL_f32ari, NVPTX::INT_PTX_LDG_GLOBAL_f64ari); else Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8ari, NVPTX::INT_PTX_LDU_GLOBAL_i16ari, NVPTX::INT_PTX_LDU_GLOBAL_i32ari, NVPTX::INT_PTX_LDU_GLOBAL_i64ari, NVPTX::INT_PTX_LDU_GLOBAL_f16ari, NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari, NVPTX::INT_PTX_LDU_GLOBAL_f32ari, NVPTX::INT_PTX_LDU_GLOBAL_f64ari); break; case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32, 
NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32); break; case NVPTXISD::LDUV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32); break; case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32, NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32, NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None, NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32, NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32, NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None); break; case NVPTXISD::LDUV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32, NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32, NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None, NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32, NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32, NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None); break; } } if (!Opcode) return false; SDValue Ops[] = {Base, Offset, Chain}; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops); } else { if (TM.is64Bit()) { switch (N->getOpcode()) { default: return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8areg64, NVPTX::INT_PTX_LDG_GLOBAL_i16areg64, NVPTX::INT_PTX_LDG_GLOBAL_i32areg64, NVPTX::INT_PTX_LDG_GLOBAL_i64areg64, NVPTX::INT_PTX_LDG_GLOBAL_f16areg64, NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64, NVPTX::INT_PTX_LDG_GLOBAL_f32areg64, NVPTX::INT_PTX_LDG_GLOBAL_f64areg64); else Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8areg64, NVPTX::INT_PTX_LDU_GLOBAL_i16areg64, NVPTX::INT_PTX_LDU_GLOBAL_i32areg64, NVPTX::INT_PTX_LDU_GLOBAL_i64areg64, NVPTX::INT_PTX_LDU_GLOBAL_f16areg64, NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64, NVPTX::INT_PTX_LDU_GLOBAL_f32areg64, NVPTX::INT_PTX_LDU_GLOBAL_f64areg64); break; case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64); break; case NVPTXISD::LDUV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64); break; case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64, NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64, NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None, NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64, NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64, NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None); break; case NVPTXISD::LDUV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64, NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64, NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None, NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64, 
NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64, NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None); break; } } else { switch (N->getOpcode()) { default: return false; case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8areg, NVPTX::INT_PTX_LDG_GLOBAL_i16areg, NVPTX::INT_PTX_LDG_GLOBAL_i32areg, NVPTX::INT_PTX_LDG_GLOBAL_i64areg, NVPTX::INT_PTX_LDG_GLOBAL_f16areg, NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg, NVPTX::INT_PTX_LDG_GLOBAL_f32areg, NVPTX::INT_PTX_LDG_GLOBAL_f64areg); else Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8areg, NVPTX::INT_PTX_LDU_GLOBAL_i16areg, NVPTX::INT_PTX_LDU_GLOBAL_i32areg, NVPTX::INT_PTX_LDU_GLOBAL_i64areg, NVPTX::INT_PTX_LDU_GLOBAL_f16areg, NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg, NVPTX::INT_PTX_LDU_GLOBAL_f32areg, NVPTX::INT_PTX_LDU_GLOBAL_f64areg); break; case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32); break; case NVPTXISD::LDUV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32); break; case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32, NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32, NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None, NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32, NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32, NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None); break; case NVPTXISD::LDUV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32, NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32, NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None, NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32, NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32, NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None); break; } } if (!Opcode) return false; SDValue Ops[] = { Op1, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops); } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = Mem->getMemOperand(); cast(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); // For automatic generation of LDG (through SelectLoad[Vector], not the // intrinsics), we may have an extending load like: // // i32,ch = load t0, t7, undef:i64 // // In this case, the matching logic above will select a load for the original // memory type (in this case, i8) and our types will not match (the node needs // to return an i32 in this case). Our LDG/LDU nodes do not support the // concept of sign-/zero-extension, so emulate it here by adding an explicit // CVT instruction. Ptxas should clean up any redundancies here. EVT OrigType = N->getValueType(0); LoadSDNode *LdNode = dyn_cast(N); if (OrigType != EltVT && LdNode) { // We have an extending-load. The instruction we selected operates on the // smaller type, but the SDNode we are replacing has the larger type. We // need to emit a CVT to make the types match. 
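    // For example (illustrative): a zero-extending i8 load whose users expect
    // an i32 is selected as an 8-bit LDG; the loop below then wraps each
    // result in a CVT (mode NONE) to OrigType so every user reads the widened
    // value, and ptxas is expected to fold away any redundant converts.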
    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
                                       EltVT.getSimpleVT(), IsSigned);

    // For each output value, apply the manual sign/zero-extension and make
    // sure all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode = CurDAG->getMachineNode(
          CvtOpc, DL, OrigType, Res,
          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
-  StoreSDNode *ST = cast<StoreSDNode>(N);
+  MemSDNode *ST = cast<MemSDNode>(N);
+  assert(ST->writeMem() && "Expected store");
+  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
+  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
-  if (ST->isIndexed())
+  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

+  AtomicOrdering Ordering = ST->getOrdering();
+  // In order to lower atomic stores with stronger guarantees we would need to
+  // use store.release or insert fences. However these features were only added
+  // with PTX ISA 6.0 / sm_70.
+  // TODO: Check if we can actually use the new instructions and implement them.
+  if (isStrongerThanMonotonic(Ordering))
+    return false;
+
  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool isVolatile = ST->isVolatile();
+  // - .volatile is only available for .global and .shared
+  // - .volatile has the same memory synchronization semantics as .relaxed.sys
+  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  //  - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    toType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
-  SDValue Chain = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDValue N2 = N->getOperand(2);
+  SDValue Chain = ST->getChain();
+  SDValue Value = PlainStore ?
PlainStore->getValue() : AtomicStore->getVal(); + SDValue BasePtr = ST->getBasePtr(); SDValue Addr; SDValue Offset, Base; Optional Opcode; - MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy; + MVT::SimpleValueType SourceVT = + Value.getNode()->getSimpleValueType(0).SimpleTy; - if (SelectDirectAddr(N2, Addr)) { + if (SelectDirectAddr(BasePtr, Addr)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar, NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr, - Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + Addr, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); - } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) - : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + } else if (PointerSize == 64 + ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) + : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi, NVPTX::ST_i32_asi, NVPTX::ST_i64_asi, NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi, NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base, - Offset, Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); - } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset) - : SelectADDRri(N2.getNode(), N2, Base, Offset)) { + } else if (PointerSize == 64 + ? 
SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) + : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { if (PointerSize == 64) Opcode = pickOpcodeForVT( SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64, NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64, NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64); else Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari, NVPTX::ST_i32_ari, NVPTX::ST_i64_ari, NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari, NVPTX::ST_f32_ari, NVPTX::ST_f64_ari); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base, - Offset, Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); } else { if (PointerSize == 64) Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64, NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64, NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64, NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64); else Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg, NVPTX::ST_i32_areg, NVPTX::ST_i64_areg, NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg, NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); if (!Opcode) return false; - SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), - getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2, - Chain }; + SDValue Ops[] = {Value, + getI32Imm(isVolatile, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(toType, dl), + getI32Imm(toTypeWidth, dl), + BasePtr, + Chain}; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); } if (!NVPTXST) return false; MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); ReplaceNode(N, NVPTXST); return true; } bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; Optional Opcode; SDLoc DL(N); SDNode *ST; EVT EltVT = Op1.getValueType(); MemSDNode *MemSD = cast(N); EVT StoreVT = MemSD->getMemoryVT(); // Address Space Setting unsigned CodeAddrSpace = getCodeAddrSpace(MemSD); if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); } unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Volatile Setting // - .volatile is only availalble for .global and .shared bool IsVolatile = MemSD->isVolatile(); if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) IsVolatile = false; // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' assert(StoreVT.isSimple() && "Store value is not simple"); MVT ScalarVT = StoreVT.getSimpleVT().getScalarType(); unsigned ToTypeWidth = ScalarVT.getSizeInBits(); unsigned ToType; if (ScalarVT.isFloatingPoint()) ToType = ScalarVT.SimpleTy == MVT::f16 ? 
NVPTX::PTXLdStInstCode::Untyped : NVPTX::PTXLdStInstCode::Float; else ToType = NVPTX::PTXLdStInstCode::Unsigned; SmallVector StOps; SDValue N2; unsigned VecType; switch (N->getOpcode()) { case NVPTXISD::StoreV2: VecType = NVPTX::PTXLdStInstCode::V2; StOps.push_back(N->getOperand(1)); StOps.push_back(N->getOperand(2)); N2 = N->getOperand(3); break; case NVPTXISD::StoreV4: VecType = NVPTX::PTXLdStInstCode::V4; StOps.push_back(N->getOperand(1)); StOps.push_back(N->getOperand(2)); StOps.push_back(N->getOperand(3)); StOps.push_back(N->getOperand(4)); N2 = N->getOperand(5); break; default: return false; } // v8f16 is a special case. PTX doesn't have st.v8.f16 // instruction. Instead, we split the vector into v2f16 chunks and // store them with st.v4.b32. if (EltVT == MVT::v2f16) { assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode."); EltVT = MVT::i32; ToType = NVPTX::PTXLdStInstCode::Untyped; ToTypeWidth = 32; } StOps.push_back(getI32Imm(IsVolatile, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); StOps.push_back(getI32Imm(ToType, DL)); StOps.push_back(getI32Imm(ToTypeWidth, DL)); if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar, NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar, NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar, NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar); break; case NVPTXISD::StoreV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None, NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar, NVPTX::STV_f32_v4_avar, None); break; } StOps.push_back(Addr); } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi, NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi, NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi, NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi); break; case NVPTXISD::StoreV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi, NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None, NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi, NVPTX::STV_f32_v4_asi, None); break; } StOps.push_back(Base); StOps.push_back(Offset); } else if (PointerSize == 64 ? 
SelectADDRri64(N2.getNode(), N2, Base, Offset) : SelectADDRri(N2.getNode(), N2, Base, Offset)) { if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreV2: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64, NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64); break; case NVPTXISD::StoreV4: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64, NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None, NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64, NVPTX::STV_f32_v4_ari_64, None); break; } } else { switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari, NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari, NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari, NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari); break; case NVPTXISD::StoreV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None, NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari, NVPTX::STV_f32_v4_ari, None); break; } } StOps.push_back(Base); StOps.push_back(Offset); } else { if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreV2: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64, NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64, NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64, NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64, NVPTX::STV_f64_v2_areg_64); break; case NVPTXISD::StoreV4: Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64, NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None, NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64, NVPTX::STV_f32_v4_areg_64, None); break; } } else { switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreV2: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg, NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg, NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg, NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg, NVPTX::STV_f64_v2_areg); break; case NVPTXISD::StoreV4: Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg, NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None, NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg, NVPTX::STV_f32_v4_areg, None); break; } } StOps.push_back(N2); } if (!Opcode) return false; StOps.push_back(Chain); ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps); MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(ST)->setMemRefs(MemRefs0, MemRefs0 + 1); ReplaceNode(N, ST); return true; } bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { SDValue Chain = Node->getOperand(0); SDValue Offset = Node->getOperand(2); SDValue Flag = Node->getOperand(3); SDLoc DL(Node); MemSDNode *Mem = cast(Node); unsigned VecSize; switch (Node->getOpcode()) { default: return false; case NVPTXISD::LoadParam: VecSize = 1; break; case NVPTXISD::LoadParamV2: VecSize = 2; break; case NVPTXISD::LoadParamV4: VecSize = 4; break; } EVT EltVT = Node->getValueType(0); EVT MemVT = Mem->getMemoryVT(); Optional Opcode; switch (VecSize) { default: return false; case 1: Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemI8, 
NVPTX::LoadParamMemI16, NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64, NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2, NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64); break; case 2: Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16, NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32, NVPTX::LoadParamMemV2F64); break; case 4: Opcode = pickOpcodeForVT( MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None, NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2, NVPTX::LoadParamMemV4F32, None); break; } if (!Opcode) return false; SDVTList VTs; if (VecSize == 1) { VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); } else if (VecSize == 2) { VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); } else { EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; VTs = CurDAG->getVTList(EVTs); } unsigned OffsetVal = cast(Offset)->getZExtValue(); SmallVector Ops; Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); Ops.push_back(Chain); Ops.push_back(Flag); ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops)); return true; } bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Offset = N->getOperand(1); unsigned OffsetVal = cast(Offset)->getZExtValue(); MemSDNode *Mem = cast(N); // How many elements do we have? unsigned NumElts = 1; switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreRetval: NumElts = 1; break; case NVPTXISD::StoreRetvalV2: NumElts = 2; break; case NVPTXISD::StoreRetvalV4: NumElts = 4; break; } // Build vector of operands SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 2)); Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); Ops.push_back(Chain); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in // NVPTXISelLowering will have already emitted an upcast. 
Optional Opcode = 0; switch (NumElts) { default: return false; case 1: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2, NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16, NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64, NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2, NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64); break; case 4: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16, NVPTX::StoreRetvalV4I32, None, NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2, NVPTX::StoreRetvalV4F32, None); break; } if (!Opcode) return false; SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops); MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); ReplaceNode(N, Ret); return true; } bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Param = N->getOperand(1); unsigned ParamVal = cast(Param)->getZExtValue(); SDValue Offset = N->getOperand(2); unsigned OffsetVal = cast(Offset)->getZExtValue(); MemSDNode *Mem = cast(N); SDValue Flag = N->getOperand(N->getNumOperands() - 1); // How many elements do we have? unsigned NumElts = 1; switch (N->getOpcode()) { default: return false; case NVPTXISD::StoreParamU32: case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParam: NumElts = 1; break; case NVPTXISD::StoreParamV2: NumElts = 2; break; case NVPTXISD::StoreParamV4: NumElts = 4; break; } // Build vector of operands SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 3)); Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32)); Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); Ops.push_back(Chain); Ops.push_back(Flag); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in // NVPTXISelLowering will have already emitted an upcast. Optional Opcode = 0; switch (N->getOpcode()) { default: switch (NumElts) { default: return false; case 1: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreParamI8, NVPTX::StoreParamI16, NVPTX::StoreParamI32, NVPTX::StoreParamI64, NVPTX::StoreParamF16, NVPTX::StoreParamF16x2, NVPTX::StoreParamF32, NVPTX::StoreParamF64); break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16, NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64, NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2, NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64); break; case 4: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16, NVPTX::StoreParamV4I32, None, NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2, NVPTX::StoreParamV4F32, None); break; } if (!Opcode) return false; break; // Special case: if we have a sign-extend/zero-extend node, insert the // conversion instruction first, and use that as the value operand to // the selected StoreParam node. 
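  // Illustrative flow for the unsigned case handled just below: an i16 value v
  // stored via StoreParamU32 becomes
  //   t = CVT_u32_u16 v, NONE
  //   StoreParamI32 t, param, offset
  // so the parameter slot always receives a full 32-bit value.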
case NVPTXISD::StoreParamU32: { Opcode = NVPTX::StoreParamI32; SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, MVT::i32, Ops[0], CvtNone); Ops[0] = SDValue(Cvt, 0); break; } case NVPTXISD::StoreParamS32: { Opcode = NVPTX::StoreParamI32; SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, MVT::i32, Ops[0], CvtNone); Ops[0] = SDValue(Cvt, 0); break; } } SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops); MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); ReplaceNode(N, Ret); return true; } bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { unsigned Opc = 0; switch (N->getOpcode()) { default: return false; case NVPTXISD::Tex1DFloatS32: Opc = NVPTX::TEX_1D_F32_S32; break; case NVPTXISD::Tex1DFloatFloat: Opc = NVPTX::TEX_1D_F32_F32; break; case NVPTXISD::Tex1DFloatFloatLevel: Opc = NVPTX::TEX_1D_F32_F32_LEVEL; break; case NVPTXISD::Tex1DFloatFloatGrad: Opc = NVPTX::TEX_1D_F32_F32_GRAD; break; case NVPTXISD::Tex1DS32S32: Opc = NVPTX::TEX_1D_S32_S32; break; case NVPTXISD::Tex1DS32Float: Opc = NVPTX::TEX_1D_S32_F32; break; case NVPTXISD::Tex1DS32FloatLevel: Opc = NVPTX::TEX_1D_S32_F32_LEVEL; break; case NVPTXISD::Tex1DS32FloatGrad: Opc = NVPTX::TEX_1D_S32_F32_GRAD; break; case NVPTXISD::Tex1DU32S32: Opc = NVPTX::TEX_1D_U32_S32; break; case NVPTXISD::Tex1DU32Float: Opc = NVPTX::TEX_1D_U32_F32; break; case NVPTXISD::Tex1DU32FloatLevel: Opc = NVPTX::TEX_1D_U32_F32_LEVEL; break; case NVPTXISD::Tex1DU32FloatGrad: Opc = NVPTX::TEX_1D_U32_F32_GRAD; break; case NVPTXISD::Tex1DArrayFloatS32: Opc = NVPTX::TEX_1D_ARRAY_F32_S32; break; case NVPTXISD::Tex1DArrayFloatFloat: Opc = NVPTX::TEX_1D_ARRAY_F32_F32; break; case NVPTXISD::Tex1DArrayFloatFloatLevel: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL; break; case NVPTXISD::Tex1DArrayFloatFloatGrad: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD; break; case NVPTXISD::Tex1DArrayS32S32: Opc = NVPTX::TEX_1D_ARRAY_S32_S32; break; case NVPTXISD::Tex1DArrayS32Float: Opc = NVPTX::TEX_1D_ARRAY_S32_F32; break; case NVPTXISD::Tex1DArrayS32FloatLevel: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL; break; case NVPTXISD::Tex1DArrayS32FloatGrad: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD; break; case NVPTXISD::Tex1DArrayU32S32: Opc = NVPTX::TEX_1D_ARRAY_U32_S32; break; case NVPTXISD::Tex1DArrayU32Float: Opc = NVPTX::TEX_1D_ARRAY_U32_F32; break; case NVPTXISD::Tex1DArrayU32FloatLevel: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL; break; case NVPTXISD::Tex1DArrayU32FloatGrad: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD; break; case NVPTXISD::Tex2DFloatS32: Opc = NVPTX::TEX_2D_F32_S32; break; case NVPTXISD::Tex2DFloatFloat: Opc = NVPTX::TEX_2D_F32_F32; break; case NVPTXISD::Tex2DFloatFloatLevel: Opc = NVPTX::TEX_2D_F32_F32_LEVEL; break; case NVPTXISD::Tex2DFloatFloatGrad: Opc = NVPTX::TEX_2D_F32_F32_GRAD; break; case NVPTXISD::Tex2DS32S32: Opc = NVPTX::TEX_2D_S32_S32; break; case NVPTXISD::Tex2DS32Float: Opc = NVPTX::TEX_2D_S32_F32; break; case NVPTXISD::Tex2DS32FloatLevel: Opc = NVPTX::TEX_2D_S32_F32_LEVEL; break; case NVPTXISD::Tex2DS32FloatGrad: Opc = NVPTX::TEX_2D_S32_F32_GRAD; break; case NVPTXISD::Tex2DU32S32: Opc = NVPTX::TEX_2D_U32_S32; break; case NVPTXISD::Tex2DU32Float: Opc = NVPTX::TEX_2D_U32_F32; break; case 
NVPTXISD::Tex2DU32FloatLevel: Opc = NVPTX::TEX_2D_U32_F32_LEVEL; break; case NVPTXISD::Tex2DU32FloatGrad: Opc = NVPTX::TEX_2D_U32_F32_GRAD; break; case NVPTXISD::Tex2DArrayFloatS32: Opc = NVPTX::TEX_2D_ARRAY_F32_S32; break; case NVPTXISD::Tex2DArrayFloatFloat: Opc = NVPTX::TEX_2D_ARRAY_F32_F32; break; case NVPTXISD::Tex2DArrayFloatFloatLevel: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL; break; case NVPTXISD::Tex2DArrayFloatFloatGrad: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD; break; case NVPTXISD::Tex2DArrayS32S32: Opc = NVPTX::TEX_2D_ARRAY_S32_S32; break; case NVPTXISD::Tex2DArrayS32Float: Opc = NVPTX::TEX_2D_ARRAY_S32_F32; break; case NVPTXISD::Tex2DArrayS32FloatLevel: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL; break; case NVPTXISD::Tex2DArrayS32FloatGrad: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD; break; case NVPTXISD::Tex2DArrayU32S32: Opc = NVPTX::TEX_2D_ARRAY_U32_S32; break; case NVPTXISD::Tex2DArrayU32Float: Opc = NVPTX::TEX_2D_ARRAY_U32_F32; break; case NVPTXISD::Tex2DArrayU32FloatLevel: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL; break; case NVPTXISD::Tex2DArrayU32FloatGrad: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD; break; case NVPTXISD::Tex3DFloatS32: Opc = NVPTX::TEX_3D_F32_S32; break; case NVPTXISD::Tex3DFloatFloat: Opc = NVPTX::TEX_3D_F32_F32; break; case NVPTXISD::Tex3DFloatFloatLevel: Opc = NVPTX::TEX_3D_F32_F32_LEVEL; break; case NVPTXISD::Tex3DFloatFloatGrad: Opc = NVPTX::TEX_3D_F32_F32_GRAD; break; case NVPTXISD::Tex3DS32S32: Opc = NVPTX::TEX_3D_S32_S32; break; case NVPTXISD::Tex3DS32Float: Opc = NVPTX::TEX_3D_S32_F32; break; case NVPTXISD::Tex3DS32FloatLevel: Opc = NVPTX::TEX_3D_S32_F32_LEVEL; break; case NVPTXISD::Tex3DS32FloatGrad: Opc = NVPTX::TEX_3D_S32_F32_GRAD; break; case NVPTXISD::Tex3DU32S32: Opc = NVPTX::TEX_3D_U32_S32; break; case NVPTXISD::Tex3DU32Float: Opc = NVPTX::TEX_3D_U32_F32; break; case NVPTXISD::Tex3DU32FloatLevel: Opc = NVPTX::TEX_3D_U32_F32_LEVEL; break; case NVPTXISD::Tex3DU32FloatGrad: Opc = NVPTX::TEX_3D_U32_F32_GRAD; break; case NVPTXISD::TexCubeFloatFloat: Opc = NVPTX::TEX_CUBE_F32_F32; break; case NVPTXISD::TexCubeFloatFloatLevel: Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL; break; case NVPTXISD::TexCubeS32Float: Opc = NVPTX::TEX_CUBE_S32_F32; break; case NVPTXISD::TexCubeS32FloatLevel: Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL; break; case NVPTXISD::TexCubeU32Float: Opc = NVPTX::TEX_CUBE_U32_F32; break; case NVPTXISD::TexCubeU32FloatLevel: Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL; break; case NVPTXISD::TexCubeArrayFloatFloat: Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32; break; case NVPTXISD::TexCubeArrayFloatFloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL; break; case NVPTXISD::TexCubeArrayS32Float: Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32; break; case NVPTXISD::TexCubeArrayS32FloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL; break; case NVPTXISD::TexCubeArrayU32Float: Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32; break; case NVPTXISD::TexCubeArrayU32FloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL; break; case NVPTXISD::Tld4R2DFloatFloat: Opc = NVPTX::TLD4_R_2D_F32_F32; break; case NVPTXISD::Tld4G2DFloatFloat: Opc = NVPTX::TLD4_G_2D_F32_F32; break; case NVPTXISD::Tld4B2DFloatFloat: Opc = NVPTX::TLD4_B_2D_F32_F32; break; case NVPTXISD::Tld4A2DFloatFloat: Opc = NVPTX::TLD4_A_2D_F32_F32; break; case NVPTXISD::Tld4R2DS64Float: Opc = NVPTX::TLD4_R_2D_S32_F32; break; case NVPTXISD::Tld4G2DS64Float: Opc = NVPTX::TLD4_G_2D_S32_F32; break; case NVPTXISD::Tld4B2DS64Float: Opc = NVPTX::TLD4_B_2D_S32_F32; break; case NVPTXISD::Tld4A2DS64Float: Opc = NVPTX::TLD4_A_2D_S32_F32; break; case 
NVPTXISD::Tld4R2DU64Float: Opc = NVPTX::TLD4_R_2D_U32_F32; break; case NVPTXISD::Tld4G2DU64Float: Opc = NVPTX::TLD4_G_2D_U32_F32; break; case NVPTXISD::Tld4B2DU64Float: Opc = NVPTX::TLD4_B_2D_U32_F32; break; case NVPTXISD::Tld4A2DU64Float: Opc = NVPTX::TLD4_A_2D_U32_F32; break; case NVPTXISD::TexUnified1DFloatS32: Opc = NVPTX::TEX_UNIFIED_1D_F32_S32; break; case NVPTXISD::TexUnified1DFloatFloat: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32; break; case NVPTXISD::TexUnified1DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL; break; case NVPTXISD::TexUnified1DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD; break; case NVPTXISD::TexUnified1DS32S32: Opc = NVPTX::TEX_UNIFIED_1D_S32_S32; break; case NVPTXISD::TexUnified1DS32Float: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32; break; case NVPTXISD::TexUnified1DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL; break; case NVPTXISD::TexUnified1DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD; break; case NVPTXISD::TexUnified1DU32S32: Opc = NVPTX::TEX_UNIFIED_1D_U32_S32; break; case NVPTXISD::TexUnified1DU32Float: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32; break; case NVPTXISD::TexUnified1DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL; break; case NVPTXISD::TexUnified1DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD; break; case NVPTXISD::TexUnified1DArrayFloatS32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32; break; case NVPTXISD::TexUnified1DArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32; break; case NVPTXISD::TexUnified1DArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL; break; case NVPTXISD::TexUnified1DArrayFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD; break; case NVPTXISD::TexUnified1DArrayS32S32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32; break; case NVPTXISD::TexUnified1DArrayS32Float: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32; break; case NVPTXISD::TexUnified1DArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL; break; case NVPTXISD::TexUnified1DArrayS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD; break; case NVPTXISD::TexUnified1DArrayU32S32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32; break; case NVPTXISD::TexUnified1DArrayU32Float: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32; break; case NVPTXISD::TexUnified1DArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL; break; case NVPTXISD::TexUnified1DArrayU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD; break; case NVPTXISD::TexUnified2DFloatS32: Opc = NVPTX::TEX_UNIFIED_2D_F32_S32; break; case NVPTXISD::TexUnified2DFloatFloat: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32; break; case NVPTXISD::TexUnified2DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL; break; case NVPTXISD::TexUnified2DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD; break; case NVPTXISD::TexUnified2DS32S32: Opc = NVPTX::TEX_UNIFIED_2D_S32_S32; break; case NVPTXISD::TexUnified2DS32Float: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32; break; case NVPTXISD::TexUnified2DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL; break; case NVPTXISD::TexUnified2DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD; break; case NVPTXISD::TexUnified2DU32S32: Opc = NVPTX::TEX_UNIFIED_2D_U32_S32; break; case NVPTXISD::TexUnified2DU32Float: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32; break; case NVPTXISD::TexUnified2DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL; break; case NVPTXISD::TexUnified2DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD; break; case NVPTXISD::TexUnified2DArrayFloatS32: 
Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32; break; case NVPTXISD::TexUnified2DArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32; break; case NVPTXISD::TexUnified2DArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL; break; case NVPTXISD::TexUnified2DArrayFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD; break; case NVPTXISD::TexUnified2DArrayS32S32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32; break; case NVPTXISD::TexUnified2DArrayS32Float: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32; break; case NVPTXISD::TexUnified2DArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL; break; case NVPTXISD::TexUnified2DArrayS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD; break; case NVPTXISD::TexUnified2DArrayU32S32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32; break; case NVPTXISD::TexUnified2DArrayU32Float: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32; break; case NVPTXISD::TexUnified2DArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL; break; case NVPTXISD::TexUnified2DArrayU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD; break; case NVPTXISD::TexUnified3DFloatS32: Opc = NVPTX::TEX_UNIFIED_3D_F32_S32; break; case NVPTXISD::TexUnified3DFloatFloat: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32; break; case NVPTXISD::TexUnified3DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL; break; case NVPTXISD::TexUnified3DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD; break; case NVPTXISD::TexUnified3DS32S32: Opc = NVPTX::TEX_UNIFIED_3D_S32_S32; break; case NVPTXISD::TexUnified3DS32Float: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32; break; case NVPTXISD::TexUnified3DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL; break; case NVPTXISD::TexUnified3DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD; break; case NVPTXISD::TexUnified3DU32S32: Opc = NVPTX::TEX_UNIFIED_3D_U32_S32; break; case NVPTXISD::TexUnified3DU32Float: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32; break; case NVPTXISD::TexUnified3DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL; break; case NVPTXISD::TexUnified3DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD; break; case NVPTXISD::TexUnifiedCubeFloatFloat: Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32; break; case NVPTXISD::TexUnifiedCubeFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL; break; case NVPTXISD::TexUnifiedCubeS32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32; break; case NVPTXISD::TexUnifiedCubeS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL; break; case NVPTXISD::TexUnifiedCubeU32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32; break; case NVPTXISD::TexUnifiedCubeU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL; break; case NVPTXISD::TexUnifiedCubeArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32; break; case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL; break; case NVPTXISD::TexUnifiedCubeArrayS32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32; break; case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL; break; case NVPTXISD::TexUnifiedCubeArrayU32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32; break; case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL; break; case NVPTXISD::Tld4UnifiedR2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32; break; case NVPTXISD::Tld4UnifiedG2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32; break; case NVPTXISD::Tld4UnifiedB2DFloatFloat: Opc = 
NVPTX::TLD4_UNIFIED_B_2D_F32_F32; break; case NVPTXISD::Tld4UnifiedA2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32; break; case NVPTXISD::Tld4UnifiedR2DS64Float: Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32; break; case NVPTXISD::Tld4UnifiedG2DS64Float: Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32; break; case NVPTXISD::Tld4UnifiedB2DS64Float: Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32; break; case NVPTXISD::Tld4UnifiedA2DS64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32; break; case NVPTXISD::Tld4UnifiedR2DU64Float: Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32; break; case NVPTXISD::Tld4UnifiedG2DU64Float: Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32; break; case NVPTXISD::Tld4UnifiedB2DU64Float: Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32; break; case NVPTXISD::Tld4UnifiedA2DU64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32; break; } // Copy over operands SmallVector Ops(N->op_begin() + 1, N->op_end()); Ops.push_back(N->getOperand(0)); // Move chain to the back. ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); return true; } bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) { unsigned Opc = 0; switch (N->getOpcode()) { default: return false; case NVPTXISD::Suld1DI8Clamp: Opc = NVPTX::SULD_1D_I8_CLAMP; break; case NVPTXISD::Suld1DI16Clamp: Opc = NVPTX::SULD_1D_I16_CLAMP; break; case NVPTXISD::Suld1DI32Clamp: Opc = NVPTX::SULD_1D_I32_CLAMP; break; case NVPTXISD::Suld1DI64Clamp: Opc = NVPTX::SULD_1D_I64_CLAMP; break; case NVPTXISD::Suld1DV2I8Clamp: Opc = NVPTX::SULD_1D_V2I8_CLAMP; break; case NVPTXISD::Suld1DV2I16Clamp: Opc = NVPTX::SULD_1D_V2I16_CLAMP; break; case NVPTXISD::Suld1DV2I32Clamp: Opc = NVPTX::SULD_1D_V2I32_CLAMP; break; case NVPTXISD::Suld1DV2I64Clamp: Opc = NVPTX::SULD_1D_V2I64_CLAMP; break; case NVPTXISD::Suld1DV4I8Clamp: Opc = NVPTX::SULD_1D_V4I8_CLAMP; break; case NVPTXISD::Suld1DV4I16Clamp: Opc = NVPTX::SULD_1D_V4I16_CLAMP; break; case NVPTXISD::Suld1DV4I32Clamp: Opc = NVPTX::SULD_1D_V4I32_CLAMP; break; case NVPTXISD::Suld1DArrayI8Clamp: Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP; break; case NVPTXISD::Suld1DArrayI16Clamp: Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP; break; case NVPTXISD::Suld1DArrayI32Clamp: Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP; break; case NVPTXISD::Suld1DArrayI64Clamp: Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP; break; case NVPTXISD::Suld1DArrayV2I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP; break; case NVPTXISD::Suld1DArrayV2I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP; break; case NVPTXISD::Suld1DArrayV2I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP; break; case NVPTXISD::Suld1DArrayV2I64Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP; break; case NVPTXISD::Suld1DArrayV4I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP; break; case NVPTXISD::Suld1DArrayV4I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP; break; case NVPTXISD::Suld1DArrayV4I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP; break; case NVPTXISD::Suld2DI8Clamp: Opc = NVPTX::SULD_2D_I8_CLAMP; break; case NVPTXISD::Suld2DI16Clamp: Opc = NVPTX::SULD_2D_I16_CLAMP; break; case NVPTXISD::Suld2DI32Clamp: Opc = NVPTX::SULD_2D_I32_CLAMP; break; case NVPTXISD::Suld2DI64Clamp: Opc = NVPTX::SULD_2D_I64_CLAMP; break; case NVPTXISD::Suld2DV2I8Clamp: Opc = NVPTX::SULD_2D_V2I8_CLAMP; break; case NVPTXISD::Suld2DV2I16Clamp: Opc = NVPTX::SULD_2D_V2I16_CLAMP; break; case NVPTXISD::Suld2DV2I32Clamp: Opc = NVPTX::SULD_2D_V2I32_CLAMP; break; case NVPTXISD::Suld2DV2I64Clamp: Opc = NVPTX::SULD_2D_V2I64_CLAMP; break; case NVPTXISD::Suld2DV4I8Clamp: Opc = NVPTX::SULD_2D_V4I8_CLAMP; break; case NVPTXISD::Suld2DV4I16Clamp: Opc = 
NVPTX::SULD_2D_V4I16_CLAMP; break; case NVPTXISD::Suld2DV4I32Clamp: Opc = NVPTX::SULD_2D_V4I32_CLAMP; break; case NVPTXISD::Suld2DArrayI8Clamp: Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP; break; case NVPTXISD::Suld2DArrayI16Clamp: Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP; break; case NVPTXISD::Suld2DArrayI32Clamp: Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP; break; case NVPTXISD::Suld2DArrayI64Clamp: Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP; break; case NVPTXISD::Suld2DArrayV2I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP; break; case NVPTXISD::Suld2DArrayV2I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP; break; case NVPTXISD::Suld2DArrayV2I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP; break; case NVPTXISD::Suld2DArrayV2I64Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP; break; case NVPTXISD::Suld2DArrayV4I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP; break; case NVPTXISD::Suld2DArrayV4I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP; break; case NVPTXISD::Suld2DArrayV4I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP; break; case NVPTXISD::Suld3DI8Clamp: Opc = NVPTX::SULD_3D_I8_CLAMP; break; case NVPTXISD::Suld3DI16Clamp: Opc = NVPTX::SULD_3D_I16_CLAMP; break; case NVPTXISD::Suld3DI32Clamp: Opc = NVPTX::SULD_3D_I32_CLAMP; break; case NVPTXISD::Suld3DI64Clamp: Opc = NVPTX::SULD_3D_I64_CLAMP; break; case NVPTXISD::Suld3DV2I8Clamp: Opc = NVPTX::SULD_3D_V2I8_CLAMP; break; case NVPTXISD::Suld3DV2I16Clamp: Opc = NVPTX::SULD_3D_V2I16_CLAMP; break; case NVPTXISD::Suld3DV2I32Clamp: Opc = NVPTX::SULD_3D_V2I32_CLAMP; break; case NVPTXISD::Suld3DV2I64Clamp: Opc = NVPTX::SULD_3D_V2I64_CLAMP; break; case NVPTXISD::Suld3DV4I8Clamp: Opc = NVPTX::SULD_3D_V4I8_CLAMP; break; case NVPTXISD::Suld3DV4I16Clamp: Opc = NVPTX::SULD_3D_V4I16_CLAMP; break; case NVPTXISD::Suld3DV4I32Clamp: Opc = NVPTX::SULD_3D_V4I32_CLAMP; break; case NVPTXISD::Suld1DI8Trap: Opc = NVPTX::SULD_1D_I8_TRAP; break; case NVPTXISD::Suld1DI16Trap: Opc = NVPTX::SULD_1D_I16_TRAP; break; case NVPTXISD::Suld1DI32Trap: Opc = NVPTX::SULD_1D_I32_TRAP; break; case NVPTXISD::Suld1DI64Trap: Opc = NVPTX::SULD_1D_I64_TRAP; break; case NVPTXISD::Suld1DV2I8Trap: Opc = NVPTX::SULD_1D_V2I8_TRAP; break; case NVPTXISD::Suld1DV2I16Trap: Opc = NVPTX::SULD_1D_V2I16_TRAP; break; case NVPTXISD::Suld1DV2I32Trap: Opc = NVPTX::SULD_1D_V2I32_TRAP; break; case NVPTXISD::Suld1DV2I64Trap: Opc = NVPTX::SULD_1D_V2I64_TRAP; break; case NVPTXISD::Suld1DV4I8Trap: Opc = NVPTX::SULD_1D_V4I8_TRAP; break; case NVPTXISD::Suld1DV4I16Trap: Opc = NVPTX::SULD_1D_V4I16_TRAP; break; case NVPTXISD::Suld1DV4I32Trap: Opc = NVPTX::SULD_1D_V4I32_TRAP; break; case NVPTXISD::Suld1DArrayI8Trap: Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP; break; case NVPTXISD::Suld1DArrayI16Trap: Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP; break; case NVPTXISD::Suld1DArrayI32Trap: Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP; break; case NVPTXISD::Suld1DArrayI64Trap: Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP; break; case NVPTXISD::Suld1DArrayV2I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP; break; case NVPTXISD::Suld1DArrayV2I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP; break; case NVPTXISD::Suld1DArrayV2I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP; break; case NVPTXISD::Suld1DArrayV2I64Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP; break; case NVPTXISD::Suld1DArrayV4I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP; break; case NVPTXISD::Suld1DArrayV4I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP; break; case NVPTXISD::Suld1DArrayV4I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP; break; case NVPTXISD::Suld2DI8Trap: Opc = NVPTX::SULD_2D_I8_TRAP; break; case 
NVPTXISD::Suld2DI16Trap: Opc = NVPTX::SULD_2D_I16_TRAP; break; case NVPTXISD::Suld2DI32Trap: Opc = NVPTX::SULD_2D_I32_TRAP; break; case NVPTXISD::Suld2DI64Trap: Opc = NVPTX::SULD_2D_I64_TRAP; break; case NVPTXISD::Suld2DV2I8Trap: Opc = NVPTX::SULD_2D_V2I8_TRAP; break; case NVPTXISD::Suld2DV2I16Trap: Opc = NVPTX::SULD_2D_V2I16_TRAP; break; case NVPTXISD::Suld2DV2I32Trap: Opc = NVPTX::SULD_2D_V2I32_TRAP; break; case NVPTXISD::Suld2DV2I64Trap: Opc = NVPTX::SULD_2D_V2I64_TRAP; break; case NVPTXISD::Suld2DV4I8Trap: Opc = NVPTX::SULD_2D_V4I8_TRAP; break; case NVPTXISD::Suld2DV4I16Trap: Opc = NVPTX::SULD_2D_V4I16_TRAP; break; case NVPTXISD::Suld2DV4I32Trap: Opc = NVPTX::SULD_2D_V4I32_TRAP; break; case NVPTXISD::Suld2DArrayI8Trap: Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP; break; case NVPTXISD::Suld2DArrayI16Trap: Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP; break; case NVPTXISD::Suld2DArrayI32Trap: Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP; break; case NVPTXISD::Suld2DArrayI64Trap: Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP; break; case NVPTXISD::Suld2DArrayV2I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP; break; case NVPTXISD::Suld2DArrayV2I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP; break; case NVPTXISD::Suld2DArrayV2I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP; break; case NVPTXISD::Suld2DArrayV2I64Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP; break; case NVPTXISD::Suld2DArrayV4I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP; break; case NVPTXISD::Suld2DArrayV4I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP; break; case NVPTXISD::Suld2DArrayV4I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP; break; case NVPTXISD::Suld3DI8Trap: Opc = NVPTX::SULD_3D_I8_TRAP; break; case NVPTXISD::Suld3DI16Trap: Opc = NVPTX::SULD_3D_I16_TRAP; break; case NVPTXISD::Suld3DI32Trap: Opc = NVPTX::SULD_3D_I32_TRAP; break; case NVPTXISD::Suld3DI64Trap: Opc = NVPTX::SULD_3D_I64_TRAP; break; case NVPTXISD::Suld3DV2I8Trap: Opc = NVPTX::SULD_3D_V2I8_TRAP; break; case NVPTXISD::Suld3DV2I16Trap: Opc = NVPTX::SULD_3D_V2I16_TRAP; break; case NVPTXISD::Suld3DV2I32Trap: Opc = NVPTX::SULD_3D_V2I32_TRAP; break; case NVPTXISD::Suld3DV2I64Trap: Opc = NVPTX::SULD_3D_V2I64_TRAP; break; case NVPTXISD::Suld3DV4I8Trap: Opc = NVPTX::SULD_3D_V4I8_TRAP; break; case NVPTXISD::Suld3DV4I16Trap: Opc = NVPTX::SULD_3D_V4I16_TRAP; break; case NVPTXISD::Suld3DV4I32Trap: Opc = NVPTX::SULD_3D_V4I32_TRAP; break; case NVPTXISD::Suld1DI8Zero: Opc = NVPTX::SULD_1D_I8_ZERO; break; case NVPTXISD::Suld1DI16Zero: Opc = NVPTX::SULD_1D_I16_ZERO; break; case NVPTXISD::Suld1DI32Zero: Opc = NVPTX::SULD_1D_I32_ZERO; break; case NVPTXISD::Suld1DI64Zero: Opc = NVPTX::SULD_1D_I64_ZERO; break; case NVPTXISD::Suld1DV2I8Zero: Opc = NVPTX::SULD_1D_V2I8_ZERO; break; case NVPTXISD::Suld1DV2I16Zero: Opc = NVPTX::SULD_1D_V2I16_ZERO; break; case NVPTXISD::Suld1DV2I32Zero: Opc = NVPTX::SULD_1D_V2I32_ZERO; break; case NVPTXISD::Suld1DV2I64Zero: Opc = NVPTX::SULD_1D_V2I64_ZERO; break; case NVPTXISD::Suld1DV4I8Zero: Opc = NVPTX::SULD_1D_V4I8_ZERO; break; case NVPTXISD::Suld1DV4I16Zero: Opc = NVPTX::SULD_1D_V4I16_ZERO; break; case NVPTXISD::Suld1DV4I32Zero: Opc = NVPTX::SULD_1D_V4I32_ZERO; break; case NVPTXISD::Suld1DArrayI8Zero: Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO; break; case NVPTXISD::Suld1DArrayI16Zero: Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO; break; case NVPTXISD::Suld1DArrayI32Zero: Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO; break; case NVPTXISD::Suld1DArrayI64Zero: Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO; break; case NVPTXISD::Suld1DArrayV2I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO; break; case 
NVPTXISD::Suld1DArrayV2I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO; break; case NVPTXISD::Suld1DArrayV2I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO; break; case NVPTXISD::Suld1DArrayV2I64Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO; break; case NVPTXISD::Suld1DArrayV4I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO; break; case NVPTXISD::Suld1DArrayV4I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO; break; case NVPTXISD::Suld1DArrayV4I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO; break; case NVPTXISD::Suld2DI8Zero: Opc = NVPTX::SULD_2D_I8_ZERO; break; case NVPTXISD::Suld2DI16Zero: Opc = NVPTX::SULD_2D_I16_ZERO; break; case NVPTXISD::Suld2DI32Zero: Opc = NVPTX::SULD_2D_I32_ZERO; break; case NVPTXISD::Suld2DI64Zero: Opc = NVPTX::SULD_2D_I64_ZERO; break; case NVPTXISD::Suld2DV2I8Zero: Opc = NVPTX::SULD_2D_V2I8_ZERO; break; case NVPTXISD::Suld2DV2I16Zero: Opc = NVPTX::SULD_2D_V2I16_ZERO; break; case NVPTXISD::Suld2DV2I32Zero: Opc = NVPTX::SULD_2D_V2I32_ZERO; break; case NVPTXISD::Suld2DV2I64Zero: Opc = NVPTX::SULD_2D_V2I64_ZERO; break; case NVPTXISD::Suld2DV4I8Zero: Opc = NVPTX::SULD_2D_V4I8_ZERO; break; case NVPTXISD::Suld2DV4I16Zero: Opc = NVPTX::SULD_2D_V4I16_ZERO; break; case NVPTXISD::Suld2DV4I32Zero: Opc = NVPTX::SULD_2D_V4I32_ZERO; break; case NVPTXISD::Suld2DArrayI8Zero: Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO; break; case NVPTXISD::Suld2DArrayI16Zero: Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO; break; case NVPTXISD::Suld2DArrayI32Zero: Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO; break; case NVPTXISD::Suld2DArrayI64Zero: Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO; break; case NVPTXISD::Suld2DArrayV2I8Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO; break; case NVPTXISD::Suld2DArrayV2I16Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO; break; case NVPTXISD::Suld2DArrayV2I32Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO; break; case NVPTXISD::Suld2DArrayV2I64Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO; break; case NVPTXISD::Suld2DArrayV4I8Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO; break; case NVPTXISD::Suld2DArrayV4I16Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO; break; case NVPTXISD::Suld2DArrayV4I32Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO; break; case NVPTXISD::Suld3DI8Zero: Opc = NVPTX::SULD_3D_I8_ZERO; break; case NVPTXISD::Suld3DI16Zero: Opc = NVPTX::SULD_3D_I16_ZERO; break; case NVPTXISD::Suld3DI32Zero: Opc = NVPTX::SULD_3D_I32_ZERO; break; case NVPTXISD::Suld3DI64Zero: Opc = NVPTX::SULD_3D_I64_ZERO; break; case NVPTXISD::Suld3DV2I8Zero: Opc = NVPTX::SULD_3D_V2I8_ZERO; break; case NVPTXISD::Suld3DV2I16Zero: Opc = NVPTX::SULD_3D_V2I16_ZERO; break; case NVPTXISD::Suld3DV2I32Zero: Opc = NVPTX::SULD_3D_V2I32_ZERO; break; case NVPTXISD::Suld3DV2I64Zero: Opc = NVPTX::SULD_3D_V2I64_ZERO; break; case NVPTXISD::Suld3DV4I8Zero: Opc = NVPTX::SULD_3D_V4I8_ZERO; break; case NVPTXISD::Suld3DV4I16Zero: Opc = NVPTX::SULD_3D_V4I16_ZERO; break; case NVPTXISD::Suld3DV4I32Zero: Opc = NVPTX::SULD_3D_V4I32_ZERO; break; } // Copy over operands SmallVector Ops(N->op_begin() + 1, N->op_end()); Ops.push_back(N->getOperand(0)); // Move chain to the back. 
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); return true; } /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Len; SDValue Start; SDValue Val; bool IsSigned = false; if (N->getOpcode() == ISD::AND) { // Canonicalize the operands // We want 'and %val, %mask' if (isa(LHS) && !isa(RHS)) { std::swap(LHS, RHS); } ConstantSDNode *Mask = dyn_cast(RHS); if (!Mask) { // We need a constant mask on the RHS of the AND return false; } // Extract the mask bits uint64_t MaskVal = Mask->getZExtValue(); if (!isMask_64(MaskVal)) { // We *could* handle shifted masks here, but doing so would require an // 'and' operation to fix up the low-order bits so we would trade // shr+and for bfe+and, which has the same throughput return false; } // How many bits are in our mask? uint64_t NumBits = countTrailingOnes(MaskVal); Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32); if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) { // We have a 'srl/and' pair, extract the effective start bit and length Val = LHS.getNode()->getOperand(0); Start = LHS.getNode()->getOperand(1); ConstantSDNode *StartConst = dyn_cast(Start); if (StartConst) { uint64_t StartVal = StartConst->getZExtValue(); // How many "good" bits do we have left? "good" is defined here as bits // that exist in the original value, not shifted in. uint64_t GoodBits = Start.getValueSizeInBits() - StartVal; if (NumBits > GoodBits) { // Do not handle the case where bits have been shifted in. In theory // we could handle this, but the cost is likely higher than just // emitting the srl/and pair. return false; } Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32); } else { // Do not handle the case where the shift amount (can be zero if no srl // was found) is not constant. We could handle this case, but it would // require run-time logic that would be more expensive than just // emitting the srl/and pair. return false; } } else { // Do not handle the case where the LHS of the and is not a shift. While // it would be trivial to handle this case, it would just transform // 'and' -> 'bfe', but 'and' has higher-throughput. 
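    // Illustrative example (not part of the original source; register and
    // value names are made up): the shr+and shape handled by this path,
    //   %s = lshr i32 %val, 8
    //   %r = and i32 %s, 255       ; mask of 8 trailing ones
    // selects to a single unsigned bit-field extract,
    //   bfe.u32 %r, %val, 8, 8;    ; Start = 8, Len = countTrailingOnes(255) = 8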
return false; } } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { if (LHS->getOpcode() == ISD::AND) { ConstantSDNode *ShiftCnst = dyn_cast(RHS); if (!ShiftCnst) { // Shift amount must be constant return false; } uint64_t ShiftAmt = ShiftCnst->getZExtValue(); SDValue AndLHS = LHS->getOperand(0); SDValue AndRHS = LHS->getOperand(1); // Canonicalize the AND to have the mask on the RHS if (isa(AndLHS)) { std::swap(AndLHS, AndRHS); } ConstantSDNode *MaskCnst = dyn_cast(AndRHS); if (!MaskCnst) { // Mask must be constant return false; } uint64_t MaskVal = MaskCnst->getZExtValue(); uint64_t NumZeros; uint64_t NumBits; if (isMask_64(MaskVal)) { NumZeros = 0; // The number of bits in the result bitfield will be the number of // trailing ones (the AND) minus the number of bits we shift off NumBits = countTrailingOnes(MaskVal) - ShiftAmt; } else if (isShiftedMask_64(MaskVal)) { NumZeros = countTrailingZeros(MaskVal); unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros); // The number of bits in the result bitfield will be the number of // trailing zeros plus the number of set bits in the mask minus the // number of bits we shift off NumBits = NumZeros + NumOnes - ShiftAmt; } else { // This is not a mask we can handle return false; } if (ShiftAmt < NumZeros) { // Handling this case would require extra logic that would make this // transformation non-profitable return false; } Val = AndLHS; Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32); Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32); } else if (LHS->getOpcode() == ISD::SHL) { // Here, we have a pattern like: // // (sra (shl val, NN), MM) // or // (srl (shl val, NN), MM) // // If MM >= NN, we can efficiently optimize this with bfe Val = LHS->getOperand(0); SDValue ShlRHS = LHS->getOperand(1); ConstantSDNode *ShlCnst = dyn_cast(ShlRHS); if (!ShlCnst) { // Shift amount must be constant return false; } uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); SDValue ShrRHS = RHS; ConstantSDNode *ShrCnst = dyn_cast(ShrRHS); if (!ShrCnst) { // Shift amount must be constant return false; } uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); // To avoid extra codegen and be profitable, we need Outer >= Inner if (OuterShiftAmt < InnerShiftAmt) { return false; } // If the outer shift is more than the type size, we have no bitfield to // extract (since we also check that the inner shift is <= the outer shift // then this also implies that the inner shift is < the type size) if (OuterShiftAmt >= Val.getValueSizeInBits()) { return false; } Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL, MVT::i32); Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt, DL, MVT::i32); if (N->getOpcode() == ISD::SRA) { // If we have a arithmetic right shift, we need to use the signed bfe // variant IsSigned = true; } } else { // No can do... return false; } } else { // No can do... return false; } unsigned Opc; // For the BFE operations we form here from "and" and "srl", always use the // unsigned variants. if (Val.getValueType() == MVT::i32) { if (IsSigned) { Opc = NVPTX::BFE_S32rii; } else { Opc = NVPTX::BFE_U32rii; } } else if (Val.getValueType() == MVT::i64) { if (IsSigned) { Opc = NVPTX::BFE_S64rii; } else { Opc = NVPTX::BFE_U64rii; } } else { // We cannot handle this type return false; } SDValue Ops[] = { Val, Start, Len }; ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops)); return true; } // SelectDirectAddr - Match a direct address for DAG. 
// A direct address could be a globaladdress or externalsymbol. bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { // Return true if TGA or ES. if (N.getOpcode() == ISD::TargetGlobalAddress || N.getOpcode() == ISD::TargetExternalSymbol) { Address = N; return true; } if (N.getOpcode() == NVPTXISD::Wrapper) { Address = N.getOperand(0); return true; } // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol if (AddrSpaceCastSDNode *CastN = dyn_cast(N)) { if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC && CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM && CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam) return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address); } return false; } // symbol+offset bool NVPTXDAGToDAGISel::SelectADDRsi_imp( SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { if (Addr.getOpcode() == ISD::ADD) { if (ConstantSDNode *CN = dyn_cast(Addr.getOperand(1))) { SDValue base = Addr.getOperand(0); if (SelectDirectAddr(base, Base)) { Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), mvt); return true; } } } return false; } // symbol+offset bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset) { return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32); } // symbol+offset bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset) { return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64); } // register+offset bool NVPTXDAGToDAGISel::SelectADDRri_imp( SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt); return true; } if (Addr.getOpcode() == ISD::TargetExternalSymbol || Addr.getOpcode() == ISD::TargetGlobalAddress) return false; // direct calls. if (Addr.getOpcode() == ISD::ADD) { if (SelectDirectAddr(Addr.getOperand(0), Addr)) { return false; } if (ConstantSDNode *CN = dyn_cast(Addr.getOperand(1))) { if (FrameIndexSDNode *FIN = dyn_cast(Addr.getOperand(0))) // Constant offset from frame ref. Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); else Base = Addr.getOperand(0); Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), mvt); return true; } } return false; } // register+offset bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset) { return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32); } // register+offset bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset) { return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64); } bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const { const Value *Src = nullptr; if (MemSDNode *mN = dyn_cast(N)) { if (spN == 0 && mN->getMemOperand()->getPseudoValue()) return true; Src = mN->getMemOperand()->getValue(); } if (!Src) return false; if (auto *PT = dyn_cast(Src->getType())) return (PT->getAddressSpace() == spN); return false; } /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { SDValue Op0, Op1; switch (ConstraintID) { default: return true; case InlineAsm::Constraint_m: // memory if (SelectDirectAddr(Op, Op0)) { OutOps.push_back(Op0); OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32)); return false; } if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) { OutOps.push_back(Op0); OutOps.push_back(Op1); return false; } break; } return true; } /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a /// conversion from \p SrcTy to \p DestTy. unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned) { switch (SrcTy.SimpleTy) { default: llvm_unreachable("Unhandled source type"); case MVT::i8: switch (DestTy.SimpleTy) { default: llvm_unreachable("Unhandled dest type"); case MVT::i16: return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8; case MVT::i32: return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8; case MVT::i64: return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8; } case MVT::i16: switch (DestTy.SimpleTy) { default: llvm_unreachable("Unhandled dest type"); case MVT::i8: return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16; case MVT::i32: return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16; case MVT::i64: return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16; } case MVT::i32: switch (DestTy.SimpleTy) { default: llvm_unreachable("Unhandled dest type"); case MVT::i8: return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32; case MVT::i16: return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32; case MVT::i64: return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32; } case MVT::i64: switch (DestTy.SimpleTy) { default: llvm_unreachable("Unhandled dest type"); case MVT::i8: return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64; case MVT::i16: return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64; case MVT::i32: return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64; } } } Index: vendor/llvm/dist-release_70/lib/Transforms/Instrumentation/BoundsChecking.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Transforms/Instrumentation/BoundsChecking.cpp (revision 337630) +++ vendor/llvm/dist-release_70/lib/Transforms/Instrumentation/BoundsChecking.cpp (revision 337631) @@ -1,244 +1,248 @@ //===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "bounds-checking"

static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
                                  cl::desc("Use one trap block per function"));

STATISTIC(ChecksAdded, "Bounds checks added");
STATISTIC(ChecksSkipped, "Bounds checks skipped");
STATISTIC(ChecksUnable, "Bounds checks unable to add");

using BuilderTy = IRBuilder<TargetFolder>;

-/// Adds run-time bounds checks to memory accessing instructions.
+/// Gets the conditions under which memory accessing instructions will overflow.
///
/// \p Ptr is the pointer that will be read/written, and \p InstVal is either
/// the result from the load or the value being stored. It is used to determine
/// the size of memory block that is touched.
///
-/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
-///
-/// Returns true if any change was made to the IR, false otherwise.
-template <typename GetTrapBBT>
-static bool instrumentMemAccess(Value *Ptr, Value *InstVal,
-                                const DataLayout &DL, TargetLibraryInfo &TLI,
-                                ObjectSizeOffsetEvaluator &ObjSizeEval,
-                                BuilderTy &IRB, GetTrapBBT GetTrapBB,
-                                ScalarEvolution &SE) {
+/// Returns the condition under which the access will overflow.
+static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
+                                 const DataLayout &DL, TargetLibraryInfo &TLI,
+                                 ObjectSizeOffsetEvaluator &ObjSizeEval,
+                                 BuilderTy &IRB, ScalarEvolution &SE) {
  uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
  LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
                    << " bytes\n");

  SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);

  if (!ObjSizeEval.bothKnown(SizeOffset)) {
    ++ChecksUnable;
-    return false;
+    return nullptr;
  }

  Value *Size = SizeOffset.first;
  Value *Offset = SizeOffset.second;
  ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);

  Type *IntTy = DL.getIntPtrType(Ptr->getType());
  Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);

  auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
  auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
  auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));

  // three checks are required to ensure safety:
  // . Offset >= 0  (since the offset is given from the base ptr)
  // . Size >= Offset  (unsigned)
  // . Size - Offset >= NeededSize  (unsigned)
  //
  // optimization: if Size >= 0 (signed), skip 1st check
  // FIXME: add NSW/NUW here?  -- we dont care if the subtraction overflows
  Value *ObjSize = IRB.CreateSub(Size, Offset);
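  // Worked example (illustrative, not part of the original source; the
  // concrete numbers are made up): for a 4-byte load whose pointer
  // ObjectSizeOffsetEvaluator resolves to Size = 16 and Offset = 12, the
  // three conditions above are 12 >= 0, 16 >= 12, and 16 - 12 >= 4, so the
  // access is provably in bounds; with Offset = 14 the last condition fails
  // (16 - 14 < 4) and the returned condition will send the access to the
  // trap block.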
  Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
                    ? ConstantInt::getFalse(Ptr->getContext())
                    : IRB.CreateICmpULT(Size, Offset);
  Value *Cmp3 = SizeRange.sub(OffsetRange)
                        .getUnsignedMin()
                        .uge(NeededSizeRange.getUnsignedMax())
                    ? ConstantInt::getFalse(Ptr->getContext())
                    : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
  Value *Or = IRB.CreateOr(Cmp2, Cmp3);
  if ((!SizeCI || SizeCI->getValue().slt(0)) &&
      !SizeRange.getSignedMin().isNonNegative()) {
    Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
    Or = IRB.CreateOr(Cmp1, Or);
  }
+  return Or;
+}
+
+/// Adds run-time bounds checks to memory accessing instructions.
+///
+/// \p Or is the condition that should guard the trap.
+///
+/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
+template <typename GetTrapBBT>
+static void insertBoundsCheck(Value *Or, BuilderTy IRB, GetTrapBBT GetTrapBB) {
  // check if the comparison is always false
  ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
  if (C) {
    ++ChecksSkipped;
    // If non-zero, nothing to do.
    if (!C->getZExtValue())
-      return true;
+      return;
  }
  ++ChecksAdded;

  BasicBlock::iterator SplitI = IRB.GetInsertPoint();
  BasicBlock *OldBB = SplitI->getParent();
  BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
  OldBB->getTerminator()->eraseFromParent();

  if (C) {
    // If we have a constant zero, unconditionally branch.
    // FIXME: We should really handle this differently to bypass the splitting
    // the block.
    BranchInst::Create(GetTrapBB(IRB), OldBB);
-    return true;
+    return;
  }

  // Create the conditional branch.
  BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB);
-  return true;
}

static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
                              ScalarEvolution &SE) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(),
                                        /*RoundToAlign=*/true);

  // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
  // touching instructions
-  std::vector<Instruction *> WorkList;
+  SmallVector<std::pair<Instruction *, Value *>, 4> TrapInfo;
  for (Instruction &I : instructions(F)) {
-    if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) ||
-        isa<AtomicRMWInst>(I))
-      WorkList.push_back(&I);
+    Value *Or = nullptr;
+    BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL));
+    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+      Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
+                              ObjSizeEval, IRB, SE);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+      Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
+                              DL, TLI, ObjSizeEval, IRB, SE);
+    } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+      Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
+                              DL, TLI, ObjSizeEval, IRB, SE);
+    } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
+      Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(), DL,
+                              TLI, ObjSizeEval, IRB, SE);
+    }
+    if (Or)
+      TrapInfo.push_back(std::make_pair(&I, Or));
  }

  // Create a trapping basic block on demand using a callback. Depending on
  // flags, this will either create a single block for the entire function or
  // will create a fresh block every time it is called.
  BasicBlock *TrapBB = nullptr;
  auto GetTrapBB = [&TrapBB](BuilderTy &IRB) {
    if (TrapBB && SingleTrapBB)
      return TrapBB;

    Function *Fn = IRB.GetInsertBlock()->getParent();
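    // Illustrative aside (not part of the original source; %bounds.cond and
    // %cont are made-up names): the block created below, reused across checks
    // when -bounds-checking-single-trap is set, has the shape
    //   trap:
    //     call void @llvm.trap()
    //     unreachable
    // and insertBoundsCheck() reaches it via
    //   br i1 %bounds.cond, label %trap, label %cont
    // where %bounds.cond is the condition computed by getBoundsCheckCond().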
    // FIXME: This debug location doesn't make a lot of sense in the
    // `SingleTrapBB` case.
    auto DebugLoc = IRB.getCurrentDebugLocation();
    IRBuilder<>::InsertPointGuard Guard(IRB);

    TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
    IRB.SetInsertPoint(TrapBB);

    auto *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
    CallInst *TrapCall = IRB.CreateCall(F, {});
    TrapCall->setDoesNotReturn();
    TrapCall->setDoesNotThrow();
    TrapCall->setDebugLoc(DebugLoc);
    IRB.CreateUnreachable();

    return TrapBB;
  };

-  bool MadeChange = false;
-  for (Instruction *Inst : WorkList) {
+  // Add the checks.
+  for (const auto &Entry : TrapInfo) {
+    Instruction *Inst = Entry.first;
    BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst),
                  TargetFolder(DL));

-    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
-      MadeChange |= instrumentMemAccess(LI->getPointerOperand(), LI, DL, TLI,
-                                        ObjSizeEval, IRB, GetTrapBB, SE);
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-      MadeChange |=
-          instrumentMemAccess(SI->getPointerOperand(), SI->getValueOperand(),
-                              DL, TLI, ObjSizeEval, IRB, GetTrapBB, SE);
-    } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
-      MadeChange |=
-          instrumentMemAccess(AI->getPointerOperand(), AI->getCompareOperand(),
-                              DL, TLI, ObjSizeEval, IRB, GetTrapBB, SE);
-    } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
-      MadeChange |=
-          instrumentMemAccess(AI->getPointerOperand(), AI->getValOperand(), DL,
-                              TLI, ObjSizeEval, IRB, GetTrapBB, SE);
-    } else {
-      llvm_unreachable("unknown Instruction type");
-    }
+    insertBoundsCheck(Entry.second, IRB, GetTrapBB);
  }
-  return MadeChange;
+
+  return !TrapInfo.empty();
}

PreservedAnalyses BoundsCheckingPass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);

  if (!addBoundsChecking(F, TLI, SE))
    return PreservedAnalyses::all;

  return PreservedAnalyses::none();
}

namespace {
struct BoundsCheckingLegacyPass : public FunctionPass {
  static char ID;

  BoundsCheckingLegacyPass() : FunctionPass(ID) {
    initializeBoundsCheckingLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    return addBoundsChecking(F, TLI, SE);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
  }
};
} // namespace

char BoundsCheckingLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(BoundsCheckingLegacyPass, "bounds-checking",
                      "Run-time bounds checking", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(BoundsCheckingLegacyPass, "bounds-checking",
                    "Run-time bounds checking", false, false)

FunctionPass *llvm::createBoundsCheckingLegacyPass() {
  return new BoundsCheckingLegacyPass();
}
Index: vendor/llvm/dist-release_70/test/CodeGen/AArch64/fcopysign.ll
===================================================================
--- vendor/llvm/dist-release_70/test/CodeGen/AArch64/fcopysign.ll (revision 337630)
+++ vendor/llvm/dist-release_70/test/CodeGen/AArch64/fcopysign.ll (revision 337631)
@@ -1,23 +1,42 @@
; RUN: llc -o - %s | FileCheck %s
; Check that selection dag legalization of fcopysign works in cases with
; different modes for the arguments.
target triple = "aarch64--" declare fp128 @llvm.copysign.f128(fp128, fp128) -@val = global double zeroinitializer, align 8 +@val_float = global float zeroinitializer, align 4 +@val_double = global double zeroinitializer, align 8 +@val_fp128 = global fp128 zeroinitializer, align 16 ; CHECK-LABEL: copysign0 -; CHECK: ldr [[REG:x[0-9]+]], [x8, :lo12:val] +; CHECK: ldr [[REG:x[0-9]+]], [x8, :lo12:val_double] ; CHECK: and [[ANDREG:x[0-9]+]], [[REG]], #0x8000000000000000 ; CHECK: lsr x[[LSRREGNUM:[0-9]+]], [[ANDREG]], #56 ; CHECK: bfxil w[[LSRREGNUM]], w{{[0-9]+}}, #0, #7 ; CHECK: strb w[[LSRREGNUM]], ; CHECK: ldr q{{[0-9]+}}, define fp128 @copysign0() { entry: - %v = load double, double* @val, align 8 + %v = load double, double* @val_double, align 8 %conv = fpext double %v to fp128 %call = tail call fp128 @llvm.copysign.f128(fp128 0xL00000000000000007FFF000000000000, fp128 %conv) #2 + ret fp128 %call +} + +; CHECK-LABEL: copysign1 +; CHECK-DAG: ldr [[REG:q[0-9]+]], [x8, :lo12:val_fp128] +; CHECK-DAG: ldr [[REG:w[0-9]+]], [x8, :lo12:val_float] +; CHECK: and [[ANDREG:w[0-9]+]], [[REG]], #0x80000000 +; CHECK: lsr w[[LSRREGNUM:[0-9]+]], [[ANDREG]], #24 +; CHECK: bfxil w[[LSRREGNUM]], w{{[0-9]+}}, #0, #7 +; CHECK: strb w[[LSRREGNUM]], +; CHECK: ldr q{{[0-9]+}}, +define fp128@copysign1() { +entry: + %v0 = load fp128, fp128* @val_fp128, align 16 + %v1 = load float, float* @val_float, align 4 + %conv = fpext float %v1 to fp128 + %call = tail call fp128 @llvm.copysign.f128(fp128 %v0, fp128 %conv) ret fp128 %call } Index: vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/kernel-args.ll (revision 337630) +++ vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/kernel-args.ll (revision 337631) @@ -1,811 +1,800 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; EG: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: MOV * T1.X, KC0[2].Z, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) - -; CM: LSHR * T0.X, KC0[2].Y, literal.x, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: MOV * T1.X, KC0[2].Z, +; EGCM: VTX_READ_8{{.*}} #3 +; EGCM: KC0[2].Y define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { %ext = zext i8 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; 
FUNC-LABEL: {{^}}i8_zext_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { %ext = zext i8 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i8_sext_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] ; HSA-VI: flat_store_dword ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { %ext = sext i8 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i16_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} ; HSA-VI: flat_store_dword - -; EG: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: MOV * T1.X, KC0[2].Z, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) - -; CM: LSHR * T0.X, KC0[2].Y, literal.x, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: MOV * T1.X, KC0[2].Z, +; EGCM: VTX_READ_16 +; EGCM: KC0[2].Y define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { %ext = zext i16 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i16_zext_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} ; HSA-VI: flat_store_dword ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { %ext = zext i16 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i16_sext_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: 
s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] ; HSA-VI: flat_store_dword ; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { %ext = sext i16 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i32_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8 define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { entry: store i32 %in, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}f32_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { entry: store float %in, float addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v2i8_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; GCN: s_load_dword s ; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { entry: store <2 x i8> %in, <2 x i8> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v2i16_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v2i32_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { entry: store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v2f32_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { entry: store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v3i8_arg: ; HSA-VI: 
kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 ; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v3i16_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 ; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v3i32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { entry: store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v3f32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { entry: store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v4i8_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; GCN-DAG: s_load_dwordx2 s ; GCN-DAG: s_load_dword s define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v4i16_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c ; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI-DAG: s_load_dwordx2 
s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v4i32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v4f32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { entry: store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 ret void } ; FIXME: Lots of unpack and re-pack junk on VI ; FUNC-LABEL: {{^}}v8i8_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; SI-NOT: {{buffer|flat|global}}_load ; SI: s_load_dwordx2 s ; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load ; VI: s_load_dwordx2 s ; VI-NEXT: s_load_dwordx2 s ; VI-NOT: lshl ; VI-NOT: _or ; VI-NOT: _sdwa define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v8i16_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; SI: s_load_dwordx4 ; SI-NEXT: s_load_dwordx2 ; SI-NOT: {{buffer|flat|global}}_load ; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v8i32_arg: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, 
align 4 ret void } ; FUNC-LABEL: {{^}}v8f32_arg: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 ret void } ; FIXME: Pack/repack on VI ; FUNC-LABEL: {{^}}v16i8_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; EGCM: VTX_READ_8 ; SI: s_load_dwordx4 s ; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load ; VI: s_load_dwordx4 s ; VI-NOT: shr ; VI-NOT: shl ; VI-NOT: _sdwa ; VI-NOT: _or_ define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v16i16_arg: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; EGCM: VTX_READ_16 ; SI: s_load_dwordx8 s ; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v16i32_arg: ; HSA-VI: kernarg_segment_byte_size = 128 ; HSA-VI: kernarg_segment_alignment = 6 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v16f32_arg: ; HSA-VI: kernarg_segment_byte_size = 128 ; HSA-VI: kernarg_segment_alignment = 6 ; EGCM-DAG: 
T{{[0-9]\.[XYZW]}}, KC0[6].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}kernel_arg_i64: ; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; MESA-GCN: buffer_store_dwordx2 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}f64_kernel_arg: ; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 ; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out ret void } ; XFUNC-LABEL: {{^}}kernel_arg_v1i64: ; XGCN: s_load_dwordx2 ; XGCN: s_load_dwordx2 ; XGCN: buffer_store_dwordx2 ; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { ; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ; ret void ; } ; FUNC-LABEL: {{^}}i65_arg: ; HSA-VI: kernarg_segment_byte_size = 24 ; HSA-VI: kernarg_segment_alignment = 4 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { entry: store i65 %in, i65 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i1_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; GCN: s_load_dword s ; GCN: s_and_b32 ; GCN: {{buffer|flat}}_store_byte define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { store i1 %x, i1 addrspace(1)* %out, align 1 ret void } ; FUNC-LABEL: {{^}}i1_arg_zext_i32: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; GCN: s_load_dword ; SGCN: buffer_store_dword define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i1_arg_zext_i64: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; GCN: s_load_dword s ; GCN: {{buffer|flat}}_store_dwordx2 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}i1_arg_sext_i32: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; GCN: s_load_dword ; GCN: 
{{buffer|flat}}_store_dword define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}i1_arg_sext_i64: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; GCN: s_load_dword ; GCN: s_bfe_i64 ; GCN: {{buffer|flat}}_store_dwordx2 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}empty_struct_arg: ; HSA-VI: kernarg_segment_byte_size = 0 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ret void } ; The correct load offsets for these: ; load 4 from 0, ; load 8 from 8 ; load 4 from 24 ; load 8 from 32 ; With the SelectionDAG argument lowering, the alignments for the ; struct members are not properly considered, making these wrong. ; FIXME: Total argument size is computed wrong ; FUNC-LABEL: {{^}}struct_argument_alignment: ; HSA-VI: kernarg_segment_byte_size = 40 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg1, 0 %val3 = extractvalue {i32, i64} %arg1, 1 store volatile i32 %val0, i32 addrspace(1)* null store volatile i64 %val1, i64 addrspace(1)* null store volatile i32 %val2, i32 addrspace(1)* null store volatile i64 %val3, i64 addrspace(1)* null ret void } ; No padding between i8 and next struct, but round up at end to 4 byte ; multiple.
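; Worked layout sketch (editorial illustration inferred from the sizes and offsets these two struct
; tests check; not itself a test): with natural alignment, {i32, i64} %arg0 puts the i32 at byte 0
; and the i64 at byte 8 (size 16), the lone i8 lands at byte 16, and %arg1 is re-aligned to 8,
; giving its i32 at 24 (0x18) and i64 at 32 (0x20), for the 40-byte kernarg segment above.
; In the packed variant below, <{i32, i64}> occupies 12 bytes with no internal padding, so %arg0
; covers bytes 0-11, the i8 sits at 12, %arg1 covers 13-24, and the 25 bytes round up to the
; 28-byte segment that is checked.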
; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-VI: kernarg_segment_byte_size = 28 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 %val2 = extractvalue <{i32, i64}> %arg1, 0 %val3 = extractvalue <{i32, i64}> %arg1, 1 store volatile i32 %val0, i32 addrspace(1)* null store volatile i64 %val1, i64 addrspace(1)* null store volatile i32 %val2, i32 addrspace(1)* null store volatile i64 %val3, i64 addrspace(1)* null ret void } ; GCN-LABEL: {{^}}struct_argument_alignment_after: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg2, 0 %val3 = extractvalue {i32, i64} %arg2, 1 store volatile i32 %val0, i32 addrspace(1)* null store volatile i64 %val1, i64 addrspace(1)* null store volatile i32 %val2, i32 addrspace(1)* null store volatile i64 %val3, i64 addrspace(1)* null store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null ret void } ; GCN-LABEL: {{^}}array_3xi32: ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, i16 addrspace(1)* undef store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef ret void } ; FIXME: Why not all scalar loads? 
; GCN-LABEL: {{^}}array_3xi16: ; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2 ; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0 ; HSA-VI: flat_load_ushort ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, i8 addrspace(1)* undef store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef ret void } Index: vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/mad_uint24.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/mad_uint24.ll (revision 337630) +++ vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/mad_uint24.ll (revision 337631) @@ -1,140 +1,227 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}u32_mad24: ; EG: MULADD_UINT24 ; SI: v_mad_u32_u24 ; VI: v_mad_u32_u24 define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 %1 = shl i32 %b, 8 %b_24 = lshr i32 %1, 8 %2 = mul i32 %a_24, %b_24 %3 = add i32 %2, %c store i32 %3, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}i16_mad24: ; The order of A and B does not matter. ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] ; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 16 ; FIXME: Should be using scalar instructions here. ; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b %1 = add i16 %0, %c %2 = sext i16 %1 to i32 store i32 %2, i32 addrspace(1)* %out ret void } ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; FUNC-LABEL: {{^}}i8_mad24: ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] ; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 8 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: %0 = mul i8 %a, %b %1 = add i8 %0, %c %2 = sext i8 %1 to i32 store i32 %2, i32 addrspace(1)* %out ret void } ; This tests for a bug where the mad_u24 pattern matcher would call ; SimplifyDemandedBits on the first operand of the mul instruction ; assuming that the pattern would be matched to a 24-bit mad. This ; led to some instructions being incorrectly erased when the entire ; 24-bit mad pattern wasn't being matched. ; Check that the select instruction is not deleted. ; FUNC-LABEL: {{^}}i24_i32_i32_mad: ; EG: CNDE_INT ; SI: v_cndmask define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: %0 = ashr i32 %a, 8 %1 = icmp ne i32 %c, 0 %2 = select i1 %1, i32 %0, i32 34 %3 = mul i32 %2, %c %4 = add i32 %3, %d store i32 %4, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}extra_and: ; SI-NOT: v_and ; SI: v_mad_u32_u24 ; SI: v_mad_u32_u24 define amdgpu_kernel void @extra_and(i32 addrspace(1)* %arg, i32 %arg2, i32 %arg3) { bb: br label %bb4 bb4: ; preds = %bb4, %bb %tmp = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ] %tmp5 = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ] %tmp6 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ] %tmp7 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ] %tmp8 = and i32 %tmp7, 16777215 %tmp9 = and i32 %tmp6, 16777215 %tmp10 = and i32 %tmp5, 16777215 %tmp11 = and i32 %tmp, 16777215 %tmp12 = mul i32 %tmp8, %tmp11 %tmp13 = add i32 %arg2, %tmp12 %tmp14 = mul i32 %tmp9, %tmp11 %tmp15 = add i32 %arg3, %tmp14 %tmp16 = add nuw nsw i32 %tmp13, %tmp15 %tmp17 = icmp eq i32 %tmp16, 8 br i1 %tmp17, label %bb18, label %bb4 bb18: ; preds = %bb4 store i32 %tmp16, i32 addrspace(1)* %arg ret void } ; FUNC-LABEL: {{^}}dont_remove_shift ; SI: v_lshr ; SI: v_mad_u32_u24 ; SI: v_mad_u32_u24 define amdgpu_kernel void @dont_remove_shift(i32 addrspace(1)* %arg, i32 %arg2, i32 %arg3) { bb: br label %bb4 bb4: ; preds = %bb4, %bb %tmp = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ] %tmp5 = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ] %tmp6 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ] %tmp7 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ] %tmp8 = lshr i32 %tmp7, 8 %tmp9 = lshr i32 %tmp6, 8 %tmp10 = lshr i32 %tmp5, 8 %tmp11 = lshr i32 %tmp, 8 %tmp12 = mul i32 %tmp8, %tmp11 %tmp13 = add i32 %arg2, %tmp12 %tmp14 = mul i32 %tmp9, %tmp11 %tmp15 = add i32 %arg3, %tmp14 %tmp16 = add nuw nsw i32 %tmp13, %tmp15 %tmp17 = icmp eq i32 %tmp16, 8 br i1 %tmp17, label %bb18, label %bb4 bb18: ; preds = %bb4 store i32 %tmp16, i32 addrspace(1)* %arg + ret void +} + +; FUNC-LABEL: {{^}}i8_mad_sat_16: +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16 +; GCN: v_med3_i32 v{{[0-9]}}, [[EXT]], +define amdgpu_kernel void @i8_mad_sat_16(i8 addrspace(1)* %out, i8 addrspace(1)* %in0, i8 addrspace(1)* %in1, i8 addrspace(1)* %in2, i64 addrspace(5)* %idx) { +entry: + %retval.0.i = load i64, i64 addrspace(5)* %idx + %arrayidx = getelementptr inbounds i8, i8 
addrspace(1)* %in0, i64 %retval.0.i + %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 %retval.0.i + %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %in2, i64 %retval.0.i + %l1 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %l2 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %l3 = load i8, i8 addrspace(1)* %arrayidx4, align 1 + %conv1.i = sext i8 %l1 to i16 + %conv3.i = sext i8 %l2 to i16 + %conv5.i = sext i8 %l3 to i16 + %mul.i.i.i = mul nsw i16 %conv3.i, %conv1.i + %add.i.i = add i16 %mul.i.i.i, %conv5.i + %c4 = icmp sgt i16 %add.i.i, -128 + %cond.i.i = select i1 %c4, i16 %add.i.i, i16 -128 + %c5 = icmp slt i16 %cond.i.i, 127 + %cond13.i.i = select i1 %c5, i16 %cond.i.i, i16 127 + %conv8.i = trunc i16 %cond13.i.i to i8 + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %retval.0.i + store i8 %conv8.i, i8 addrspace(1)* %arrayidx7, align 1 + ret void +} + +; FUNC-LABEL: {{^}}i8_mad_32: +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16 +define amdgpu_kernel void @i8_mad_32(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b, i8 addrspace(1)* %c, i64 addrspace(5)* %idx) { +entry: + %retval.0.i = load i64, i64 addrspace(5)* %idx + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %a, i64 %retval.0.i + %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %b, i64 %retval.0.i + %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %c, i64 %retval.0.i + %la = load i8, i8 addrspace(1)* %arrayidx, align 1 + %lb = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %lc = load i8, i8 addrspace(1)* %arrayidx4, align 1 + %exta = sext i8 %la to i16 + %extb = sext i8 %lb to i16 + %extc = sext i8 %lc to i16 + %mul = mul i16 %exta, %extb + %mad = add i16 %mul, %extc + %mad_ext = sext i16 %mad to i32 + store i32 %mad_ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mad_64: +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16 +define amdgpu_kernel void @i8_mad_64(i64 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b, i8 addrspace(1)* %c, i64 addrspace(5)* %idx) { +entry: + %retval.0.i = load i64, i64 addrspace(5)* %idx + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %a, i64 %retval.0.i + %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %b, i64 %retval.0.i + %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %c, i64 %retval.0.i + %la = load i8, i8 addrspace(1)* %arrayidx, align 1 + %lb = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %lc = load i8, i8 addrspace(1)* %arrayidx4, align 1 + %exta = sext i8 %la to i16 + %extb = sext i8 %lb to i16 + %extc = sext i8 %lc to i16 + %mul = mul i16 %exta, %extb + %mad = add i16 %mul, %extc + %mad_ext = sext i16 %mad to i64 + store i64 %mad_ext, i64 addrspace(1)* %out ret void } Index: vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll =================================================================== --- 
vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll (revision 337630) +++ vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll (revision 337631) @@ -1,96 +1,99 @@ ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | \ ; RUN: FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}tgid_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.X define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}tgid_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[REG:T[0-9]+]].X ; EG: MOV [[REG]].X, T1.Y define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}tgid_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[REG:T[0-9]+]].X ; EG: MOV [[REG]].X, T1.Z define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}tidig_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}tidig_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[REG:T[0-9]+]].X ; EG: MOV [[REG]].X, T0.Y define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}tidig_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[REG:T[0-9]+]].X ; EG: MOV [[REG]].X, T0.Z define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}test_implicit: -; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 -; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56 +; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 == KC0[3].Z +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], [[PTR:T[0-9]+.[XYZW]]] +; EG-NOT: VTX_READ +; EG-DAG: MOV {{\*?}} [[VAL]], KC0[3].Z +; EG-DAG: LSHR {{\*? 
*}}[[PTR]], KC0[2].Y, literal define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4 %value = load i32, i32 addrspace(7)* %gep store i32 %value, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}test_implicit_dyn: ; 36 prepended implicit bytes + 8(out pointer + in) = 44 -; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44 +; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44, #3 define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in %value = load i32, i32 addrspace(7)* %gep store i32 %value, i32 addrspace(1)* %out ret void } declare i8 addrspace(7)* @llvm.r600.implicitarg.ptr() #0 declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 declare i32 @llvm.r600.read.tidig.x() #0 declare i32 @llvm.r600.read.tidig.y() #0 declare i32 @llvm.r600.read.tidig.z() #0 attributes #0 = { readnone } Index: vendor/llvm/dist-release_70/test/CodeGen/NVPTX/load-store.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/NVPTX/load-store.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/CodeGen/NVPTX/load-store.ll (revision 337631) @@ -0,0 +1,88 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + +; CHECK-LABEL: plain +define void @plain(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr { + ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, i8* %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, i8* %a + + ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, i16* %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, i16* %b + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, i32* %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, i32* %c + + ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, i64* %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, i64* %d + + ret void +} + +; CHECK-LABEL: volatile +define void @volatile(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, i8* %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, i8* %a + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, i16* %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, i16* %b + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, i32* %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, i32* %c + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, i64* %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + 
store volatile i64 %d.add, i64* %d + + ret void +} + +; CHECK-LABEL: monotonic +define void @monotonic(i8* %a, i16* %b, i32* %c, i64* %d) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, i8* %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, i8* %a monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, i16* %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, i16* %b monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, i32* %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, i32* %c monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, i64* %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, i64* %d monotonic, align 8 + + ret void +} Index: vendor/llvm/dist-release_70/test/CodeGen/X86/masked_memop.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/X86/masked_memop.ll (revision 337630) +++ vendor/llvm/dist-release_70/test/CodeGen/X86/masked_memop.ll (revision 337631) @@ -1,1334 +1,1393 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX ; To test for the case where masked load/store is not legal, we should add a run with a target ; that does not have AVX, but that case should probably be a separate test file using less tests ; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov. 
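; A separate non-AVX file along those lines could be driven by an invocation such as the following
; (hypothetical sketch, not an active RUN line of this file; the SSE prefix and its check lines do
; not exist here and would still need to be written):
;   llc -mtriple=x86_64-apple-darwin -mattr=+sse4.2 < %s | FileCheck %s --check-prefix=SSE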
define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) { ; AVX-LABEL: loadv1: ; AVX: ## %bb.0: ; AVX-NEXT: testq %rdi, %rdi ; AVX-NEXT: ## implicit-def: $xmm1 ; AVX-NEXT: je LBB0_1 ; AVX-NEXT: ## %bb.2: ## %else ; AVX-NEXT: testq %rdi, %rdi ; AVX-NEXT: jne LBB0_3 ; AVX-NEXT: LBB0_4: ## %else ; AVX-NEXT: vmovaps %xmm1, %xmm0 ; AVX-NEXT: retq ; AVX-NEXT: LBB0_1: ## %cond.load ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: testq %rdi, %rdi ; AVX-NEXT: je LBB0_4 ; AVX-NEXT: LBB0_3: ## %else ; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: vmovaps %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: loadv1: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: testq %rdi, %rdi ; AVX512F-NEXT: ## implicit-def: $xmm1 ; AVX512F-NEXT: jne LBB0_2 ; AVX512F-NEXT: ## %bb.1: ## %cond.load ; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512F-NEXT: LBB0_2: ## %else ; AVX512F-NEXT: testq %rdi, %rdi ; AVX512F-NEXT: sete %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512F-NEXT: retq ; ; SKX-LABEL: loadv1: ; SKX: ## %bb.0: ; SKX-NEXT: testq %rdi, %rdi ; SKX-NEXT: ## implicit-def: $xmm1 ; SKX-NEXT: jne LBB0_2 ; SKX-NEXT: ## %bb.1: ## %cond.load ; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; SKX-NEXT: LBB0_2: ## %else ; SKX-NEXT: testq %rdi, %rdi ; SKX-NEXT: sete %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <1 x i64> %trigger, zeroinitializer %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst) ret <1 x double> %res } declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>) define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) { ; AVX-LABEL: storev1: ; AVX: ## %bb.0: ; AVX-NEXT: testl %edi, %edi ; AVX-NEXT: je LBB1_1 ; AVX-NEXT: ## %bb.2: ## %else ; AVX-NEXT: retq ; AVX-NEXT: LBB1_1: ## %cond.store ; AVX-NEXT: movl %edx, (%rsi) ; AVX-NEXT: retq ; ; AVX512-LABEL: storev1: ; AVX512: ## %bb.0: ; AVX512-NEXT: testl %edi, %edi ; AVX512-NEXT: je LBB1_1 ; AVX512-NEXT: ## %bb.2: ## %else ; AVX512-NEXT: retq ; AVX512-NEXT: LBB1_1: ## %cond.store ; AVX512-NEXT: movl %edx, (%rsi) ; AVX512-NEXT: retq %mask = icmp eq <1 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask) ret void } declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>) define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { ; AVX-LABEL: test6: ; AVX: ## %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 ; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: test6: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test6: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 
4, <2 x i1>%mask, <2 x double>%dst) ret <2 x double> %res } define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) { ; AVX-LABEL: test7: ; AVX: ## %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: test7: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test7: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst) ret <4 x float> %res } define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { ; AVX1-LABEL: test8: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test8: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test8: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test8: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) ret <4 x i32> %res } define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { ; AVX1-LABEL: test9: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test9: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test9: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x 
i32>* %addr, i32 4, <4 x i1>%mask) ret void } define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) { ; AVX1-LABEL: test10: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test10: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test10: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst) ret <4 x double> %res } define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) { ; AVX1-LABEL: test10b: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test10b: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test10b: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test10b: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer) ret <4 x double> %res } define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { ; AVX1-LABEL: test11a: ; AVX1: ## %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test11a: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: 
vmaskmovps (%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test11a: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11a: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst) ret <8 x float> %res } define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { ; AVX1-LABEL: test11b: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test11b: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test11b: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11b: ; SKX: ## %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst) ret <8 x i32> %res } define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) { ; AVX1-LABEL: test11c: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test11c: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test11c: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} 
{z} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11c: ; SKX: ## %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %res } define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) { ; AVX1-LABEL: test11d: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test11d: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test11d: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11d: ; SKX: ## %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) ret <8 x i32> %res } define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { ; AVX1-LABEL: test12: ; AVX1: ## %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test12: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test12: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test12: ; SKX: ## %bb.0: ; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) ret void } define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; AVX1-LABEL: test14: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: 
vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test14: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test14: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test14: ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) ret void } define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { ; AVX1-LABEL: test15: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test15: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) ret void } define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { ; AVX1-LABEL: test16: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test16: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0,2],zero,zero ; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test16: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test16: ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) ret <2 x float> %res } define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; AVX1-LABEL: test17: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test17: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test17: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test17: ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) ret <2 x i32> %res } define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { ; AVX1-LABEL: test18: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test18: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test18: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test18: ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) ret <2 x float> %res } define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) { ; AVX-LABEL: load_all: ; AVX: ## %bb.0: ; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: load_all: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: movw $15, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: load_all: ; SKX: ## %bb.0: ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef) ret <4 x float> %res } ;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend. ; 128-bit FP vectors are supported with AVX. define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) { ; AVX-LABEL: mload_constmask_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4f32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: movw $13, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v4f32: ; SKX: ## %bb.0: ; SKX-NEXT: movb $13, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float> %dst) ret <4 x float> %res } define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) { ; AVX-LABEL: mload_constmask_v2f64: ; AVX: ## %bb.0: ; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: retq ; ; AVX512-LABEL: mload_constmask_v2f64: ; AVX512: ## %bb.0: ; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-NEXT: retq %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> , <2 x double> %dst) ret <2 x double> %res } ; 128-bit integer vectors are supported with AVX2. 
define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) { ; AVX1-LABEL: mload_constmask_v4i32: ; AVX1: ## %bb.0: ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] ; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: mload_constmask_v4i32: ; AVX2: ## %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4i32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: movw $14, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v4i32: ; SKX: ## %bb.0: ; SKX-NEXT: movb $14, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> , <4 x i32> %dst) ret <4 x i32> %res } define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) { ; AVX-LABEL: mload_constmask_v2i64: ; AVX: ## %bb.0: ; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: mload_constmask_v2i64: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0 ; AVX512-NEXT: retq %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> , <2 x i64> %dst) ret <2 x i64> %res } ; 256-bit FP vectors are supported with AVX. define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) { ; AVX-LABEL: mload_constmask_v8f32: ; AVX: ## %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0] ; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v8f32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: movw $7, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v8f32: ; SKX: ## %bb.0: ; SKX-NEXT: movb $7, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> , <8 x float> %dst) ret <8 x float> %res } define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) { ; AVX-LABEL: mload_constmask_v4f64: ; AVX: ## %bb.0: ; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0] ; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4f64: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v4f64: ; SKX: ## %bb.0: ; SKX-NEXT: movb $7, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x 
double>* %addr, i32 4, <4 x i1> , <4 x double> %dst) ret <4 x double> %res } ; 256-bit integer vectors are supported with AVX2. define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) { ; AVX-LABEL: mload_constmask_v8i32: ; AVX: ## %bb.0: ; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7] ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v8i32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: movw $135, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v8i32: ; SKX: ## %bb.0: ; SKX-NEXT: movb $-121, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> , <8 x i32> %dst) ret <8 x i32> %res } define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) { ; AVX-LABEL: mload_constmask_v4i64: ; AVX: ## %bb.0: ; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7] ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4i64: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: movb $9, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v4i64: ; SKX: ## %bb.0: ; SKX-NEXT: movb $9, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %dst) ret <4 x i64> %res } ; 512-bit FP vectors are supported with AVX512. define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) { ; AVX-LABEL: mload_constmask_v8f64: ; AVX: ## %bb.0: ; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v8f64: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: movb $-121, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v8f64: ; SKX: ## %bb.0: ; SKX-NEXT: movb $-121, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} ; SKX-NEXT: retq %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %dst) ret <8 x double> %res } ; If the pass-through operand is undef, no blend is needed. 
define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) { ; AVX-LABEL: mload_constmask_v4f64_undef_passthrough: ; AVX: ## %bb.0: ; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0] ; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v4f64_undef_passthrough: ; SKX: ## %bb.0: ; SKX-NEXT: movb $7, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> undef) ret <4 x double> %res } define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) { ; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough: ; AVX1: ## %bb.0: ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0] ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough: ; AVX2: ## %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: movb $6, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v4i64_undef_passthrough: ; SKX: ## %bb.0: ; SKX-NEXT: movb $6, %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> undef) ret <4 x i64> %res } define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { ; AVX1-LABEL: test21: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test21: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test21: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: movw $15, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test21: ; SKX: ## %bb.0: ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void } ; When only one element of the mask is set, reduce to a scalar store. define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) { ; AVX-LABEL: one_mask_bit_set1: ; AVX: ## %bb.0: ; AVX-NEXT: vmovss %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: one_mask_bit_set1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vmovss %xmm0, (%rdi) ; AVX512-NEXT: retq call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void } ; Choose a different element to show that the correct address offset is produced. 
define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) { ; AVX-LABEL: one_mask_bit_set2: ; AVX: ## %bb.0: ; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: one_mask_bit_set2: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi) ; AVX512-NEXT: retq call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>) ret void } ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; AVX-LABEL: one_mask_bit_set3: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmovlps %xmm0, 16(%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: one_mask_bit_set3: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovlps %xmm0, 16(%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) ret void } ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { ; AVX-LABEL: one_mask_bit_set4: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmovhpd %xmm0, 24(%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: one_mask_bit_set4: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) ret void } ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected. define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { ; AVX-LABEL: one_mask_bit_set5: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX-NEXT: vmovlps %xmm0, 48(%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: one_mask_bit_set5: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) ret void } ; When only one element of the mask is set, reduce to a scalar load. define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) { ; AVX-LABEL: load_one_mask_bit_set1: ; AVX: ## %bb.0: ; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: retq %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val) ret <4 x i32> %res } ; Choose a different element to show that the correct address offset is produced. define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) { ; AVX-LABEL: load_one_mask_bit_set2: ; AVX: ## %bb.0: ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set2: ; AVX512: ## %bb.0: ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX512-NEXT: retq %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val) ret <4 x float> %res } ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. 
define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; AVX1-LABEL: load_one_mask_bit_set3: ; AVX1: ## %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_one_mask_bit_set3: ; AVX2: ## %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set3: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val) ret <4 x i64> %res } ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { ; AVX-LABEL: load_one_mask_bit_set4: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set4: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val) ret <4 x double> %res } ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected. define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { ; AVX-LABEL: load_one_mask_bit_set5: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set5: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) ret <8 x double> %res } ; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed. ; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that. 
define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) { ; AVX-LABEL: trunc_mask: ; AVX: ## %bb.0: ; AVX-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi) ; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_mask: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: trunc_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpcmpgtd %xmm2, %xmm1, %k1 ; SKX-NEXT: vmovups %xmm0, (%rdi) {%k1} ; SKX-NEXT: retq %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask) ret void } +; This needs to be widened to v4i32. +; This used to assert in type legalization. PR38436 +; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask. +define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { +; AVX1-LABEL: widen_masked_store: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: vmovd %esi, %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: widen_masked_store: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovd %edx, %xmm1 +; AVX2-NEXT: vmovd %esi, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: widen_masked_store: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; SKX-LABEL: widen_masked_store: +; SKX: ## %bb.0: +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 +; SKX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z} +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 +; SKX-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) + ret void +} +declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>) declare <2 
x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>) declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) Index: vendor/llvm/dist-release_70/test/Instrumentation/BoundsChecking/many-traps-2.ll =================================================================== --- vendor/llvm/dist-release_70/test/Instrumentation/BoundsChecking/many-traps-2.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/Instrumentation/BoundsChecking/many-traps-2.ll (revision 337631) @@ -0,0 +1,65 @@ +; RUN: opt < %s -bounds-checking -S | FileCheck %s +@array = internal global [1819 x i16] zeroinitializer, section ".bss,bss" +@offsets = external dso_local global [10 x i16] + +; CHECK-LABEL: @test +define dso_local void @test() { +bb1: + br label %bb19 + +bb20: + %_tmp819 = load i16, i16* null +; CHECK: br {{.*}} %trap + %_tmp820 = sub nsw i16 9, %_tmp819 + %_tmp821 = sext i16 %_tmp820 to i64 + %_tmp822 = getelementptr [10 x i16], [10 x i16]* @offsets, i16 0, i64 %_tmp821 + %_tmp823 = load i16, i16* %_tmp822 + br label %bb33 + +bb34: + %_tmp907 = zext i16 %i__7.107.0 to i64 + %_tmp908 = getelementptr [1819 x i16], [1819 x i16]* @array, i16 0, i64 %_tmp907 + store i16 0, i16* %_tmp908 +; CHECK: br {{.*}} %trap + %_tmp910 = add i16 %i__7.107.0, 1 + br label %bb33 + +bb33: + %i__7.107.0 = phi i16 [ undef, %bb20 ], [ %_tmp910, %bb34 ] + %_tmp913 = add i16 %_tmp823, 191 + %_tmp914 = icmp ult i16 %i__7.107.0, %_tmp913 + br i1 %_tmp914, label %bb34, label %bb19 + +bb19: + %_tmp976 = icmp slt i16 0, 10 + br i1 %_tmp976, label %bb20, label %bb39 + +bb39: + ret void +} + +@e = dso_local local_unnamed_addr global [1 x i16] zeroinitializer, align 1 + +; CHECK-LABEL: @test2 +define dso_local void @test2() local_unnamed_addr { +entry: + br label %while.cond1.preheader + +while.cond1.preheader: + %0 = phi i16 [ undef, %entry ], [ %inc, %while.end ] + %1 = load i16, i16* undef, align 1 +; CHECK: br {{.*}} %trap + br label %while.end + +while.end: + %inc = add nsw i16 %0, 1 + %arrayidx = getelementptr inbounds [1 x i16], [1 x i16]* @e, i16 0, i16 + %0 + %2 = load 
i16, i16* %arrayidx, align 1 +; CHECK: or i1 +; CHECK-NEXT: br {{.*}} %trap + br i1 false, label %while.end6, label %while.cond1.preheader + +while.end6: + ret void +} Index: vendor/llvm/dist-release_70/test/MC/ELF/extra-section-flags.s =================================================================== --- vendor/llvm/dist-release_70/test/MC/ELF/extra-section-flags.s (revision 337630) +++ vendor/llvm/dist-release_70/test/MC/ELF/extra-section-flags.s (nonexistent) @@ -1,12 +0,0 @@ -# RUN: llvm-mc -triple x86_64-unknown-unknown -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s - -.section .rodata, "ax" -# CHECK: warning: setting incorrect section attributes for .rodata -nop - -.section .rodata, "a" -nop -.section .rodata.cst4, "aM",@progbits,8 -nop -# CHECK-NOT: warning: - Property changes on: vendor/llvm/dist-release_70/test/MC/ELF/extra-section-flags.s ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: vendor/llvm/dist-release_70/test/Transforms/InstSimplify/AndOrXor.ll =================================================================== --- vendor/llvm/dist-release_70/test/Transforms/InstSimplify/AndOrXor.ll (revision 337630) +++ vendor/llvm/dist-release_70/test/Transforms/InstSimplify/AndOrXor.ll (revision 337631) @@ -1,1159 +1,1134 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s define i8 @and0(i8 %x) { ; CHECK-LABEL: @and0( ; CHECK-NEXT: ret i8 0 ; %r = and i8 %x, 0 ret i8 %r } define <2 x i8> @and0_vec_undef_elt(<2 x i8> %x) { ; CHECK-LABEL: @and0_vec_undef_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; %r = and <2 x i8> %x, ret <2 x i8> %r } ; add nsw (xor X, signbit), signbit --> X define <2 x i32> @add_nsw_signbit(<2 x i32> %x) { ; CHECK-LABEL: @add_nsw_signbit( ; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; %y = xor <2 x i32> %x, %z = add nsw <2 x i32> %y, ret <2 x i32> %z } ; Undef elements in either constant vector are ok. define <2 x i32> @add_nsw_signbit_undef(<2 x i32> %x) { ; CHECK-LABEL: @add_nsw_signbit_undef( ; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; %y = xor <2 x i32> %x, %z = add nsw <2 x i32> %y, ret <2 x i32> %z } ; add nuw (xor X, signbit), signbit --> X define <2 x i5> @add_nuw_signbit(<2 x i5> %x) { ; CHECK-LABEL: @add_nuw_signbit( ; CHECK-NEXT: ret <2 x i5> [[X:%.*]] ; %y = xor <2 x i5> %x, %z = add nuw <2 x i5> %y, ret <2 x i5> %z } ; Undef elements in either constant vector are ok. 
define <2 x i5> @add_nuw_signbit_undef(<2 x i5> %x) { ; CHECK-LABEL: @add_nuw_signbit_undef( ; CHECK-NEXT: ret <2 x i5> [[X:%.*]] ; %y = xor <2 x i5> %x, %z = add nuw <2 x i5> %y, ret <2 x i5> %z } define i64 @pow2(i32 %x) { ; CHECK-LABEL: @pow2( ; CHECK-NEXT: [[NEGX:%.*]] = sub i32 0, [[X:%.*]] ; CHECK-NEXT: [[X2:%.*]] = and i32 [[X]], [[NEGX]] ; CHECK-NEXT: [[E:%.*]] = zext i32 [[X2]] to i64 ; CHECK-NEXT: ret i64 [[E]] ; %negx = sub i32 0, %x %x2 = and i32 %x, %negx %e = zext i32 %x2 to i64 %nege = sub i64 0, %e %e2 = and i64 %e, %nege ret i64 %e2 } define i64 @pow2b(i32 %x) { ; CHECK-LABEL: @pow2b( ; CHECK-NEXT: [[SH:%.*]] = shl i32 2, [[X:%.*]] ; CHECK-NEXT: [[E:%.*]] = zext i32 [[SH]] to i64 ; CHECK-NEXT: ret i64 [[E]] ; %sh = shl i32 2, %x %e = zext i32 %sh to i64 %nege = sub i64 0, %e %e2 = and i64 %e, %nege ret i64 %e2 } define i1 @and_of_icmps0(i32 %b) { ; CHECK-LABEL: @and_of_icmps0( ; CHECK-NEXT: ret i1 false ; %1 = add i32 %b, 2 %2 = icmp ult i32 %1, 4 %cmp3 = icmp sgt i32 %b, 2 %cmp = and i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @and_of_icmps0_vec(<2 x i32> %b) { ; CHECK-LABEL: @and_of_icmps0_vec( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %1 = add <2 x i32> %b, %2 = icmp ult <2 x i32> %1, %cmp3 = icmp sgt <2 x i32> %b, %cmp = and <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @and_of_icmps1(i32 %b) { ; CHECK-LABEL: @and_of_icmps1( ; CHECK-NEXT: ret i1 false ; %1 = add nsw i32 %b, 2 %2 = icmp slt i32 %1, 4 %cmp3 = icmp sgt i32 %b, 2 %cmp = and i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @and_of_icmps1_vec(<2 x i32> %b) { ; CHECK-LABEL: @and_of_icmps1_vec( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %1 = add nsw <2 x i32> %b, %2 = icmp slt <2 x i32> %1, %cmp3 = icmp sgt <2 x i32> %b, %cmp = and <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @and_of_icmps2(i32 %b) { ; CHECK-LABEL: @and_of_icmps2( ; CHECK-NEXT: ret i1 false ; %1 = add i32 %b, 2 %2 = icmp ule i32 %1, 3 %cmp3 = icmp sgt i32 %b, 2 %cmp = and i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @and_of_icmps2_vec(<2 x i32> %b) { ; CHECK-LABEL: @and_of_icmps2_vec( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %1 = add <2 x i32> %b, %2 = icmp ule <2 x i32> %1, %cmp3 = icmp sgt <2 x i32> %b, %cmp = and <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @and_of_icmps3(i32 %b) { ; CHECK-LABEL: @and_of_icmps3( ; CHECK-NEXT: ret i1 false ; %1 = add nsw i32 %b, 2 %2 = icmp sle i32 %1, 3 %cmp3 = icmp sgt i32 %b, 2 %cmp = and i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @and_of_icmps3_vec(<2 x i32> %b) { ; CHECK-LABEL: @and_of_icmps3_vec( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %1 = add nsw <2 x i32> %b, %2 = icmp sle <2 x i32> %1, %cmp3 = icmp sgt <2 x i32> %b, %cmp = and <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @and_of_icmps4(i32 %b) { ; CHECK-LABEL: @and_of_icmps4( ; CHECK-NEXT: ret i1 false ; %1 = add nuw i32 %b, 2 %2 = icmp ult i32 %1, 4 %cmp3 = icmp ugt i32 %b, 2 %cmp = and i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @and_of_icmps4_vec(<2 x i32> %b) { ; CHECK-LABEL: @and_of_icmps4_vec( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %1 = add nuw <2 x i32> %b, %2 = icmp ult <2 x i32> %1, %cmp3 = icmp ugt <2 x i32> %b, %cmp = and <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @and_of_icmps5(i32 %b) { ; CHECK-LABEL: @and_of_icmps5( ; CHECK-NEXT: ret i1 false ; %1 = add nuw i32 %b, 2 %2 = icmp ule i32 %1, 3 %cmp3 = icmp ugt i32 %b, 2 %cmp = and i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @and_of_icmps5_vec(<2 x i32> %b) { ; CHECK-LABEL: @and_of_icmps5_vec( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %1 = add nuw <2 x 
i32> %b, %2 = icmp ule <2 x i32> %1, %cmp3 = icmp ugt <2 x i32> %b, %cmp = and <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @or_of_icmps0(i32 %b) { ; CHECK-LABEL: @or_of_icmps0( ; CHECK-NEXT: ret i1 true ; %1 = add i32 %b, 2 %2 = icmp uge i32 %1, 4 %cmp3 = icmp sle i32 %b, 2 %cmp = or i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @or_of_icmps0_vec(<2 x i32> %b) { ; CHECK-LABEL: @or_of_icmps0_vec( ; CHECK-NEXT: ret <2 x i1> ; %1 = add <2 x i32> %b, %2 = icmp uge <2 x i32> %1, %cmp3 = icmp sle <2 x i32> %b, %cmp = or <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @or_of_icmps1(i32 %b) { ; CHECK-LABEL: @or_of_icmps1( ; CHECK-NEXT: ret i1 true ; %1 = add nsw i32 %b, 2 %2 = icmp sge i32 %1, 4 %cmp3 = icmp sle i32 %b, 2 %cmp = or i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @or_of_icmps1_vec(<2 x i32> %b) { ; CHECK-LABEL: @or_of_icmps1_vec( ; CHECK-NEXT: ret <2 x i1> ; %1 = add nsw <2 x i32> %b, %2 = icmp sge <2 x i32> %1, %cmp3 = icmp sle <2 x i32> %b, %cmp = or <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @or_of_icmps2(i32 %b) { ; CHECK-LABEL: @or_of_icmps2( ; CHECK-NEXT: ret i1 true ; %1 = add i32 %b, 2 %2 = icmp ugt i32 %1, 3 %cmp3 = icmp sle i32 %b, 2 %cmp = or i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @or_of_icmps2_vec(<2 x i32> %b) { ; CHECK-LABEL: @or_of_icmps2_vec( ; CHECK-NEXT: ret <2 x i1> ; %1 = add <2 x i32> %b, %2 = icmp ugt <2 x i32> %1, %cmp3 = icmp sle <2 x i32> %b, %cmp = or <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @or_of_icmps3(i32 %b) { ; CHECK-LABEL: @or_of_icmps3( ; CHECK-NEXT: ret i1 true ; %1 = add nsw i32 %b, 2 %2 = icmp sgt i32 %1, 3 %cmp3 = icmp sle i32 %b, 2 %cmp = or i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @or_of_icmps3_vec(<2 x i32> %b) { ; CHECK-LABEL: @or_of_icmps3_vec( ; CHECK-NEXT: ret <2 x i1> ; %1 = add nsw <2 x i32> %b, %2 = icmp sgt <2 x i32> %1, %cmp3 = icmp sle <2 x i32> %b, %cmp = or <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @or_of_icmps4(i32 %b) { ; CHECK-LABEL: @or_of_icmps4( ; CHECK-NEXT: ret i1 true ; %1 = add nuw i32 %b, 2 %2 = icmp uge i32 %1, 4 %cmp3 = icmp ule i32 %b, 2 %cmp = or i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @or_of_icmps4_vec(<2 x i32> %b) { ; CHECK-LABEL: @or_of_icmps4_vec( ; CHECK-NEXT: ret <2 x i1> ; %1 = add nuw <2 x i32> %b, %2 = icmp uge <2 x i32> %1, %cmp3 = icmp ule <2 x i32> %b, %cmp = or <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i1 @or_of_icmps5(i32 %b) { ; CHECK-LABEL: @or_of_icmps5( ; CHECK-NEXT: ret i1 true ; %1 = add nuw i32 %b, 2 %2 = icmp ugt i32 %1, 3 %cmp3 = icmp ule i32 %b, 2 %cmp = or i1 %2, %cmp3 ret i1 %cmp } define <2 x i1> @or_of_icmps5_vec(<2 x i32> %b) { ; CHECK-LABEL: @or_of_icmps5_vec( ; CHECK-NEXT: ret <2 x i1> ; %1 = add nuw <2 x i32> %b, %2 = icmp ugt <2 x i32> %1, %cmp3 = icmp ule <2 x i32> %b, %cmp = or <2 x i1> %2, %cmp3 ret <2 x i1> %cmp } define i32 @neg_nuw(i32 %x) { ; CHECK-LABEL: @neg_nuw( ; CHECK-NEXT: ret i32 0 ; %neg = sub nuw i32 0, %x ret i32 %neg } define i1 @and_icmp1(i32 %x, i32 %y) { ; CHECK-LABEL: @and_icmp1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = icmp ult i32 %x, %y %2 = icmp ne i32 %y, 0 %3 = and i1 %1, %2 ret i1 %3 } define i1 @and_icmp2(i32 %x, i32 %y) { ; CHECK-LABEL: @and_icmp2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = icmp ugt i32 %x, %y %2 = icmp ne i32 %x, 0 %3 = and i1 %1, %2 ret i1 %3 } define i1 @and_icmp3(i32 %x, i32 %y) { ; CHECK-LABEL: @and_icmp3( ; CHECK-NEXT: ret i1 false ; %1 = icmp ult i32 %x, %y %2 = icmp eq i32 %y, 0 %3 
= and i1 %1, %2 ret i1 %3 } define i1 @and_icmp4(i32 %x, i32 %y) { ; CHECK-LABEL: @and_icmp4( ; CHECK-NEXT: ret i1 false ; %1 = icmp ugt i32 %x, %y %2 = icmp eq i32 %x, 0 %3 = and i1 %1, %2 ret i1 %3 } define i1 @or_icmp1(i32 %x, i32 %y) { ; CHECK-LABEL: @or_icmp1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[Y:%.*]], 0 ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = icmp ult i32 %x, %y %2 = icmp ne i32 %y, 0 %3 = or i1 %1, %2 ret i1 %3 } define i1 @or_icmp2(i32 %x, i32 %y) { ; CHECK-LABEL: @or_icmp2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = icmp ugt i32 %x, %y %2 = icmp ne i32 %x, 0 %3 = or i1 %1, %2 ret i1 %3 } define i1 @or_icmp3(i32 %x, i32 %y) { ; CHECK-LABEL: @or_icmp3( ; CHECK-NEXT: ret i1 true ; %1 = icmp uge i32 %x, %y %2 = icmp ne i32 %y, 0 %3 = or i1 %1, %2 ret i1 %3 } define i1 @or_icmp4(i32 %x, i32 %y) { ; CHECK-LABEL: @or_icmp4( ; CHECK-NEXT: ret i1 true ; %1 = icmp ule i32 %x, %y %2 = icmp ne i32 %x, 0 %3 = or i1 %1, %2 ret i1 %3 } define i1 @or_icmp5(i32 %x, i32 %y) { ; CHECK-LABEL: @or_icmp5( ; CHECK-NEXT: [[TMP1:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = icmp uge i32 %x, %y %2 = icmp eq i32 %y, 0 %3 = or i1 %1, %2 ret i1 %3 } define i1 @or_icmp6(i32 %x, i32 %y) { ; CHECK-LABEL: @or_icmp6( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = icmp ule i32 %x, %y %2 = icmp eq i32 %x, 0 %3 = or i1 %1, %2 ret i1 %3 } ; PR27869 - Look through casts to eliminate cmps and bitwise logic. define i32 @and_of_zexted_icmps(i32 %i) { ; CHECK-LABEL: @and_of_zexted_icmps( ; CHECK-NEXT: ret i32 0 ; %cmp0 = icmp eq i32 %i, 0 %conv0 = zext i1 %cmp0 to i32 %cmp1 = icmp ugt i32 %i, 4 %conv1 = zext i1 %cmp1 to i32 %and = and i32 %conv0, %conv1 ret i32 %and } ; Make sure vectors work too. define <4 x i32> @and_of_zexted_icmps_vec(<4 x i32> %i) { ; CHECK-LABEL: @and_of_zexted_icmps_vec( ; CHECK-NEXT: ret <4 x i32> zeroinitializer ; %cmp0 = icmp eq <4 x i32> %i, zeroinitializer %conv0 = zext <4 x i1> %cmp0 to <4 x i32> %cmp1 = icmp slt <4 x i32> %i, zeroinitializer %conv1 = zext <4 x i1> %cmp1 to <4 x i32> %and = and <4 x i32> %conv0, %conv1 ret <4 x i32> %and } ; Try a different cast and weird types. define i5 @and_of_sexted_icmps(i3 %i) { ; CHECK-LABEL: @and_of_sexted_icmps( ; CHECK-NEXT: ret i5 0 ; %cmp0 = icmp eq i3 %i, 0 %conv0 = sext i1 %cmp0 to i5 %cmp1 = icmp ugt i3 %i, 1 %conv1 = sext i1 %cmp1 to i5 %and = and i5 %conv0, %conv1 ret i5 %and } ; Try a different cast and weird vector types. define i3 @and_of_bitcast_icmps_vec(<3 x i65> %i) { ; CHECK-LABEL: @and_of_bitcast_icmps_vec( ; CHECK-NEXT: ret i3 0 ; %cmp0 = icmp sgt <3 x i65> %i, zeroinitializer %conv0 = bitcast <3 x i1> %cmp0 to i3 %cmp1 = icmp slt <3 x i65> %i, zeroinitializer %conv1 = bitcast <3 x i1> %cmp1 to i3 %and = and i3 %conv0, %conv1 ret i3 %and } ; We can't do this if the casts are different. 
define i16 @and_of_different_cast_icmps(i8 %i) { ; CHECK-LABEL: @and_of_different_cast_icmps( ; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i8 [[I:%.*]], 0 ; CHECK-NEXT: [[CONV0:%.*]] = zext i1 [[CMP0]] to i16 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[I]], 1 ; CHECK-NEXT: [[CONV1:%.*]] = sext i1 [[CMP1]] to i16 ; CHECK-NEXT: [[AND:%.*]] = and i16 [[CONV0]], [[CONV1]] ; CHECK-NEXT: ret i16 [[AND]] ; %cmp0 = icmp eq i8 %i, 0 %conv0 = zext i1 %cmp0 to i16 %cmp1 = icmp eq i8 %i, 1 %conv1 = sext i1 %cmp1 to i16 %and = and i16 %conv0, %conv1 ret i16 %and } define <2 x i3> @and_of_different_cast_icmps_vec(<2 x i8> %i, <2 x i16> %j) { ; CHECK-LABEL: @and_of_different_cast_icmps_vec( ; CHECK-NEXT: [[CMP0:%.*]] = icmp eq <2 x i8> [[I:%.*]], zeroinitializer ; CHECK-NEXT: [[CONV0:%.*]] = zext <2 x i1> [[CMP0]] to <2 x i3> ; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt <2 x i16> [[J:%.*]], ; CHECK-NEXT: [[CONV1:%.*]] = zext <2 x i1> [[CMP1]] to <2 x i3> ; CHECK-NEXT: [[AND:%.*]] = and <2 x i3> [[CONV0]], [[CONV1]] ; CHECK-NEXT: ret <2 x i3> [[AND]] ; %cmp0 = icmp eq <2 x i8> %i, zeroinitializer %conv0 = zext <2 x i1> %cmp0 to <2 x i3> %cmp1 = icmp ugt <2 x i16> %j, %conv1 = zext <2 x i1> %cmp1 to <2 x i3> %and = and <2 x i3> %conv0, %conv1 ret <2 x i3> %and } define i32 @or_of_zexted_icmps(i32 %i) { ; CHECK-LABEL: @or_of_zexted_icmps( ; CHECK-NEXT: ret i32 1 ; %cmp0 = icmp ne i32 %i, 0 %conv0 = zext i1 %cmp0 to i32 %cmp1 = icmp uge i32 4, %i %conv1 = zext i1 %cmp1 to i32 %or = or i32 %conv0, %conv1 ret i32 %or } ; Try a different cast and weird vector types. define i3 @or_of_bitcast_icmps_vec(<3 x i65> %i) { ; CHECK-LABEL: @or_of_bitcast_icmps_vec( ; CHECK-NEXT: ret i3 bitcast (<3 x i1> to i3) ; %cmp0 = icmp sge <3 x i65> %i, zeroinitializer %conv0 = bitcast <3 x i1> %cmp0 to i3 %cmp1 = icmp slt <3 x i65> %i, zeroinitializer %conv1 = bitcast <3 x i1> %cmp1 to i3 %or = or i3 %conv0, %conv1 ret i3 %or } ; We can't simplify if the casts are different. define i16 @or_of_different_cast_icmps(i8 %i) { ; CHECK-LABEL: @or_of_different_cast_icmps( ; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i8 [[I:%.*]], 0 ; CHECK-NEXT: [[CONV0:%.*]] = zext i1 [[CMP0]] to i16 ; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[I]], 1 ; CHECK-NEXT: [[CONV1:%.*]] = sext i1 [[CMP1]] to i16 ; CHECK-NEXT: [[OR:%.*]] = or i16 [[CONV0]], [[CONV1]] ; CHECK-NEXT: ret i16 [[OR]] ; %cmp0 = icmp ne i8 %i, 0 %conv0 = zext i1 %cmp0 to i16 %cmp1 = icmp ne i8 %i, 1 %conv1 = sext i1 %cmp1 to i16 %or = or i16 %conv0, %conv1 ret i16 %or } ; (A & ~B) | (A ^ B) -> A ^ B define i32 @test43(i32 %a, i32 %b) { ; CHECK-LABEL: @test43( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %neg = xor i32 %b, -1 %and = and i32 %a, %neg %xor = xor i32 %a, %b %or = or i32 %and, %xor ret i32 %or } define i32 @test43_commuted_and(i32 %a, i32 %b) { ; CHECK-LABEL: @test43_commuted_and( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %neg = xor i32 %b, -1 %and = and i32 %neg, %a %xor = xor i32 %a, %b %or = or i32 %and, %xor ret i32 %or } ; Commute operands of the 'or'. 
; (A ^ B) | (A & ~B) -> A ^ B define i32 @test44(i32 %a, i32 %b) { ; CHECK-LABEL: @test44( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %xor = xor i32 %a, %b %neg = xor i32 %b, -1 %and = and i32 %a, %neg %or = or i32 %xor, %and ret i32 %or } define i32 @test44_commuted_and(i32 %a, i32 %b) { ; CHECK-LABEL: @test44_commuted_and( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %xor = xor i32 %a, %b %neg = xor i32 %b, -1 %and = and i32 %neg, %a %or = or i32 %xor, %and ret i32 %or } ; (~A & ~B) | (~A ^ B) -> ~A ^ B define i32 @test45(i32 %a, i32 %b) { ; CHECK-LABEL: @test45( ; CHECK-NEXT: [[NEGB:%.*]] = xor i32 [[B:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[NEGB]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %negb = xor i32 %b, -1 %and = and i32 %nega, %negb %xor = xor i32 %a, %negb %or = or i32 %and, %xor ret i32 %or } define i32 @test45_commuted_and(i32 %a, i32 %b) { ; CHECK-LABEL: @test45_commuted_and( ; CHECK-NEXT: [[NEGB:%.*]] = xor i32 [[B:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[NEGB]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %negb = xor i32 %b, -1 %and = and i32 %negb, %nega %xor = xor i32 %a, %negb %or = or i32 %and, %xor ret i32 %or } ; Commute operands of the 'or'. ; (~A ^ B) | (~A & ~B) -> ~A ^ B define i32 @test46(i32 %a, i32 %b) { ; CHECK-LABEL: @test46( ; CHECK-NEXT: [[NEGB:%.*]] = xor i32 [[B:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[NEGB]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %negb = xor i32 %b, -1 %and = and i32 %nega, %negb %xor = xor i32 %a, %negb %or = or i32 %xor, %and ret i32 %or } ; (~A & ~B) | (~A ^ B) -> ~A ^ B define i32 @test46_commuted_and(i32 %a, i32 %b) { ; CHECK-LABEL: @test46_commuted_and( ; CHECK-NEXT: [[NEGB:%.*]] = xor i32 [[B:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], [[NEGB]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %negb = xor i32 %b, -1 %and = and i32 %negb, %nega %xor = xor i32 %a, %negb %or = or i32 %xor, %and ret i32 %or } ; (~A ^ B) | (A & B) -> ~A ^ B define i32 @test47(i32 %a, i32 %b) { ; CHECK-LABEL: @test47( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %a, %b %xor = xor i32 %nega, %b %or = or i32 %xor, %and ret i32 %or } define i32 @test48(i32 %a, i32 %b) { ; CHECK-LABEL: @test48( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %a, %b %xor = xor i32 %b, %nega %or = or i32 %xor, %and ret i32 %or } define i32 @test49(i32 %a, i32 %b) { ; CHECK-LABEL: @test49( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %b, %a %xor = xor i32 %b, %nega %or = or i32 %xor, %and ret i32 %or } define i32 @test50(i32 %a, i32 %b) { ; CHECK-LABEL: @test50( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %b, %a %xor = xor i32 %nega, %b %or = or i32 %xor, %and ret i32 %or } define i32 @test51(i32 %a, i32 %b) { ; CHECK-LABEL: @test51( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] ; CHECK-NEXT: ret i32 
[[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %a, %b %xor = xor i32 %nega, %b %or = or i32 %and, %xor ret i32 %or } define i32 @test52(i32 %a, i32 %b) { ; CHECK-LABEL: @test52( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %a, %b %xor = xor i32 %b, %nega %or = or i32 %and, %xor ret i32 %or } define i32 @test53(i32 %a, i32 %b) { ; CHECK-LABEL: @test53( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %b, %a %xor = xor i32 %b, %nega %or = or i32 %and, %xor ret i32 %or } define i32 @test54(i32 %a, i32 %b) { ; CHECK-LABEL: @test54( ; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] ; CHECK-NEXT: ret i32 [[XOR]] ; %nega = xor i32 %a, -1 %and = and i32 %b, %a %xor = xor i32 %nega, %b %or = or i32 %and, %xor ret i32 %or } ; (A & B) | ~(A ^ B) -> ~(A ^ B) define i32 @test55(i32 %a, i32 %b) { ; CHECK-LABEL: @test55( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]] ; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND]], [[XNOR]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %a, %b %xor = xor i32 %a, %b %xnor = xor i32 %xor, -1 %or = or i32 %and, %xnor ret i32 %or } ; ~(A ^ B) | (A & B) -> ~(A ^ B) define i32 @test56(i32 %a, i32 %b) { ; CHECK-LABEL: @test56( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]] ; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[XNOR]], [[AND]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %a, %b %xor = xor i32 %a, %b %xnor = xor i32 %xor, -1 %or = or i32 %xnor, %and ret i32 %or } ; (B & A) | ~(A ^ B) -> ~(A ^ B) define i32 @test57(i32 %a, i32 %b) { ; CHECK-LABEL: @test57( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]] ; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND]], [[XNOR]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %b, %a %xor = xor i32 %a, %b %xnor = xor i32 %xor, -1 %or = or i32 %and, %xnor ret i32 %or } ; ~(A ^ B) | (A & B) -> ~(A ^ B) define i32 @test58(i32 %a, i32 %b) { ; CHECK-LABEL: @test58( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]] ; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[XNOR]], [[AND]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %b, %a %xor = xor i32 %a, %b %xnor = xor i32 %xor, -1 %or = or i32 %xnor, %and ret i32 %or } define i8 @lshr_perfect_mask(i8 %x) { ; CHECK-LABEL: @lshr_perfect_mask( ; CHECK-NEXT: [[SH:%.*]] = lshr i8 [[X:%.*]], 5 ; CHECK-NEXT: ret i8 [[SH]] ; %sh = lshr i8 %x, 5 %mask = and i8 %sh, 7 ; 0x07 ret i8 %mask } define <2 x i8> @lshr_oversized_mask_splat(<2 x i8> %x) { ; CHECK-LABEL: @lshr_oversized_mask_splat( ; CHECK-NEXT: [[SH:%.*]] = lshr <2 x i8> [[X:%.*]], ; CHECK-NEXT: ret <2 x i8> [[SH]] ; %sh = lshr <2 x i8> %x, %mask = and <2 x i8> %sh, ; 0x87 ret <2 x i8> %mask } define i8 @lshr_undersized_mask(i8 %x) { ; CHECK-LABEL: @lshr_undersized_mask( ; CHECK-NEXT: [[SH:%.*]] = lshr i8 [[X:%.*]], 5 ; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], -2 ; CHECK-NEXT: ret i8 [[MASK]] ; %sh = lshr i8 %x, 5 %mask = and i8 %sh, -2 ; 0xFE ret i8 %mask } 
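; A minimal illustrative sketch in the spirit of the three lshr tests above
; (hypothetical extra case, not part of the checked-in test file): after
; "lshr i8 %x, 5" only bits 0-2 can be non-zero, so an 'and' whose mask covers
; those bits changes nothing and -instsimplify should return the shift itself.
define i8 @lshr_mask_sketch_example(i8 %x) {
  %sh = lshr i8 %x, 5     ; result is always in the range [0, 7]
  %mask = and i8 %sh, 15  ; 0x0F covers bits 0-2, so this folds away to %sh
  ret i8 %mask
}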
define <2 x i8> @shl_perfect_mask_splat(<2 x i8> %x) { ; CHECK-LABEL: @shl_perfect_mask_splat( ; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> [[X:%.*]], ; CHECK-NEXT: ret <2 x i8> [[SH]] ; %sh = shl <2 x i8> %x, %mask = and <2 x i8> %sh, ; 0xC0 ret <2 x i8> %mask } define i8 @shl_oversized_mask(i8 %x) { ; CHECK-LABEL: @shl_oversized_mask( ; CHECK-NEXT: [[SH:%.*]] = shl i8 [[X:%.*]], 6 ; CHECK-NEXT: ret i8 [[SH]] ; %sh = shl i8 %x, 6 %mask = and i8 %sh, 195 ; 0xC3 ret i8 %mask } define <2 x i8> @shl_undersized_mask_splat(<2 x i8> %x) { ; CHECK-LABEL: @shl_undersized_mask_splat( ; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], ; CHECK-NEXT: ret <2 x i8> [[MASK]] ; %sh = shl <2 x i8> %x, %mask = and <2 x i8> %sh, ; 0x88 ret <2 x i8> %mask } define i32 @reversed_not(i32 %a) { ; CHECK-LABEL: @reversed_not( ; CHECK-NEXT: ret i32 -1 ; %nega = xor i32 -1, %a %or = or i32 %a, %nega ret i32 %or } define i64 @shl_or_and1(i32 %a, i1 %b) { ; CHECK-LABEL: @shl_or_and1( -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[B:%.*]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP1]], 32 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1 -; CHECK-NEXT: ret i64 [[TMP5]] +; CHECK-NEXT: ret i64 [[TMP2]] ; %tmp1 = zext i32 %a to i64 %tmp2 = zext i1 %b to i64 %tmp3 = shl nuw i64 %tmp1, 32 %tmp4 = or i64 %tmp2, %tmp3 %tmp5 = and i64 %tmp4, 1 ret i64 %tmp5 } define i64 @shl_or_and2(i32 %a, i1 %b) { ; CHECK-LABEL: @shl_or_and2( ; CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[B:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP1]], 32 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4294967296 -; CHECK-NEXT: ret i64 [[TMP5]] +; CHECK-NEXT: ret i64 [[TMP3]] ; %tmp1 = zext i1 %b to i64 %tmp2 = zext i32 %a to i64 %tmp3 = shl nuw i64 %tmp1, 32 %tmp4 = or i64 %tmp2, %tmp3 %tmp5 = and i64 %tmp4, 4294967296 ret i64 %tmp5 } -; concatinate two 32-bit integers and extract lower 32-bit +; concatenate two 32-bit integers and extract lower 32-bit define i64 @shl_or_and3(i32 %a, i32 %b) { ; CHECK-LABEL: @shl_or_and3( -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[B:%.*]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP1]], 32 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4294967295 -; CHECK-NEXT: ret i64 [[TMP5]] +; CHECK-NEXT: ret i64 [[TMP2]] ; %tmp1 = zext i32 %a to i64 %tmp2 = zext i32 %b to i64 %tmp3 = shl nuw i64 %tmp1, 32 %tmp4 = or i64 %tmp2, %tmp3 %tmp5 = and i64 %tmp4, 4294967295 ret i64 %tmp5 } -; concatinate two 16-bit integers and extract higher 16-bit +; concatenate two 16-bit integers and extract higher 16-bit define i32 @shl_or_and4(i16 %a, i16 %b) { ; CHECK-LABEL: @shl_or_and4( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP1]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], -65536 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: ret i32 [[TMP3]] ; %tmp1 = zext i16 %a to i32 %tmp2 = zext i16 %b to i32 %tmp3 = shl nuw i32 %tmp1, 16 %tmp4 = or i32 %tmp2, %tmp3 %tmp5 = and i32 %tmp4, 4294901760 ; mask with 0xFFFF0000 ret i32 %tmp5 } define i128 @shl_or_and5(i64 %a, i1 %b) { ; 
CHECK-LABEL: @shl_or_and5( -; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[A:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[B:%.*]] to i128 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i128 [[TMP1]], 64 -; CHECK-NEXT: [[TMP4:%.*]] = or i128 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and i128 [[TMP4]], 1 -; CHECK-NEXT: ret i128 [[TMP5]] +; CHECK-NEXT: ret i128 [[TMP2]] ; %tmp1 = zext i64 %a to i128 %tmp2 = zext i1 %b to i128 %tmp3 = shl nuw i128 %tmp1, 64 %tmp4 = or i128 %tmp2, %tmp3 %tmp5 = and i128 %tmp4, 1 ret i128 %tmp5 } ; A variation of above test cases; it fails due to the mask value define i32 @shl_or_and6(i16 %a, i16 %b) { ; CHECK-LABEL: @shl_or_and6( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], -65535 ; CHECK-NEXT: ret i32 [[TMP5]] ; %tmp1 = zext i16 %a to i32 %tmp2 = zext i16 %b to i32 %tmp3 = shl nuw i32 %tmp1, 16 %tmp4 = or i32 %tmp2, %tmp3 %tmp5 = and i32 %tmp4, 4294901761 ; mask with 0xFFFF0001 ret i32 %tmp5 } ; A variation of above test cases; it fails due to the mask value define i32 @shl_or_and7(i16 %a, i16 %b) { ; CHECK-LABEL: @shl_or_and7( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], -131072 ; CHECK-NEXT: ret i32 [[TMP5]] ; %tmp1 = zext i16 %a to i32 %tmp2 = zext i16 %b to i32 %tmp3 = shl nuw i32 %tmp1, 16 %tmp4 = or i32 %tmp2, %tmp3 %tmp5 = and i32 %tmp4, 4294836224 ; mask with 0xFFFE0000 ret i32 %tmp5 } ; A variation of above test cases; it fails due to the mask value define i32 @shl_or_and8(i16 %a, i16 %b) { ; CHECK-LABEL: @shl_or_and8( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 131071 ; CHECK-NEXT: ret i32 [[TMP5]] ; %tmp1 = zext i16 %a to i32 %tmp2 = zext i16 %b to i32 %tmp3 = shl nuw i32 %tmp1, 16 %tmp4 = or i32 %tmp2, %tmp3 %tmp5 = and i32 %tmp4, 131071 ; mask with 0x1FFFF ret i32 %tmp5 } define <2 x i64> @shl_or_and1v(<2 x i32> %a, <2 x i1> %b) { ; CHECK-LABEL: @shl_or_and1v( -; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <2 x i64> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP4]], -; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP2]] ; %tmp1 = zext <2 x i32> %a to <2 x i64> %tmp2 = zext <2 x i1> %b to <2 x i64> %tmp3 = shl nuw <2 x i64> %tmp1, %tmp4 = or <2 x i64> %tmp3, %tmp2 %tmp5 = and <2 x i64> %tmp4, ret <2 x i64> %tmp5 } define <2 x i64> @shl_or_and2v(<2 x i32> %a, <2 x i1> %b) { ; CHECK-LABEL: @shl_or_and2v( ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <2 x i64> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP4]], -; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; 
%tmp1 = zext <2 x i1> %b to <2 x i64> %tmp2 = zext <2 x i32> %a to <2 x i64> %tmp3 = shl nuw <2 x i64> %tmp1, %tmp4 = or <2 x i64> %tmp2, %tmp3 %tmp5 = and <2 x i64> %tmp4, ret <2 x i64> %tmp5 } define <2 x i32> @shl_or_and3v(<2 x i16> %a, <2 x i16> %b) { ; A variation of above test case, but fails due to the mask value ; CHECK-LABEL: @shl_or_and3v( ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[A:%.*]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[B:%.*]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <2 x i32> [[TMP1]], ; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], ; CHECK-NEXT: ret <2 x i32> [[TMP5]] ; %tmp1 = zext <2 x i16> %a to <2 x i32> %tmp2 = zext <2 x i16> %b to <2 x i32> %tmp3 = shl nuw <2 x i32> %tmp1, %tmp4 = or <2 x i32> %tmp2, %tmp3 %tmp5 = and <2 x i32> %tmp4, ; mask with 0xFFFF0001 ret <2 x i32> %tmp5 } Index: vendor/llvm/dist-release_70/test/Transforms/InstSimplify/floating-point-compare.ll =================================================================== --- vendor/llvm/dist-release_70/test/Transforms/InstSimplify/floating-point-compare.ll (revision 337630) +++ vendor/llvm/dist-release_70/test/Transforms/InstSimplify/floating-point-compare.ll (revision 337631) @@ -1,376 +1,378 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s ; Infinity define i1 @inf0(double %arg) { ; CHECK-LABEL: @inf0( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp ogt double %arg, 0x7FF0000000000000 ret i1 %tmp } define i1 @inf1(double %arg) { ; CHECK-LABEL: @inf1( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp ule double %arg, 0x7FF0000000000000 ret i1 %tmp } ; Negative infinity define i1 @ninf0(double %arg) { ; CHECK-LABEL: @ninf0( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp olt double %arg, 0xFFF0000000000000 ret i1 %tmp } define i1 @ninf1(double %arg) { ; CHECK-LABEL: @ninf1( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp uge double %arg, 0xFFF0000000000000 ret i1 %tmp } ; NaNs define i1 @nan0(double %arg) { ; CHECK-LABEL: @nan0( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp ord double %arg, 0x7FF00000FFFFFFFF ret i1 %tmp } define i1 @nan1(double %arg) { ; CHECK-LABEL: @nan1( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp oeq double %arg, 0x7FF00000FFFFFFFF ret i1 %tmp } define i1 @nan2(double %arg) { ; CHECK-LABEL: @nan2( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp olt double %arg, 0x7FF00000FFFFFFFF ret i1 %tmp } define i1 @nan3(double %arg) { ; CHECK-LABEL: @nan3( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp uno double %arg, 0x7FF00000FFFFFFFF ret i1 %tmp } define i1 @nan4(double %arg) { ; CHECK-LABEL: @nan4( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp une double %arg, 0x7FF00000FFFFFFFF ret i1 %tmp } define i1 @nan5(double %arg) { ; CHECK-LABEL: @nan5( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp ult double %arg, 0x7FF00000FFFFFFFF ret i1 %tmp } ; Negative NaN. 
define i1 @nnan0(double %arg) { ; CHECK-LABEL: @nnan0( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp ord double %arg, 0xFFF00000FFFFFFFF ret i1 %tmp } define i1 @nnan1(double %arg) { ; CHECK-LABEL: @nnan1( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp oeq double %arg, 0xFFF00000FFFFFFFF ret i1 %tmp } define i1 @nnan2(double %arg) { ; CHECK-LABEL: @nnan2( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp olt double %arg, 0xFFF00000FFFFFFFF ret i1 %tmp } define i1 @nnan3(double %arg) { ; CHECK-LABEL: @nnan3( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp uno double %arg, 0xFFF00000FFFFFFFF ret i1 %tmp } define i1 @nnan4(double %arg) { ; CHECK-LABEL: @nnan4( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp une double %arg, 0xFFF00000FFFFFFFF ret i1 %tmp } define i1 @nnan5(double %arg) { ; CHECK-LABEL: @nnan5( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp ult double %arg, 0xFFF00000FFFFFFFF ret i1 %tmp } ; Negative zero. define i1 @nzero0() { ; CHECK-LABEL: @nzero0( ; CHECK-NEXT: ret i1 true ; %tmp = fcmp oeq double 0.0, -0.0 ret i1 %tmp } define i1 @nzero1() { ; CHECK-LABEL: @nzero1( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp ogt double 0.0, -0.0 ret i1 %tmp } ; No enlightenment here. define i1 @one_with_self(double %arg) { ; CHECK-LABEL: @one_with_self( ; CHECK-NEXT: ret i1 false ; %tmp = fcmp one double %arg, %arg ret i1 %tmp } ; These tests choose arbitrarily between float and double, ; and between uge and olt, to give reasonble coverage ; without combinatorial explosion. declare half @llvm.fabs.f16(half) declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) declare <2 x float> @llvm.fabs.v2f32(<2 x float>) declare <2 x double> @llvm.fabs.v2f64(<2 x double>) declare float @llvm.sqrt.f32(float) declare double @llvm.powi.f64(double,i32) declare float @llvm.exp.f32(float) declare float @llvm.minnum.f32(float, float) declare float @llvm.maxnum.f32(float, float) declare double @llvm.exp2.f64(double) declare float @llvm.fma.f32(float,float,float) declare void @expect_equal(i1,i1) define i1 @orderedLessZeroTree(float,float,float,float) { ; CHECK-LABEL: @orderedLessZeroTree( ; CHECK-NEXT: ret i1 true ; %square = fmul float %0, %0 %abs = call float @llvm.fabs.f32(float %1) %sqrt = call float @llvm.sqrt.f32(float %2) %fma = call float @llvm.fma.f32(float %3, float %3, float %sqrt) %div = fdiv float %square, %abs %rem = frem float %sqrt, %fma %add = fadd float %div, %rem %uge = fcmp uge float %add, 0.000000e+00 ret i1 %uge } define i1 @orderedLessZeroExpExt(float) { ; CHECK-LABEL: @orderedLessZeroExpExt( ; CHECK-NEXT: ret i1 true ; %a = call float @llvm.exp.f32(float %0) %b = fpext float %a to double %uge = fcmp uge double %b, 0.000000e+00 ret i1 %uge } define i1 @orderedLessZeroExp2Trunc(double) { ; CHECK-LABEL: @orderedLessZeroExp2Trunc( ; CHECK-NEXT: ret i1 false ; %a = call double @llvm.exp2.f64(double %0) %b = fptrunc double %a to float %olt = fcmp olt float %b, 0.000000e+00 ret i1 %olt } define i1 @orderedLessZeroPowi(double,double) { ; CHECK-LABEL: @orderedLessZeroPowi( ; CHECK-NEXT: ret i1 false ; ; Even constant exponent %a = call double @llvm.powi.f64(double %0, i32 2) %square = fmul double %1, %1 ; Odd constant exponent with provably non-negative base %b = call double @llvm.powi.f64(double %square, i32 3) %c = fadd double %a, %b %olt = fcmp olt double %b, 0.000000e+00 ret i1 %olt } define i1 @orderedLessZeroUIToFP(i32) { ; CHECK-LABEL: @orderedLessZeroUIToFP( ; CHECK-NEXT: ret i1 true ; %a = uitofp i32 %0 to float %uge = fcmp uge float %a, 0.000000e+00 ret i1 %uge } define i1 @orderedLessZeroSelect(float, float) 
; CHECK-LABEL: @orderedLessZeroSelect(
; CHECK-NEXT: ret i1 true
;
  %a = call float @llvm.exp.f32(float %0)
  %b = call float @llvm.fabs.f32(float %1)
  %c = fcmp olt float %0, %1
  %d = select i1 %c, float %a, float %b
  %e = fadd float %d, 1.0
  %uge = fcmp uge float %e, 0.000000e+00
  ret i1 %uge
}

define i1 @orderedLessZeroMinNum(float, float) {
; CHECK-LABEL: @orderedLessZeroMinNum(
; CHECK-NEXT: ret i1 true
;
  %a = call float @llvm.exp.f32(float %0)
  %b = call float @llvm.fabs.f32(float %1)
  %c = call float @llvm.minnum.f32(float %a, float %b)
  %uge = fcmp uge float %c, 0.000000e+00
  ret i1 %uge
}

-; FIXME: This is wrong.
; PR37776: https://bugs.llvm.org/show_bug.cgi?id=37776
; exp() may return nan, leaving %1 as the unknown result, so we can't simplify.

define i1 @orderedLessZeroMaxNum(float, float) {
; CHECK-LABEL: @orderedLessZeroMaxNum(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[A:%.*]] = call float @llvm.exp.f32(float [[TMP0:%.*]])
+; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[A]], float [[TMP1:%.*]])
+; CHECK-NEXT: [[UGE:%.*]] = fcmp uge float [[B]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[UGE]]
;
  %a = call float @llvm.exp.f32(float %0)
  %b = call float @llvm.maxnum.f32(float %a, float %1)
  %uge = fcmp uge float %b, 0.000000e+00
  ret i1 %uge
}

define i1 @known_positive_olt_with_negative_constant(double %a) {
; CHECK-LABEL: @known_positive_olt_with_negative_constant(
; CHECK-NEXT: ret i1 false
;
  %call = call double @llvm.fabs.f64(double %a)
  %cmp = fcmp olt double %call, -1.0
  ret i1 %cmp
}

define <2 x i1> @known_positive_ole_with_negative_constant_splat_vec(<2 x i32> %a) {
; CHECK-LABEL: @known_positive_ole_with_negative_constant_splat_vec(
; CHECK-NEXT: ret <2 x i1> zeroinitializer
;
  %call = uitofp <2 x i32> %a to <2 x double>
  %cmp = fcmp ole <2 x double> %call, <double -2.0, double -2.0>
  ret <2 x i1> %cmp
}

define i1 @known_positive_ugt_with_negative_constant(i32 %a) {
; CHECK-LABEL: @known_positive_ugt_with_negative_constant(
; CHECK-NEXT: ret i1 true
;
  %call = uitofp i32 %a to float
  %cmp = fcmp ugt float %call, -3.0
  ret i1 %cmp
}

define <2 x i1> @known_positive_uge_with_negative_constant_splat_vec(<2 x float> %a) {
; CHECK-LABEL: @known_positive_uge_with_negative_constant_splat_vec(
; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
;
  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
  %cmp = fcmp uge <2 x float> %call, <float -4.0, float -4.0>
  ret <2 x i1> %cmp
}

define i1 @known_positive_oeq_with_negative_constant(half %a) {
; CHECK-LABEL: @known_positive_oeq_with_negative_constant(
; CHECK-NEXT: ret i1 false
;
  %call = call half @llvm.fabs.f16(half %a)
  %cmp = fcmp oeq half %call, -5.0
  ret i1 %cmp
}

define <2 x i1> @known_positive_une_with_negative_constant_splat_vec(<2 x i32> %a) {
; CHECK-LABEL: @known_positive_une_with_negative_constant_splat_vec(
; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
;
  %call = uitofp <2 x i32> %a to <2 x half>
  %cmp = fcmp une <2 x half> %call, <half -6.0, half -6.0>
  ret <2 x i1> %cmp
}

define i1 @nonans1(double %in1, double %in2) {
; CHECK-LABEL: @nonans1(
; CHECK-NEXT: ret i1 false
;
  %cmp = fcmp nnan uno double %in1, %in2
  ret i1 %cmp
}

define i1 @nonans2(double %in1, double %in2) {
; CHECK-LABEL: @nonans2(
; CHECK-NEXT: ret i1 true
;
  %cmp = fcmp nnan ord double %in1, %in2
  ret i1 %cmp
}

define <2 x i1> @orderedCompareWithNaNVector(<2 x double> %A) {
; CHECK-LABEL: @orderedCompareWithNaNVector(
; CHECK-NEXT: ret <2 x i1> zeroinitializer
;
  %cmp = fcmp olt <2 x double> %A, <double 0xFFFFFFFFFFFFFFFF, double 0xFFFFFFFFFFFFFFFF>
  ret <2 x i1> %cmp
}

define <2 x i1> @orderedCompareWithNaNVector_undef_elt(<2 x double> %A) {
; CHECK-LABEL: @orderedCompareWithNaNVector_undef_elt(
; CHECK-NEXT: ret <2 x i1> zeroinitializer
;
  %cmp = fcmp olt <2 x double> %A, <double 0xFFFFFFFFFFFFFFFF, double undef>
  ret <2 x i1> %cmp
}

define <2 x i1> @unorderedCompareWithNaNVector_undef_elt(<2 x double> %A) {
; CHECK-LABEL: @unorderedCompareWithNaNVector_undef_elt(
; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
;
  %cmp = fcmp ult <2 x double> %A, <double undef, double 0xFFFFFFFFFFFFFFFF>
  ret <2 x i1> %cmp
}
Index: vendor/llvm/dist-release_70/test/Transforms/NewGVN/pair_jumpthread.ll
===================================================================
--- vendor/llvm/dist-release_70/test/Transforms/NewGVN/pair_jumpthread.ll (revision 337630)
+++ vendor/llvm/dist-release_70/test/Transforms/NewGVN/pair_jumpthread.ll (revision 337631)
@@ -1,122 +1,120 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -newgvn -S | FileCheck %s
; RUN: opt < %s -newgvn -jump-threading -S | FileCheck --check-prefix=CHECK-JT %s
-; This test is expected to fail until the transformation is committed.
-; XFAIL: *

define signext i32 @testBI(i32 signext %v) {
; Test with std::pair<bool, int>
; based on the following C++ code
; std::pair<bool, int> callee(int v) {
;   int a = dummy(v);
;   if (a) return std::make_pair(true, dummy(a));
;   else return std::make_pair(v < 0, v);
; }
; int func(int v) {
;   std::pair<bool, int> rc = callee(v);
;   if (rc.first) dummy(0);
;   return rc.second;
; }
; CHECK-LABEL: @testBI(
; CHECK: _ZL6calleei.exit:
; CHECK: [[PHIOFOPS:%.*]] = phi i64 [ 1, %if.then.i ], [ {{%.*}}, %if.else.i ]
; CHECK: [[TOBOOL:%.*]] = icmp eq i64 [[PHIOFOPS]], 0
;
; CHECK-JT-LABEL: @testBI(
; CHECK-JT: _ZL6calleei.exit.thread:
;
entry:
  %call.i = call signext i32 @dummy(i32 signext %v)
  %tobool.i = icmp eq i32 %call.i, 0
  br i1 %tobool.i, label %if.else.i, label %if.then.i

if.then.i:                                        ; preds = %entry
  %call2.i = call signext i32 @dummy(i32 signext %call.i)
  %retval.sroa.22.0.insert.ext.i.i = zext i32 %call2.i to i64
  %retval.sroa.22.0.insert.shift.i.i = shl nuw i64 %retval.sroa.22.0.insert.ext.i.i, 32
  %retval.sroa.0.0.insert.insert.i.i = or i64 %retval.sroa.22.0.insert.shift.i.i, 1
  br label %_ZL6calleei.exit

if.else.i:                                        ; preds = %entry
  %.lobit.i = lshr i32 %v, 31
  %0 = zext i32 %.lobit.i to i64
  %retval.sroa.22.0.insert.ext.i8.i = zext i32 %v to i64
  %retval.sroa.22.0.insert.shift.i9.i = shl nuw i64 %retval.sroa.22.0.insert.ext.i8.i, 32
  %retval.sroa.0.0.insert.insert.i11.i = or i64 %retval.sroa.22.0.insert.shift.i9.i, %0
  br label %_ZL6calleei.exit

_ZL6calleei.exit:                                 ; preds = %if.then.i, %if.else.i
  %retval.sroa.0.0.i = phi i64 [ %retval.sroa.0.0.insert.insert.i.i, %if.then.i ], [ %retval.sroa.0.0.insert.insert.i11.i, %if.else.i ]
  %rc.sroa.43.0.extract.shift = lshr i64 %retval.sroa.0.0.i, 32
  %rc.sroa.43.0.extract.trunc = trunc i64 %rc.sroa.43.0.extract.shift to i32
  %1 = and i64 %retval.sroa.0.0.i, 1
  %tobool = icmp eq i64 %1, 0
  br i1 %tobool, label %if.end, label %if.then

if.then:                                          ; preds = %_ZL6calleei.exit
  %call1 = call signext i32 @dummy(i32 signext 0)
  br label %if.end

if.end:                                           ; preds = %_ZL6calleei.exit, %if.then
  ret i32 %rc.sroa.43.0.extract.trunc
}

define signext i32 @testIB(i32 signext %v) {
; Test with std::pair<int, bool>
; based on the following C++ code
; std::pair<int, bool> callee(int v) {
;   int a = dummy(v);
;   if (a) return std::make_pair(dummy(v), true);
;   else return std::make_pair(v, v < 0);
; }
; int func(int v) {
;   std::pair<int, bool> rc = callee(v);
;   if (rc.second) dummy(0);
;   return rc.first;
; }
; CHECK-LABEL: @testIB(
; CHECK: _ZL6calleei.exit:
; CHECK: [[PHIOFOPS:%.*]] = phi i64 [ 4294967296, %if.then.i ], [ {{%.*}}, %if.else.i ]
; CHECK: [[TOBOOL:%.*]] = icmp eq i64 [[PHIOFOPS]], 0
;
; CHECK-JT-LABEL: @testIB(
; CHECK-JT: _ZL6calleei.exit.thread:
;
entry:
  %call.i = call signext i32 @dummy(i32 signext %v)
  %tobool.i = icmp eq i32 %call.i, 0
  br i1 %tobool.i, label %if.else.i, label %if.then.i

if.then.i:                                        ; preds = %entry
  %call1.i = call signext i32 @dummy(i32 signext %v)
  %retval.sroa.0.0.insert.ext.i.i = zext i32 %call1.i to i64
  %retval.sroa.0.0.insert.insert.i.i = or i64 %retval.sroa.0.0.insert.ext.i.i, 4294967296
  br label %_ZL6calleei.exit

if.else.i:                                        ; preds = %entry
  %.lobit.i = lshr i32 %v, 31
  %0 = zext i32 %.lobit.i to i64
  %retval.sroa.2.0.insert.shift.i8.i = shl nuw nsw i64 %0, 32
  %retval.sroa.0.0.insert.ext.i9.i = zext i32 %v to i64
  %retval.sroa.0.0.insert.insert.i10.i = or i64 %retval.sroa.2.0.insert.shift.i8.i, %retval.sroa.0.0.insert.ext.i9.i
  br label %_ZL6calleei.exit

_ZL6calleei.exit:                                 ; preds = %if.then.i, %if.else.i
  %retval.sroa.0.0.i = phi i64 [ %retval.sroa.0.0.insert.insert.i.i, %if.then.i ], [ %retval.sroa.0.0.insert.insert.i10.i, %if.else.i ]
  %rc.sroa.0.0.extract.trunc = trunc i64 %retval.sroa.0.0.i to i32
  %1 = and i64 %retval.sroa.0.0.i, 4294967296
  %tobool = icmp eq i64 %1, 0
  br i1 %tobool, label %if.end, label %if.then

if.then:                                          ; preds = %_ZL6calleei.exit
  %call1 = call signext i32 @dummy(i32 signext 0)
  br label %if.end

if.end:                                           ; preds = %_ZL6calleei.exit, %if.then
  ret i32 %rc.sroa.0.0.extract.trunc
}

declare signext i32 @dummy(i32 signext %v)
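The RUN lines at the top of each test file above are the authoritative way these checks are exercised: lit substitutes %s with the test file path, pipes the file through opt, and verifies the printed IR with FileCheck (pair_jumpthread.ll additionally checks the -newgvn -jump-threading pipeline under the CHECK-JT prefix). As a rough sketch of how one might replay this by hand from an LLVM build tree, with the build/ and source paths below being illustrative assumptions rather than part of this patch:

    # Run the single test through lit (paths assumed; adjust to your tree).
    ./build/bin/llvm-lit -v llvm/test/Transforms/NewGVN/pair_jumpthread.ll
    # Or replay the RUN lines directly: opt reads the IR from stdin and
    # FileCheck matches its output against the CHECK lines in the same file.
    ./build/bin/opt < llvm/test/Transforms/NewGVN/pair_jumpthread.ll -newgvn -S \
      | ./build/bin/FileCheck llvm/test/Transforms/NewGVN/pair_jumpthread.ll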