Index: vendor/llvm/dist-release_70/cmake/modules/HandleLLVMOptions.cmake =================================================================== --- vendor/llvm/dist-release_70/cmake/modules/HandleLLVMOptions.cmake (revision 337999) +++ vendor/llvm/dist-release_70/cmake/modules/HandleLLVMOptions.cmake (revision 338000) @@ -1,886 +1,887 @@ # This CMake module is responsible for interpreting the user defined LLVM_ # options and executing the appropriate CMake commands to realize the users' # selections. # This is commonly needed so make sure it's defined before we include anything # else. string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) include(CheckCompilerVersion) include(HandleLLVMStdlib) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) if(CMAKE_LINKER MATCHES "lld-link.exe" OR (WIN32 AND LLVM_USE_LINKER STREQUAL "lld")) set(LINKER_IS_LLD_LINK TRUE) else() set(LINKER_IS_LLD_LINK FALSE) endif() set(LLVM_ENABLE_LTO OFF CACHE STRING "Build LLVM with LTO. May be specified as Thin or Full to use a particular kind of LTO") string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO) # Ninja Job Pool support # The following only works with the Ninja generator in CMake >= 3.0. set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING "Define the maximum number of concurrent compilation jobs.") if(LLVM_PARALLEL_COMPILE_JOBS) if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja") message(WARNING "Job pooling is only available with Ninja generators.") else() set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${LLVM_PARALLEL_COMPILE_JOBS}) set(CMAKE_JOB_POOL_COMPILE compile_job_pool) endif() endif() set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING "Define the maximum number of concurrent link jobs.") if(CMAKE_MAKE_PROGRAM MATCHES "ninja") if(NOT LLVM_PARALLEL_LINK_JOBS AND uppercase_LLVM_ENABLE_LTO STREQUAL "THIN") message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.") set(LLVM_PARALLEL_LINK_JOBS "2") endif() if(LLVM_PARALLEL_LINK_JOBS) set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${LLVM_PARALLEL_LINK_JOBS}) set(CMAKE_JOB_POOL_LINK link_job_pool) endif() elseif(LLVM_PARALLEL_LINK_JOBS) message(WARNING "Job pooling is only available with Ninja generators.") endif() if( LLVM_ENABLE_ASSERTIONS ) # MSVC doesn't like _DEBUG on release builds. See PR 4379. if( NOT MSVC ) add_definitions( -D_DEBUG ) endif() # On non-Debug builds cmake automatically defines NDEBUG, so we # explicitly undefine it: if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) add_definitions( -UNDEBUG ) # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines. foreach (flags_var_to_scrub CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_C_FLAGS_MINSIZEREL) string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " " "${flags_var_to_scrub}" "${${flags_var_to_scrub}}") endforeach() endif() endif() if(LLVM_ENABLE_EXPENSIVE_CHECKS) add_definitions(-DEXPENSIVE_CHECKS) add_definitions(-D_GLIBCXX_DEBUG) endif() string(TOUPPER "${LLVM_ABI_BREAKING_CHECKS}" uppercase_LLVM_ABI_BREAKING_CHECKS) if( uppercase_LLVM_ABI_BREAKING_CHECKS STREQUAL "WITH_ASSERTS" ) if( LLVM_ENABLE_ASSERTIONS ) set( LLVM_ENABLE_ABI_BREAKING_CHECKS 1 ) endif() elseif( uppercase_LLVM_ABI_BREAKING_CHECKS STREQUAL "FORCE_ON" ) set( LLVM_ENABLE_ABI_BREAKING_CHECKS 1 ) elseif( uppercase_LLVM_ABI_BREAKING_CHECKS STREQUAL "FORCE_OFF" ) # We don't need to do anything special to turn off ABI breaking checks. 
elseif( NOT DEFINED LLVM_ABI_BREAKING_CHECKS ) # Treat LLVM_ABI_BREAKING_CHECKS like "FORCE_OFF" when it has not been # defined. else() message(FATAL_ERROR "Unknown value for LLVM_ABI_BREAKING_CHECKS: \"${LLVM_ABI_BREAKING_CHECKS}\"!") endif() if( LLVM_REVERSE_ITERATION ) set( LLVM_ENABLE_REVERSE_ITERATION 1 ) endif() if(WIN32) set(LLVM_HAVE_LINK_VERSION_SCRIPT 0) if(CYGWIN) set(LLVM_ON_WIN32 0) set(LLVM_ON_UNIX 1) else(CYGWIN) set(LLVM_ON_WIN32 1) set(LLVM_ON_UNIX 0) endif(CYGWIN) else(WIN32) if(FUCHSIA OR UNIX) set(LLVM_ON_WIN32 0) set(LLVM_ON_UNIX 1) if(APPLE OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(LLVM_HAVE_LINK_VERSION_SCRIPT 0) else() set(LLVM_HAVE_LINK_VERSION_SCRIPT 1) endif() else(FUCHSIA OR UNIX) MESSAGE(SEND_ERROR "Unable to determine platform") endif(FUCHSIA OR UNIX) endif(WIN32) set(EXEEXT ${CMAKE_EXECUTABLE_SUFFIX}) set(LTDL_SHLIB_EXT ${CMAKE_SHARED_LIBRARY_SUFFIX}) # We use *.dylib rather than *.so on darwin. set(LLVM_PLUGIN_EXT ${CMAKE_SHARED_LIBRARY_SUFFIX}) if(APPLE) if(LLVM_ENABLE_LLD AND LLVM_ENABLE_LTO) message(FATAL_ERROR "lld does not support LTO on Darwin") endif() # Darwin-specific linker flags for loadable modules. set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace -Wl,-undefined -Wl,suppress") endif() # Pass -Wl,-z,defs. This makes sure all symbols are defined. Otherwise a DSO # build might work on ELF but fail on MachO/COFF. if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR CYGWIN OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "OpenBSD") AND NOT LLVM_USE_SANITIZER) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs") endif() # Pass -Wl,-z,nodelete. This makes sure our shared libraries are not unloaded # by dlclose(). We need that since the CLI API relies on cross-references # between global objects which became horribly broken when one of the libraries # is unloaded. 
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,nodelete") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-z,nodelete") endif() function(append value) foreach(variable ${ARGN}) set(${variable} "${${variable}} ${value}" PARENT_SCOPE) endforeach(variable) endfunction() function(append_if condition value) if (${condition}) foreach(variable ${ARGN}) set(${variable} "${${variable}} ${value}" PARENT_SCOPE) endforeach(variable) endif() endfunction() macro(add_flag_if_supported flag name) check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}") append_if("C_SUPPORTS_${name}" "${flag}" CMAKE_C_FLAGS) check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}") append_if("CXX_SUPPORTS_${name}" "${flag}" CMAKE_CXX_FLAGS) endmacro() function(add_flag_or_print_warning flag name) check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}") check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}") if (C_SUPPORTS_${name} AND CXX_SUPPORTS_${name}) message(STATUS "Building with ${flag}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}" PARENT_SCOPE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}" PARENT_SCOPE) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${flag}" PARENT_SCOPE) else() message(WARNING "${flag} is not supported.") endif() endfunction() if( LLVM_ENABLE_LLD ) if ( LLVM_USE_LINKER ) message(FATAL_ERROR "LLVM_ENABLE_LLD and LLVM_USE_LINKER can't be set at the same time") endif() set(LLVM_USE_LINKER "lld") endif() if( LLVM_USE_LINKER ) set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -fuse-ld=${LLVM_USE_LINKER}") check_cxx_source_compiles("int main() { return 0; }" CXX_SUPPORTS_CUSTOM_LINKER) if ( NOT CXX_SUPPORTS_CUSTOM_LINKER ) message(FATAL_ERROR "Host compiler does not support '-fuse-ld=${LLVM_USE_LINKER}'") endif() set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) append("-fuse-ld=${LLVM_USE_LINKER}" CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() if( LLVM_ENABLE_PIC ) if( XCODE ) # Xcode has -mdynamic-no-pic on by default, which overrides -fPIC. I don't # know how to disable this, so just force ENABLE_PIC off for now. message(WARNING "-fPIC not supported with Xcode.") elseif( WIN32 OR CYGWIN) # On Windows all code is PIC. MinGW warns if -fPIC is used. else() add_flag_or_print_warning("-fPIC" FPIC) endif() endif() if(NOT WIN32 AND NOT CYGWIN) # MinGW warns if -fvisibility-inlines-hidden is used. check_cxx_compiler_flag("-fvisibility-inlines-hidden" SUPPORTS_FVISIBILITY_INLINES_HIDDEN_FLAG) append_if(SUPPORTS_FVISIBILITY_INLINES_HIDDEN_FLAG "-fvisibility-inlines-hidden" CMAKE_CXX_FLAGS) endif() if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) # TODO: support other platforms and toolchains. if( LLVM_BUILD_32_BITS ) message(STATUS "Building 32 bits executables and libraries.") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -m32") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -m32") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -m32") # FIXME: CMAKE_SIZEOF_VOID_P is still 8 add_definitions(-D_LARGEFILE_SOURCE) add_definitions(-D_FILE_OFFSET_BITS=64) endif( LLVM_BUILD_32_BITS ) endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) # If building on a GNU specific 32-bit system, make sure off_t is 64 bits # so that off_t can stored offset > 2GB. # Android until version N (API 24) doesn't support it. 
if (ANDROID AND (ANDROID_NATIVE_API_LEVEL LESS 24)) set(LLVM_FORCE_SMALLFILE_FOR_ANDROID TRUE) endif() if( CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT LLVM_FORCE_SMALLFILE_FOR_ANDROID) # FIXME: It isn't handled in LLVM_BUILD_32_BITS. add_definitions( -D_LARGEFILE_SOURCE ) add_definitions( -D_FILE_OFFSET_BITS=64 ) endif() if( XCODE ) # For Xcode enable several build settings that correspond to # many warnings that are on by default in Clang but are # not enabled for historical reasons. For versions of Xcode # that do not support these options they will simply # be ignored. set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_ABOUT_RETURN_TYPE "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_ABOUT_MISSING_NEWLINE "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_UNUSED_VALUE "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_UNUSED_VARIABLE "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_SIGN_COMPARE "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_UNUSED_FUNCTION "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_INITIALIZER_NOT_FULLY_BRACKETED "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_UNINITIALIZED_AUTOS "YES") set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BOOL_CONVERSION "YES") set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_EMPTY_BODY "YES") set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_ENUM_CONVERSION "YES") set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_INT_CONVERSION "YES") set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_CONSTANT_CONVERSION "YES") set(CMAKE_XCODE_ATTRIBUTE_GCC_WARN_NON_VIRTUAL_DESTRUCTOR "YES") endif() # On Win32 using MS tools, provide an option to set the number of parallel jobs # to use. if( MSVC_IDE ) set(LLVM_COMPILER_JOBS "0" CACHE STRING "Number of parallel compiler jobs. 0 means use all processors. Default is 0.") if( NOT LLVM_COMPILER_JOBS STREQUAL "1" ) if( LLVM_COMPILER_JOBS STREQUAL "0" ) add_definitions( /MP ) else() message(STATUS "Number of parallel compiler jobs set to " ${LLVM_COMPILER_JOBS}) add_definitions( /MP${LLVM_COMPILER_JOBS} ) endif() else() message(STATUS "Parallel compilation disabled") endif() endif() # set stack reserved size to ~10MB if(MSVC) # CMake previously automatically set this value for MSVC builds, but the # behavior was changed in CMake 2.8.11 (Issue 12437) to use the MSVC default # value (1 MB) which is not enough for us in tasks such as parsing recursive # C++ templates in Clang. set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /STACK:10000000") elseif(MINGW) # FIXME: Also cygwin? set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--stack,16777216") # Pass -mbig-obj to mingw gas on Win64. COFF has a 2**16 section limit, and # on Win64, every COMDAT function creates at least 3 sections: .text, .pdata, # and .xdata. if (CMAKE_SIZEOF_VOID_P EQUAL 8) append("-Wa,-mbig-obj" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() endif() if( MSVC ) if( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0 ) # For MSVC 2013, disable iterator null pointer checking in debug mode, # especially so std::equal(nullptr, nullptr, nullptr) will not assert. add_definitions("-D_DEBUG_POINTER_IMPL=") endif() include(ChooseMSVCCRT) if( MSVC11 ) add_definitions(-D_VARIADIC_MAX=10) endif() # Add definitions that make MSVC much less annoying. add_definitions( # For some reason MS wants to deprecate a bunch of standard functions... -D_CRT_SECURE_NO_DEPRECATE -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -D_CRT_NONSTDC_NO_WARNINGS -D_SCL_SECURE_NO_DEPRECATE -D_SCL_SECURE_NO_WARNINGS ) # Tell MSVC to use the Unicode version of the Win32 APIs instead of ANSI. 
add_definitions( -DUNICODE -D_UNICODE ) if (LLVM_ENABLE_WERROR) append("/WX" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif (LLVM_ENABLE_WERROR) append("/Zc:inline" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) # Allow users to request PDBs in release mode. CMake offeres the # RelWithDebInfo configuration, but it uses different optimization settings # (/Ob1 vs /Ob2 or -O2 vs -O3). LLVM provides this flag so that users can get # PDBs without changing codegen. option(LLVM_ENABLE_PDB OFF) if (LLVM_ENABLE_PDB AND uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE") append("/Zi" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) # /DEBUG disables linker GC and ICF, but we want those in Release mode. append("/DEBUG /OPT:REF /OPT:ICF" CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() # /Zc:strictStrings is incompatible with VS12's (Visual Studio 2013's) # debug mode headers. Instead of only enabling them in VS2013's debug mode, # we'll just enable them for Visual Studio 2015 (VS 14, MSVC_VERSION 1900) # and up. if (NOT (MSVC_VERSION LESS 1900)) # Disable string literal const->non-const type conversion. # "When specified, the compiler requires strict const-qualification # conformance for pointers initialized by using string literals." append("/Zc:strictStrings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif(NOT (MSVC_VERSION LESS 1900)) # "Generate Intrinsic Functions". append("/Oi" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) # "Enforce type conversion rules". append("/Zc:rvalueCast" CMAKE_CXX_FLAGS) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT LLVM_ENABLE_LTO) # clang-cl and cl by default produce non-deterministic binaries because # link.exe /incremental requires a timestamp in the .obj file. clang-cl # has the flag /Brepro to force deterministic binaries. We want to pass that # whenever you're building with clang unless you're passing /incremental # or using LTO (/Brepro with LTO would result in a warning about the flag # being unused, because we're not generating object files). # This checks CMAKE_CXX_COMPILER_ID in addition to check_cxx_compiler_flag() # because cl.exe does not emit an error on flags it doesn't understand, # letting check_cxx_compiler_flag() claim it understands all flags. check_cxx_compiler_flag("/Brepro" SUPPORTS_BREPRO) if (SUPPORTS_BREPRO) # Check if /INCREMENTAL is passed to the linker and complain that it # won't work with /Brepro. 
string(TOUPPER "${CMAKE_EXE_LINKER_FLAGS}" upper_exe_flags) string(TOUPPER "${CMAKE_MODULE_LINKER_FLAGS}" upper_module_flags) string(TOUPPER "${CMAKE_SHARED_LINKER_FLAGS}" upper_shared_flags) string(FIND "${upper_exe_flags} ${upper_module_flags} ${upper_shared_flags}" "/INCREMENTAL" linker_flag_idx) if (${linker_flag_idx} GREATER -1) message(WARNING "/Brepro not compatible with /INCREMENTAL linking - builds will be non-deterministic") else() append("/Brepro" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() endif() endif() elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE ) append_if(LLVM_ENABLE_WERROR "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(LLVM_ENABLE_WERROR "-Wno-error" CMAKE_REQUIRED_FLAGS) add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME) add_flag_if_supported("-Werror=unguarded-availability-new" WERROR_UNGUARDED_AVAILABILITY_NEW) if (LLVM_ENABLE_CXX1Y) check_cxx_compiler_flag("-std=c++1y" CXX_SUPPORTS_CXX1Y) append_if(CXX_SUPPORTS_CXX1Y "-std=c++1y" CMAKE_CXX_FLAGS) elseif(LLVM_ENABLE_CXX1Z) check_cxx_compiler_flag("-std=c++1z" CXX_SUPPORTS_CXX1Z) append_if(CXX_SUPPORTS_CXX1Z "-std=c++1z" CMAKE_CXX_FLAGS) else() check_cxx_compiler_flag("-std=c++11" CXX_SUPPORTS_CXX11) if (CXX_SUPPORTS_CXX11) if (CYGWIN OR MINGW) # MinGW and Cygwin are a bit stricter and lack things like # 'strdup', 'stricmp', etc in c++11 mode. append("-std=gnu++11" CMAKE_CXX_FLAGS) else() append("-std=c++11" CMAKE_CXX_FLAGS) endif() else() message(FATAL_ERROR "LLVM requires C++11 support but the '-std=c++11' flag isn't supported.") endif() endif() if (LLVM_ENABLE_MODULES) set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(module_flags "-fmodules -fmodules-cache-path=${PROJECT_BINARY_DIR}/module.cache") if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # On Darwin -fmodules does not imply -fcxx-modules. set(module_flags "${module_flags} -fcxx-modules") endif() if (LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY) set(module_flags "${module_flags} -Xclang -fmodules-local-submodule-visibility") endif() if (LLVM_ENABLE_MODULE_DEBUGGING AND ((uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") OR (uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO"))) set(module_flags "${module_flags} -gmodules") endif() set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${module_flags}") # Check that we can build code with modules enabled, and that repeatedly # including still manages to respect NDEBUG properly. CHECK_CXX_SOURCE_COMPILES("#undef NDEBUG #include #define NDEBUG #include int main() { assert(this code is not compiled); }" CXX_SUPPORTS_MODULES) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) if (CXX_SUPPORTS_MODULES) append("${module_flags}" CMAKE_CXX_FLAGS) else() message(FATAL_ERROR "LLVM_ENABLE_MODULES is not supported by this compiler") endif() endif(LLVM_ENABLE_MODULES) endif( MSVC ) if (MSVC) if (NOT CLANG_CL) set(msvc_warning_flags # Disabled warnings. 
-wd4141 # Suppress ''modifier' : used more than once' (because of __forceinline combined with inline) -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned' -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored' -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data' -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used' -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data' -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception' -wd4345 # Suppress 'behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized' -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized' -wd4355 # Suppress ''this' : used in base member initializer list' -wd4456 # Suppress 'declaration of 'var' hides local variable' -wd4457 # Suppress 'declaration of 'var' hides function parameter' -wd4458 # Suppress 'declaration of 'var' hides class member' -wd4459 # Suppress 'declaration of 'var' hides global declaration' -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated' -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible' -wd4722 # Suppress 'function' : destructor never returns, potential memory leak -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)' -wd4100 # Suppress 'unreferenced formal parameter' -wd4127 # Suppress 'conditional expression is constant' -wd4512 # Suppress 'assignment operator could not be generated' -wd4505 # Suppress 'unreferenced local function has been removed' -wd4610 # Suppress ' can never be instantiated' -wd4510 # Suppress 'default constructor could not be generated' -wd4702 # Suppress 'unreachable code' -wd4245 # Suppress 'signed/unsigned mismatch' -wd4706 # Suppress 'assignment within conditional expression' -wd4310 # Suppress 'cast truncates constant value' -wd4701 # Suppress 'potentially uninitialized local variable' -wd4703 # Suppress 'potentially uninitialized local pointer variable' -wd4389 # Suppress 'signed/unsigned mismatch' -wd4611 # Suppress 'interaction between '_setjmp' and C++ object destruction is non-portable' -wd4805 # Suppress 'unsafe mix of type and type in operation' -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer' -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed' -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared' # C4592 is disabled because of false positives in Visual Studio 2015 # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2. -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation) -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size' # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't # support the 'aligned' attribute in the way that clang sources requires (for # any code that uses the LLVM_ALIGNAS macro), so this is must be disabled to # avoid unwanted alignment warnings. # When we switch to requiring a version of MSVC that supports the 'alignas' # specifier (MSVC 2015?) this warning can be re-enabled. 
-wd4324 # Suppress 'structure was padded due to __declspec(align())' # Promoted warnings. -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning. # Promoted warnings to errors. -we4238 # Promote 'nonstandard extension used : class rvalue used as lvalue' to error. ) endif(NOT CLANG_CL) # Enable warnings if (LLVM_ENABLE_WARNINGS) # Put /W4 in front of all the -we flags. cl.exe doesn't care, but for # clang-cl having /W4 after the -we flags will re-enable the warnings # disabled by -we. set(msvc_warning_flags "/W4 ${msvc_warning_flags}") # CMake appends /W3 by default, and having /W3 followed by /W4 will result in # cl : Command line warning D9025 : overriding '/W3' with '/W4'. Since this is # a command line warning and not a compiler warning, it cannot be suppressed except # by fixing the command line. string(REGEX REPLACE " /W[0-4]" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") string(REGEX REPLACE " /W[0-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") if (LLVM_ENABLE_PEDANTIC) # No MSVC equivalent available endif (LLVM_ENABLE_PEDANTIC) endif (LLVM_ENABLE_WARNINGS) foreach(flag ${msvc_warning_flags}) append("${flag}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endforeach(flag) endif (MSVC) if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) # Don't add -Wall for clang-cl, because it maps -Wall to -Weverything for # MSVC compatibility. /W4 is added above instead. if (NOT CLANG_CL) append("-Wall" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() append("-Wextra -Wno-unused-parameter -Wwrite-strings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append("-Wcast-qual" CMAKE_CXX_FLAGS) # Turn off missing field initializer warnings for gcc to avoid noise from # false positives with empty {}. Turn them on otherwise (they're off by # default for clang). check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG) if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG) if (CMAKE_COMPILER_IS_GNUCXX) append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) else() append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() endif() if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE) append("-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG) append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS) append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS) # Disable -Wclass-memaccess, a C++-only warning from GCC 8 that fires on # LLVM's ADT classes. check_cxx_compiler_flag("-Wclass-memaccess" CXX_SUPPORTS_CLASS_MEMACCESS_FLAG) append_if(CXX_SUPPORTS_CLASS_MEMACCESS_FLAG "-Wno-class-memaccess" CMAKE_CXX_FLAGS) # Check if -Wnon-virtual-dtor warns even though the class is marked final. # If it does, don't add it. So it won't be added on clang 3.4 and older. # This also catches cases when -Wnon-virtual-dtor isn't supported by # the compiler at all. This flag is not activated for gcc since it will # incorrectly identify a protected non-virtual base when there is a friend # declaration. Don't activate this in general on Windows as this warning has # too many false positives on COM-style classes, which are destroyed with # Release() (PR32286). 
if (NOT CMAKE_COMPILER_IS_GNUCXX AND NOT WIN32) set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11 -Werror=non-virtual-dtor") CHECK_CXX_SOURCE_COMPILES("class base {public: virtual void anchor();protected: ~base();}; class derived final : public base { public: ~derived();}; int main() { return 0; }" CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS) endif() # Enable -Wdelete-non-virtual-dtor if available. add_flag_if_supported("-Wdelete-non-virtual-dtor" DELETE_NON_VIRTUAL_DTOR_FLAG) # Check if -Wcomment is OK with an // comment ending with '\' if the next # line is also a // comment. set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment") CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}" C_WCOMMENT_ALLOWS_LINE_WRAP) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) if (NOT C_WCOMMENT_ALLOWS_LINE_WRAP) append("-Wno-comment" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() # Enable -Wstring-conversion to catch misuse of string literals. add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG) endif (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) if (LLVM_COMPILER_IS_GCC_COMPATIBLE AND NOT LLVM_ENABLE_WARNINGS) append("-w" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() macro(append_common_sanitizer_flags) if (NOT MSVC) # Append -fno-omit-frame-pointer and turn on debug info to get better # stack traces. add_flag_if_supported("-fno-omit-frame-pointer" FNO_OMIT_FRAME_POINTER) if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO") add_flag_if_supported("-gline-tables-only" GLINE_TABLES_ONLY) endif() # Use -O1 even in debug mode, otherwise sanitizers slowdown is too large. if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND LLVM_OPTIMIZE_SANITIZED_BUILDS) add_flag_if_supported("-O1" O1) endif() elseif (CLANG_CL) # Keep frame pointers around. append("/Oy-" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) if (LINKER_IS_LLD_LINK) # Use DWARF debug info with LLD. append("-gdwarf" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) else() # Enable codeview otherwise. append("/Z7" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() # Always ask the linker to produce symbols with asan. append("-debug" CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() endmacro() # Turn on sanitizers if necessary. 
if(LLVM_USE_SANITIZER) if (LLVM_ON_UNIX) if (LLVM_USE_SANITIZER STREQUAL "Address") append_common_sanitizer_flags() append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) elseif (LLVM_USE_SANITIZER MATCHES "Memory(WithOrigins)?") append_common_sanitizer_flags() append("-fsanitize=memory" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) if(LLVM_USE_SANITIZER STREQUAL "MemoryWithOrigins") append("-fsanitize-memory-track-origins" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() elseif (LLVM_USE_SANITIZER STREQUAL "Undefined") append_common_sanitizer_flags() append("-fsanitize=undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) elseif (LLVM_USE_SANITIZER STREQUAL "Thread") append_common_sanitizer_flags() append("-fsanitize=thread" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) elseif (LLVM_USE_SANITIZER STREQUAL "Address;Undefined" OR LLVM_USE_SANITIZER STREQUAL "Undefined;Address") append_common_sanitizer_flags() append("-fsanitize=address,undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) elseif (LLVM_USE_SANITIZER STREQUAL "Leaks") append_common_sanitizer_flags() append("-fsanitize=leak" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) else() message(FATAL_ERROR "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}") endif() elseif(MSVC) if (LLVM_USE_SANITIZER STREQUAL "Address") append_common_sanitizer_flags() append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) else() message(FATAL_ERROR "This sanitizer not yet supported in the MSVC environment: ${LLVM_USE_SANITIZER}") endif() else() message(FATAL_ERROR "LLVM_USE_SANITIZER is not supported on this platform.") endif() if (LLVM_USE_SANITIZER MATCHES "(Undefined;)?Address(;Undefined)?") add_flag_if_supported("-fsanitize-address-use-after-scope" FSANITIZE_USE_AFTER_SCOPE_FLAG) endif() if (LLVM_USE_SANITIZE_COVERAGE) append("-fsanitize=fuzzer-no-link" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() if (LLVM_USE_SANITIZER MATCHES ".*Undefined.*") set(BLACKLIST_FILE "${CMAKE_SOURCE_DIR}/utils/sanitizers/ubsan_blacklist.txt") if (EXISTS "${BLACKLIST_FILE}") append("-fsanitize-blacklist=${BLACKLIST_FILE}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() endif() endif() # Turn on -gsplit-dwarf if requested if(LLVM_USE_SPLIT_DWARF) add_definitions("-gsplit-dwarf") endif() add_definitions( -D__STDC_CONSTANT_MACROS ) add_definitions( -D__STDC_FORMAT_MACROS ) add_definitions( -D__STDC_LIMIT_MACROS ) # clang and gcc don't default-print colored diagnostics when invoked from Ninja. if (UNIX AND CMAKE_GENERATOR STREQUAL "Ninja" AND (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)))) append("-fdiagnostics-color" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() # lld doesn't print colored diagnostics when invoked from Ninja if (UNIX AND CMAKE_GENERATOR STREQUAL "Ninja") include(CheckLinkerFlag) check_linker_flag("-Wl,--color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS) append_if(LINKER_SUPPORTS_COLOR_DIAGNOSTICS "-Wl,--color-diagnostics" CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() # Add flags for add_dead_strip(). # FIXME: With MSVS, consider compiling with /Gy and linking with /OPT:REF? # But MinSizeRel seems to add that automatically, so maybe disable these # flags instead if LLVM_NO_DEAD_STRIP is set. 
if(NOT CYGWIN AND NOT WIN32) if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin" AND NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") check_c_compiler_flag("-Werror -fno-function-sections" C_SUPPORTS_FNO_FUNCTION_SECTIONS) if (C_SUPPORTS_FNO_FUNCTION_SECTIONS) # Don't add -ffunction-section if it can be disabled with -fno-function-sections. # Doing so will break sanitizers. add_flag_if_supported("-ffunction-sections" FFUNCTION_SECTIONS) endif() add_flag_if_supported("-fdata-sections" FDATA_SECTIONS) endif() endif() if(MSVC) # Remove flags here, for exceptions and RTTI. # Each target property or source property should be responsible to control # them. # CL.EXE complains to override flags like "/GR /GR-". string(REGEX REPLACE "(^| ) */EH[-cs]+ *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "(^| ) */GR-? *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() # Provide public options to globally control RTTI and EH option(LLVM_ENABLE_EH "Enable Exception handling" OFF) option(LLVM_ENABLE_RTTI "Enable run time type information" OFF) if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI) message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON") endif() option(LLVM_ENABLE_IR_PGO "Build LLVM and tools with IR PGO instrumentation (deprecated)" Off) mark_as_advanced(LLVM_ENABLE_IR_PGO) set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR or Frontend") mark_as_advanced(LLVM_BUILD_INSTRUMENTED) string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" uppercase_LLVM_BUILD_INSTRUMENTED) if (LLVM_BUILD_INSTRUMENTED) if (LLVM_ENABLE_IR_PGO OR uppercase_LLVM_BUILD_INSTRUMENTED STREQUAL "IR") append("-fprofile-generate='${LLVM_PROFILE_DATA_DIR}'" CMAKE_CXX_FLAGS CMAKE_C_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) else() append("-fprofile-instr-generate='${LLVM_PROFILE_FILE_PATTERN}'" CMAKE_CXX_FLAGS CMAKE_C_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() endif() option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation" Off) mark_as_advanced(LLVM_BUILD_INSTRUMENTED_COVERAGE) append_if(LLVM_BUILD_INSTRUMENTED_COVERAGE "-fprofile-instr-generate='${LLVM_PROFILE_FILE_PATTERN}' -fcoverage-mapping" CMAKE_CXX_FLAGS CMAKE_C_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) if (LLVM_BUILD_INSTRUMENTED AND LLVM_BUILD_INSTRUMENTED_COVERAGE) message(FATAL_ERROR "LLVM_BUILD_INSTRUMENTED and LLVM_BUILD_INSTRUMENTED_COVERAGE cannot both be specified") endif() if(LLVM_ENABLE_LTO AND LLVM_ON_WIN32 AND NOT LINKER_IS_LLD_LINK) message(FATAL_ERROR "When compiling for Windows, LLVM_ENABLE_LTO requires using lld as the linker (point CMAKE_LINKER at lld-link.exe)") endif() if(uppercase_LLVM_ENABLE_LTO STREQUAL "THIN") append("-flto=thin" CMAKE_CXX_FLAGS CMAKE_C_FLAGS) if(NOT LINKER_IS_LLD_LINK) append("-flto=thin" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() # If the linker supports it, enable the lto cache. This improves initial build # time a little since we re-link a lot of the same objects, and significantly # improves incremental build time. # FIXME: We should move all this logic into the clang driver. 
if(APPLE) append("-Wl,-cache_path_lto,${PROJECT_BINARY_DIR}/lto.cache" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) elseif(UNIX AND LLVM_USE_LINKER STREQUAL "lld") append("-Wl,--thinlto-cache-dir=${PROJECT_BINARY_DIR}/lto.cache" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) elseif(LLVM_USE_LINKER STREQUAL "gold") append("-Wl,--plugin-opt,cache-dir=${PROJECT_BINARY_DIR}/lto.cache" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() elseif(uppercase_LLVM_ENABLE_LTO STREQUAL "FULL") append("-flto=full" CMAKE_CXX_FLAGS CMAKE_C_FLAGS) if(NOT LINKER_IS_LLD_LINK) append("-flto=full" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() elseif(LLVM_ENABLE_LTO) append("-flto" CMAKE_CXX_FLAGS CMAKE_C_FLAGS) if(NOT LINKER_IS_LLD_LINK) append("-flto" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() endif() # This option makes utils/extract_symbols.py be used to determine the list of # symbols to export from LLVM tools. This is necessary when using MSVC if you # want to allow plugins, though note that the plugin has to explicitly link # against (exactly one) tool so we can't unilaterally turn on # LLVM_ENABLE_PLUGINS when it's enabled. option(LLVM_EXPORT_SYMBOLS_FOR_PLUGINS "Export symbols from LLVM tools so that plugins can import them" OFF) if(BUILD_SHARED_LIBS AND LLVM_EXPORT_SYMBOLS_FOR_PLUGINS) message(FATAL_ERROR "BUILD_SHARED_LIBS not compatible with LLVM_EXPORT_SYMBOLS_FOR_PLUGINS") endif() if(LLVM_LINK_LLVM_DYLIB AND LLVM_EXPORT_SYMBOLS_FOR_PLUGINS) message(FATAL_ERROR "LLVM_LINK_LLVM_DYLIB not compatible with LLVM_EXPORT_SYMBOLS_FOR_PLUGINS") endif() # Plugin support # FIXME: Make this configurable. if(WIN32 OR CYGWIN) if(BUILD_SHARED_LIBS OR LLVM_BUILD_LLVM_DYLIB) set(LLVM_ENABLE_PLUGINS ON) else() set(LLVM_ENABLE_PLUGINS OFF) endif() else() set(LLVM_ENABLE_PLUGINS ON) endif() set(LLVM_ENABLE_IDE_default OFF) if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR) set(LLVM_ENABLE_IDE_default ON) endif() option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE" ${LLVM_ENABLE_IDE_default}) function(get_compile_definitions) get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) foreach(definition ${top_dir_definitions}) if(DEFINED result) string(APPEND result " -D${definition}") else() set(result "-D${definition}") endif() endforeach() set(LLVM_DEFINITIONS "${result}" PARENT_SCOPE) endfunction() get_compile_definitions() option(LLVM_FORCE_ENABLE_STATS "Enable statistics collection for builds that wouldn't normally enable it" OFF) Index: vendor/llvm/dist-release_70/docs/ReleaseNotes.rst =================================================================== --- vendor/llvm/dist-release_70/docs/ReleaseNotes.rst (revision 337999) +++ vendor/llvm/dist-release_70/docs/ReleaseNotes.rst (revision 338000) @@ -1,242 +1,282 @@ ======================== LLVM 7.0.0 Release Notes ======================== .. contents:: :local: .. warning:: These are in-progress notes for the upcoming LLVM 7 release. Release notes for previous releases can be found on `the Download Page `_. Introduction ============ This document contains the release notes for the LLVM Compiler Infrastructure, release 7.0.0. Here we describe the status of LLVM, including major improvements from the previous release, improvements in various subprojects of LLVM, and some of the current users of the code. All LLVM releases may be downloaded from the `LLVM releases web site `_. 
For more information about LLVM, including information about the latest release, please check out the `main LLVM web site `_. If you have questions or comments, the `LLVM Developer's Mailing List `_ is a good place to send them. Note that if you are reading this file from a Subversion checkout or the main LLVM web page, this document applies to the *next* release, not the current one. To see the release notes for a specific release, please see the `releases page `_. Non-comprehensive list of changes in this release ================================================= .. NOTE For small 1-3 sentence descriptions, just add an entry at the end of this list. If your description won't fit comfortably in one bullet point (e.g. maybe you would like to give an example of the functionality, or simply have a lot to talk about), see the `NOTE` below for adding a new subsection. * The Windows installer no longer includes a Visual Studio integration. Instead, a new `LLVM Compiler Toolchain Visual Studio extension ` is available on the Visual Studio Marketplace. The new integration includes support for Visual Studio 2017. * Libraries have been renamed from 7.0 to 7. This change also impacts downstream libraries like lldb. * The LoopInstSimplify pass (-loop-instsimplify) has been removed. * Symbols starting with ``?`` are no longer mangled by LLVM when using the Windows ``x`` or ``w`` IR mangling schemes. * A new tool named :doc:`llvm-exegesis ` has been added. :program:`llvm-exegesis` automatically measures instruction scheduling properties (latency/uops) and provides a principled way to edit scheduling models. * A new tool named :doc:`llvm-mca ` has been added. :program:`llvm-mca` is a static performance analysis tool that uses information available in LLVM to statically predict the performance of machine code for a specific CPU. * The optimization flag to merge constants (-fmerge-all-constants) is no longer applied by default. * Optimization of floating-point casts is improved. This may cause surprising results for code that is relying on the undefined behavior of overflowing casts. The optimization can be disabled by specifying a function attribute: "strict-float-cast-overflow"="false". This attribute may be created by the clang option ``-fno-strict-float-cast-overflow``. Code sanitizers can be used to detect affected patterns. The option for detecting this problem alone is "-fsanitize=float-cast-overflow": .. code-block:: c int main() { float x = 4294967296.0f; x = (float)((int)x); printf("junk in the ftrunc: %f\n", x); return 0; } .. code-block:: bash clang -O1 ftrunc.c -fsanitize=float-cast-overflow ; ./a.out ftrunc.c:5:15: runtime error: 4.29497e+09 is outside the range of representable values of type 'int' junk in the ftrunc: 0.000000 * ``LLVM_ON_WIN32`` is no longer set by ``llvm/Config/config.h`` and ``llvm/Config/llvm-config.h``. If you used this macro, use the compiler-set ``_WIN32`` instead which is set exactly when ``LLVM_ON_WIN32`` used to be set. * The ``DEBUG`` macro has been renamed to ``LLVM_DEBUG``, the interface remains the same. If you used this macro you need to migrate to the new one. You should also clang-format your code to make it easier to integrate future changes locally. This can be done with the following bash commands: .. 
code-block:: bash

  git grep -l 'DEBUG' | xargs perl -pi -e 's/\bDEBUG\s?\(/LLVM_DEBUG(/g'
  git diff -U0 master | ../clang/tools/clang-format/clang-format-diff.py -i -p1 -style LLVM

* Early support for UBsan, X-Ray instrumentation and libFuzzer (x86 and
  x86_64) for OpenBSD. Support for MSan (x86_64), X-Ray instrumentation and
  libFuzzer (x86 and x86_64) for FreeBSD.

* ``SmallVector`` shrank from ``sizeof(void*) * 4 + sizeof(T)`` to
  ``sizeof(void*) + sizeof(unsigned) * 2``, smaller than ``std::vector`` on
  64-bit platforms. The maximum capacity is now restricted to ``UINT32_MAX``.
  Since SmallVector doesn't have the exception-safety pessimizations some
  implementations saddle std::vector with and is better at using ``realloc``,
  it's now a better choice even on the heap (although when TinyPtrVector works,
  it's even smaller).

* Preliminary/experimental support for DWARF v5 debugging information,
  including the new .debug_names accelerator table. DWARF emitted at ``-O0``
  should be fully DWARF v5 compliant. Type units and split DWARF are known
  not to be compliant, and higher optimization levels will still emit some
  information in v4 format.

+* Added support for the ``.rva`` assembler directive for COFF targets.
+
+* The :program:`llvm-rc` tool (Windows Resource Compiler) has been improved
+  a bit. There are still known missing features, but it is generally usable
+  in many cases. (The tool still doesn't preprocess input files automatically,
+  but it can now handle leftover C declarations in preprocessor output, if
+  given output from a preprocessor run externally.)
+
+* CodeView debug info can now be emitted for MinGW configurations, if requested.
+
* Note..

.. NOTE
   If you would like to document a larger change, then you can add a
   subsection about it right here. You can copy the following boilerplate
   and un-indent it (the indentation causes it to be inside this comment).

   Special New Feature
   -------------------

   Makes programs 10x faster by doing Special New Thing.

Changes to the LLVM IR
----------------------

* The signatures for the builtins @llvm.memcpy, @llvm.memmove, and @llvm.memset
  have changed. Alignment is no longer an argument, and is instead conveyed as
  parameter attributes.

* invariant.group.barrier has been renamed to launder.invariant.group.

* invariant.group metadata can now refer only to empty metadata nodes.

-Changes to the ARM Backend
---------------------------
+Changes to the AArch64 Target
+-----------------------------

- During this release ...
+* The ``.inst`` assembler directive is now usable on both COFF and Mach-O
+  targets, in addition to ELF.
+* Support for most remaining COFF relocations has been added.
+* Support for TLS on Windows has been added.
+
+Changes to the ARM Target
+-------------------------
+
+* The ``.inst`` assembler directive is now usable on both COFF and Mach-O
+  targets, in addition to ELF. For Thumb, it can now also automatically
+  deduce the instruction size, without having to specify it with
+  e.g. ``.inst.w`` as before.
+
+Changes to the Hexagon Target
+-----------------------------
+
+* Hexagon now supports auto-vectorization for HVX. It is disabled by default
+  and can be turned on with ``-fvectorize``. For auto-vectorization to take
+  effect, code generation for HVX needs to be enabled with ``-mhvx``.
+  The complete set of options should include ``-fvectorize``, ``-mhvx``,
+  and ``-mhvx-length={64b|128b}`` (an illustrative invocation is sketched
+  below).
+
+* The support for Hexagon ISA V4 is deprecated and will be removed in the
+  next release.
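
The Hexagon note above names the driver flags but does not show a full command
line. The following is a minimal sketch of how they might be combined,
assuming a Hexagon cross-compilation setup; the target triple, optimization
level, and file names are illustrative placeholders and are not taken from the
release notes.

.. code-block:: bash

   # Illustrative only: enable HVX code generation (-mhvx, -mhvx-length)
   # and the auto-vectorizer (-fvectorize) at an optimization level where
   # vectorization actually runs. Adjust the triple and files for your setup.
   clang --target=hexagon-unknown-elf -O2 -fvectorize -mhvx -mhvx-length=128b \
     -c saxpy.c -o saxpy.o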
+
Changes to the MIPS Target
--------------------------

During this release ...

Changes to the PowerPC Target
-----------------------------

During this release ...

Changes to the SystemZ Target
-----------------------------

During this release the SystemZ target has:

* Added support for vector registers in inline asm statements.

* Added support for stackmaps, patchpoints, and the anyregcc
  calling convention.

* Changed the default function alignment to 16 bytes.

* Improved codegen for condition code handling.

* Improved instruction scheduling and microarchitecture tuning for z13/z14.

* Fixed support for generating GCOV coverage data.

* Fixed some codegen bugs.

Changes to the X86 Target
-------------------------

- During this release ...
+* The calling convention for the ``f80`` data type on MinGW targets has been
+  fixed. Normally, the calling convention for this type is handled within clang,
+  but if an intrinsic is used that LLVM expands into a libcall, the
+  proper calling convention needs to be supported in LLVM as well. (Note,
+  on Windows, this data type is only used for long doubles in MinGW
+  environments - in MSVC environments, long doubles are the same size as
+  normal doubles.)

Changes to the AMDGPU Target
----------------------------

During this release ...

Changes to the AVR Target
-------------------------

During this release ...

Changes to the OCaml bindings
-----------------------------

* Remove ``add_bb_vectorize``.

Changes to the C API
--------------------

* Remove ``LLVMAddBBVectorizePass``. The implementation was removed and the C
  interface was made a deprecated no-op in LLVM 5. Use
  ``LLVMAddSLPVectorizePass`` instead to get the supported SLP vectorizer.

Changes to the DAG infrastructure
---------------------------------
* ADDC/ADDE/SUBC/SUBE are now deprecated and will default to expand. Backends
  that wish to continue to use these opcodes should explicitly request this
  using ``setOperationAction`` in their ``TargetLowering``. New backends
  should use UADDO/ADDCARRY/USUBO/SUBCARRY instead of the deprecated opcodes.

* The SETCCE opcode has now been removed in favor of SETCCCARRY.

* TableGen now supports multi-alternative pattern fragments via the PatFrags
  class. PatFrag is now derived from PatFrags, which may require minor
  changes to backends that directly access PatFrag members.

External Open Source Projects Using LLVM 7
==========================================

* A project...

Additional Information
======================

A wide variety of additional information is available on the `LLVM web page
`_, in particular in the `documentation `_ section. The web page also
contains versions of the API documentation which is up-to-date with the
Subversion version of the source code. You can access versions of these
documents specific to this release by going into the ``llvm/docs/`` directory
in the LLVM tree.

If you have any questions or comments about LLVM, please feel free to contact
us via the `mailing lists `_.

Index: vendor/llvm/dist-release_70/lib/Analysis/BasicAliasAnalysis.cpp
===================================================================
--- vendor/llvm/dist-release_70/lib/Analysis/BasicAliasAnalysis.cpp (revision 337999)
+++ vendor/llvm/dist-release_70/lib/Analysis/BasicAliasAnalysis.cpp (revision 338000)
@@ -1,1973 +1,1974 @@
//===- BasicAliasAnalysis.cpp - Stateless Alias Analysis Impl -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// //===----------------------------------------------------------------------===// // // This file defines the primary stateless implementation of the // Alias Analysis interface that implements identities (two different // globals cannot alias, etc), but does no stateful analysis. // //===----------------------------------------------------------------------===// #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/PhiValues.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/KnownBits.h" #include #include #include #include #define DEBUG_TYPE "basicaa" using namespace llvm; /// Enable analysis of recursive PHI nodes. static cl::opt EnableRecPhiAnalysis("basicaa-recphi", cl::Hidden, cl::init(false)); /// SearchLimitReached / SearchTimes shows how often the limit of /// to decompose GEPs is reached. It will affect the precision /// of basic alias analysis. STATISTIC(SearchLimitReached, "Number of times the limit to " "decompose GEPs is reached"); STATISTIC(SearchTimes, "Number of times a GEP is decomposed"); /// Cutoff after which to stop analysing a set of phi nodes potentially involved /// in a cycle. Because we are analysing 'through' phi nodes, we need to be /// careful with value equivalence. We use reachability to make sure a value /// cannot be involved in a cycle. const unsigned MaxNumPhiBBsValueReachabilityCheck = 20; // The max limit of the search depth in DecomposeGEPExpression() and // GetUnderlyingObject(), both functions need to use the same search // depth otherwise the algorithm in aliasGEP will assert. static const unsigned MaxLookupSearchDepth = 6; bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { // We don't care if this analysis itself is preserved, it has no state. But // we need to check that the analyses it depends on have been. Note that we // may be created without handles to some analyses and in that case don't // depend on them. if (Inv.invalidate(Fn, PA) || (DT && Inv.invalidate(Fn, PA)) || (LI && Inv.invalidate(Fn, PA)) || (PV && Inv.invalidate(Fn, PA))) return true; // Otherwise this analysis result remains valid. 
return false; } //===----------------------------------------------------------------------===// // Useful predicates //===----------------------------------------------------------------------===// /// Returns true if the pointer is to a function-local object that never /// escapes from the function. static bool isNonEscapingLocalObject(const Value *V) { // If this is a local allocation, check to see if it escapes. if (isa(V) || isNoAliasCall(V)) // Set StoreCaptures to True so that we can assume in our callers that the // pointer is not the result of a load instruction. Currently // PointerMayBeCaptured doesn't have any special analysis for the // StoreCaptures=false case; if it did, our callers could be refined to be // more precise. return !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true); // If this is an argument that corresponds to a byval or noalias argument, // then it has not escaped before entering the function. Check if it escapes // inside the function. if (const Argument *A = dyn_cast(V)) if (A->hasByValAttr() || A->hasNoAliasAttr()) // Note even if the argument is marked nocapture, we still need to check // for copies made inside the function. The nocapture attribute only // specifies that there are no copies made that outlive the function. return !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true); return false; } /// Returns true if the pointer is one which would have been considered an /// escape by isNonEscapingLocalObject. static bool isEscapeSource(const Value *V) { if (ImmutableCallSite(V)) return true; if (isa(V)) return true; // The load case works because isNonEscapingLocalObject considers all // stores to be escapes (it passes true for the StoreCaptures argument // to PointerMayBeCaptured). if (isa(V)) return true; return false; } /// Returns the size of the object specified by V or UnknownSize if unknown. static uint64_t getObjectSize(const Value *V, const DataLayout &DL, const TargetLibraryInfo &TLI, bool NullIsValidLoc, bool RoundToAlign = false) { uint64_t Size; ObjectSizeOpts Opts; Opts.RoundToAlign = RoundToAlign; Opts.NullIsUnknownSize = NullIsValidLoc; if (getObjectSize(V, Size, DL, &TLI, Opts)) return Size; return MemoryLocation::UnknownSize; } /// Returns true if we can prove that the object specified by V is smaller than /// Size. static bool isObjectSmallerThan(const Value *V, uint64_t Size, const DataLayout &DL, const TargetLibraryInfo &TLI, bool NullIsValidLoc) { // Note that the meanings of the "object" are slightly different in the // following contexts: // c1: llvm::getObjectSize() // c2: llvm.objectsize() intrinsic // c3: isObjectSmallerThan() // c1 and c2 share the same meaning; however, the meaning of "object" in c3 // refers to the "entire object". // // Consider this example: // char *p = (char*)malloc(100) // char *q = p+80; // // In the context of c1 and c2, the "object" pointed by q refers to the // stretch of memory of q[0:19]. So, getObjectSize(q) should return 20. // // However, in the context of c3, the "object" refers to the chunk of memory // being allocated. So, the "object" has 100 bytes, and q points to the middle // the "object". In case q is passed to isObjectSmallerThan() as the 1st // parameter, before the llvm::getObjectSize() is called to get the size of // entire object, we should: // - either rewind the pointer q to the base-address of the object in // question (in this case rewind to p), or // - just give up. It is up to caller to make sure the pointer is pointing // to the base address the object. 
// // We go for 2nd option for simplicity. if (!isIdentifiedObject(V)) return false; // This function needs to use the aligned object size because we allow // reads a bit past the end given sufficient alignment. uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc, /*RoundToAlign*/ true); return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size; } /// Returns true if we can prove that the object specified by V has size Size. static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL, const TargetLibraryInfo &TLI, bool NullIsValidLoc) { uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc); return ObjectSize != MemoryLocation::UnknownSize && ObjectSize == Size; } //===----------------------------------------------------------------------===// // GetElementPtr Instruction Decomposition and Analysis //===----------------------------------------------------------------------===// /// Analyzes the specified value as a linear expression: "A*V + B", where A and /// B are constant integers. /// /// Returns the scale and offset values as APInts and return V as a Value*, and /// return whether we looked through any sign or zero extends. The incoming /// Value is known to have IntegerType, and it may already be sign or zero /// extended. /// /// Note that this looks through extends, so the high bits may not be /// represented in the result. /*static*/ const Value *BasicAAResult::GetLinearExpression( const Value *V, APInt &Scale, APInt &Offset, unsigned &ZExtBits, unsigned &SExtBits, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, DominatorTree *DT, bool &NSW, bool &NUW) { assert(V->getType()->isIntegerTy() && "Not an integer value"); // Limit our recursion depth. if (Depth == 6) { Scale = 1; Offset = 0; return V; } if (const ConstantInt *Const = dyn_cast(V)) { // If it's a constant, just convert it to an offset and remove the variable. // If we've been called recursively, the Offset bit width will be greater // than the constant's (the Offset's always as wide as the outermost call), // so we'll zext here and process any extension in the isa & // isa cases below. Offset += Const->getValue().zextOrSelf(Offset.getBitWidth()); assert(Scale == 0 && "Constant values don't have a scale"); return V; } if (const BinaryOperator *BOp = dyn_cast(V)) { if (ConstantInt *RHSC = dyn_cast(BOp->getOperand(1))) { // If we've been called recursively, then Offset and Scale will be wider // than the BOp operands. We'll always zext it here as we'll process sign // extensions below (see the isa / isa cases). APInt RHS = RHSC->getValue().zextOrSelf(Offset.getBitWidth()); switch (BOp->getOpcode()) { default: // We don't understand this instruction, so we can't decompose it any // further. Scale = 1; Offset = 0; return V; case Instruction::Or: // X|C == X+C if all the bits in C are unset in X. Otherwise we can't // analyze it. 
if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), DL, 0, AC, BOp, DT)) { Scale = 1; Offset = 0; return V; } LLVM_FALLTHROUGH; case Instruction::Add: V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); Offset += RHS; break; case Instruction::Sub: V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); Offset -= RHS; break; case Instruction::Mul: V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); Offset *= RHS; Scale *= RHS; break; case Instruction::Shl: V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); // We're trying to linearize an expression of the kind: // shl i8 -128, 36 // where the shift count exceeds the bitwidth of the type. // We can't decompose this further (the expression would return // a poison value). if (Offset.getBitWidth() < RHS.getLimitedValue() || Scale.getBitWidth() < RHS.getLimitedValue()) { Scale = 1; Offset = 0; return V; } Offset <<= RHS.getLimitedValue(); Scale <<= RHS.getLimitedValue(); // the semantics of nsw and nuw for left shifts don't match those of // multiplications, so we won't propagate them. NSW = NUW = false; return V; } if (isa(BOp)) { NUW &= BOp->hasNoUnsignedWrap(); NSW &= BOp->hasNoSignedWrap(); } return V; } } // Since GEP indices are sign extended anyway, we don't care about the high // bits of a sign or zero extended value - just scales and offsets. The // extensions have to be consistent though. if (isa(V) || isa(V)) { Value *CastOp = cast(V)->getOperand(0); unsigned NewWidth = V->getType()->getPrimitiveSizeInBits(); unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits(); unsigned OldZExtBits = ZExtBits, OldSExtBits = SExtBits; const Value *Result = GetLinearExpression(CastOp, Scale, Offset, ZExtBits, SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); // zext(zext(%x)) == zext(%x), and similarly for sext; we'll handle this // by just incrementing the number of bits we've extended by. unsigned ExtendedBy = NewWidth - SmallWidth; if (isa(V) && ZExtBits == 0) { // sext(sext(%x, a), b) == sext(%x, a + b) if (NSW) { // We haven't sign-wrapped, so it's valid to decompose sext(%x + c) // into sext(%x) + sext(c). We'll sext the Offset ourselves: unsigned OldWidth = Offset.getBitWidth(); Offset = Offset.trunc(SmallWidth).sext(NewWidth).zextOrSelf(OldWidth); } else { // We may have signed-wrapped, so don't decompose sext(%x + c) into // sext(%x) + sext(c) Scale = 1; Offset = 0; Result = CastOp; ZExtBits = OldZExtBits; SExtBits = OldSExtBits; } SExtBits += ExtendedBy; } else { // sext(zext(%x, a), b) = zext(zext(%x, a), b) = zext(%x, a + b) if (!NUW) { // We may have unsigned-wrapped, so don't decompose zext(%x + c) into // zext(%x) + zext(c) Scale = 1; Offset = 0; Result = CastOp; ZExtBits = OldZExtBits; SExtBits = OldSExtBits; } ZExtBits += ExtendedBy; } return Result; } Scale = 1; Offset = 0; return V; } /// To ensure a pointer offset fits in an integer of size PointerSize /// (in bits) when that size is smaller than 64. This is an issue in /// particular for 32b programs with negative indices that rely on two's /// complement wrap-arounds for precise alias information. 
static int64_t adjustToPointerSize(int64_t Offset, unsigned PointerSize) { assert(PointerSize <= 64 && "Invalid PointerSize!"); unsigned ShiftBits = 64 - PointerSize; return (int64_t)((uint64_t)Offset << ShiftBits) >> ShiftBits; } /// If V is a symbolic pointer expression, decompose it into a base pointer /// with a constant offset and a number of scaled symbolic offsets. /// /// The scaled symbolic offsets (represented by pairs of a Value* and a scale /// in the VarIndices vector) are Value*'s that are known to be scaled by the /// specified amount, but which may have other unrepresented high bits. As /// such, the gep cannot necessarily be reconstructed from its decomposed form. /// /// When DataLayout is around, this function is capable of analyzing everything /// that GetUnderlyingObject can look through. To be able to do that /// GetUnderlyingObject and DecomposeGEPExpression must use the same search /// depth (MaxLookupSearchDepth). When DataLayout not is around, it just looks /// through pointer casts. bool BasicAAResult::DecomposeGEPExpression(const Value *V, DecomposedGEP &Decomposed, const DataLayout &DL, AssumptionCache *AC, DominatorTree *DT) { // Limit recursion depth to limit compile time in crazy cases. unsigned MaxLookup = MaxLookupSearchDepth; SearchTimes++; Decomposed.StructOffset = 0; Decomposed.OtherOffset = 0; Decomposed.VarIndices.clear(); do { // See if this is a bitcast or GEP. const Operator *Op = dyn_cast(V); if (!Op) { // The only non-operator case we can handle are GlobalAliases. if (const GlobalAlias *GA = dyn_cast(V)) { if (!GA->isInterposable()) { V = GA->getAliasee(); continue; } } Decomposed.Base = V; return false; } if (Op->getOpcode() == Instruction::BitCast || Op->getOpcode() == Instruction::AddrSpaceCast) { V = Op->getOperand(0); continue; } const GEPOperator *GEPOp = dyn_cast(Op); if (!GEPOp) { if (auto CS = ImmutableCallSite(V)) { // CaptureTracking can know about special capturing properties of some // intrinsics like launder.invariant.group, that can't be expressed with // the attributes, but have properties like returning aliasing pointer. // Because some analysis may assume that nocaptured pointer is not // returned from some special intrinsic (because function would have to // be marked with returns attribute), it is crucial to use this function // because it should be in sync with CaptureTracking. Not using it may // cause weird miscompilations where 2 aliasing pointers are assumed to // noalias. if (auto *RP = getArgumentAliasingToReturnedPointer(CS)) { V = RP; continue; } } // If it's not a GEP, hand it off to SimplifyInstruction to see if it // can come up with something. This matches what GetUnderlyingObject does. if (const Instruction *I = dyn_cast(V)) // TODO: Get a DominatorTree and AssumptionCache and use them here // (these are both now available in this function, but this should be // updated when GetUnderlyingObject is updated). TLI should be // provided also. if (const Value *Simplified = SimplifyInstruction(const_cast(I), DL)) { V = Simplified; continue; } Decomposed.Base = V; return false; } // Don't attempt to analyze GEPs over unsized objects. if (!GEPOp->getSourceElementType()->isSized()) { Decomposed.Base = V; return false; } unsigned AS = GEPOp->getPointerAddressSpace(); // Walk the indices of the GEP, accumulating them into BaseOff/VarIndices. gep_type_iterator GTI = gep_type_begin(GEPOp); unsigned PointerSize = DL.getPointerSizeInBits(AS); // Assume all GEP operands are constants until proven otherwise. 
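// Editor's note: a minimal standalone sketch of the shift trick used by
// adjustToPointerSize above. Shifting the offset up so the pointer's sign bit
// lands in bit 63 and then arithmetic-shifting back reduces a 64-bit offset
// modulo 2^PointerSize while keeping two's complement semantics. It assumes
// (as LLVM does in practice) that >> on a negative int64_t is an arithmetic
// shift; everything else here is plain C++, no LLVM types.
#include <cassert>
#include <cstdint>

static int64_t adjustToPointerSizeSketch(int64_t Offset, unsigned PointerSize) {
  assert(PointerSize <= 64 && "Invalid PointerSize!");
  unsigned ShiftBits = 64 - PointerSize;
  return (int64_t)((uint64_t)Offset << ShiftBits) >> ShiftBits;
}

int main() {
  // On a 32-bit target, the 64-bit value 0xFFFFFFFF is really -1 after
  // two's complement wraparound.
  assert(adjustToPointerSizeSketch(0xFFFFFFFFLL, 32) == -1);
  // Offsets that already fit in the pointer width are unchanged.
  assert(adjustToPointerSizeSketch(-16, 32) == -16);
  assert(adjustToPointerSizeSketch(1024, 32) == 1024);
  return 0;
}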
bool GepHasConstantOffset = true; for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end(); I != E; ++I, ++GTI) { const Value *Index = *I; // Compute the (potentially symbolic) offset in bytes for this index. if (StructType *STy = GTI.getStructTypeOrNull()) { // For a struct, add the member offset. unsigned FieldNo = cast(Index)->getZExtValue(); if (FieldNo == 0) continue; Decomposed.StructOffset += DL.getStructLayout(STy)->getElementOffset(FieldNo); continue; } // For an array/pointer, add the element offset, explicitly scaled. if (const ConstantInt *CIdx = dyn_cast(Index)) { if (CIdx->isZero()) continue; Decomposed.OtherOffset += DL.getTypeAllocSize(GTI.getIndexedType()) * CIdx->getSExtValue(); continue; } GepHasConstantOffset = false; uint64_t Scale = DL.getTypeAllocSize(GTI.getIndexedType()); unsigned ZExtBits = 0, SExtBits = 0; // If the integer type is smaller than the pointer size, it is implicitly // sign extended to pointer size. unsigned Width = Index->getType()->getIntegerBitWidth(); if (PointerSize > Width) SExtBits += PointerSize - Width; // Use GetLinearExpression to decompose the index into a C1*V+C2 form. APInt IndexScale(Width, 0), IndexOffset(Width, 0); bool NSW = true, NUW = true; Index = GetLinearExpression(Index, IndexScale, IndexOffset, ZExtBits, SExtBits, DL, 0, AC, DT, NSW, NUW); // All GEP math happens in the width of the pointer type, // so we can truncate the value to 64-bits as we don't handle // currently pointers larger than 64 bits and we would crash // later. TODO: Make `Scale` an APInt to avoid this problem. if (IndexScale.getBitWidth() > 64) IndexScale = IndexScale.sextOrTrunc(64); // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale. // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale. Decomposed.OtherOffset += IndexOffset.getSExtValue() * Scale; Scale *= IndexScale.getSExtValue(); // If we already had an occurrence of this index variable, merge this // scale into it. For example, we want to handle: // A[x][x] -> x*16 + x*4 -> x*20 // This also ensures that 'x' only appears in the index list once. for (unsigned i = 0, e = Decomposed.VarIndices.size(); i != e; ++i) { if (Decomposed.VarIndices[i].V == Index && Decomposed.VarIndices[i].ZExtBits == ZExtBits && Decomposed.VarIndices[i].SExtBits == SExtBits) { Scale += Decomposed.VarIndices[i].Scale; Decomposed.VarIndices.erase(Decomposed.VarIndices.begin() + i); break; } } // Make sure that we have a scale that makes sense for this target's // pointer size. Scale = adjustToPointerSize(Scale, PointerSize); if (Scale) { VariableGEPIndex Entry = {Index, ZExtBits, SExtBits, static_cast(Scale)}; Decomposed.VarIndices.push_back(Entry); } } // Take care of wrap-arounds if (GepHasConstantOffset) { Decomposed.StructOffset = adjustToPointerSize(Decomposed.StructOffset, PointerSize); Decomposed.OtherOffset = adjustToPointerSize(Decomposed.OtherOffset, PointerSize); } // Analyze the base pointer next. V = GEPOp->getOperand(0); } while (--MaxLookup); // If the chain of expressions is too deep, just return early. Decomposed.Base = V; SearchLimitReached++; return true; } /// Returns whether the given pointer value points to memory that is local to /// the function, with global constants being considered local to all /// functions. 
bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) { assert(Visited.empty() && "Visited must be cleared after use!"); unsigned MaxLookup = 8; SmallVector Worklist; Worklist.push_back(Loc.Ptr); do { const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL); if (!Visited.insert(V).second) { Visited.clear(); return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } // An alloca instruction defines local memory. if (OrLocal && isa(V)) continue; // A global constant counts as local memory for our purposes. if (const GlobalVariable *GV = dyn_cast(V)) { // Note: this doesn't require GV to be "ODR" because it isn't legal for a // global to be marked constant in some modules and non-constant in // others. GV may even be a declaration, not a definition. if (!GV->isConstant()) { Visited.clear(); return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } continue; } // If both select values point to local memory, then so does the select. if (const SelectInst *SI = dyn_cast(V)) { Worklist.push_back(SI->getTrueValue()); Worklist.push_back(SI->getFalseValue()); continue; } // If all values incoming to a phi node point to local memory, then so does // the phi. if (const PHINode *PN = dyn_cast(V)) { // Don't bother inspecting phi nodes with many operands. if (PN->getNumIncomingValues() > MaxLookup) { Visited.clear(); return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } for (Value *IncValue : PN->incoming_values()) Worklist.push_back(IncValue); continue; } // Otherwise be conservative. Visited.clear(); return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } while (!Worklist.empty() && --MaxLookup); Visited.clear(); return Worklist.empty(); } /// Returns the behavior when calling the given call site. FunctionModRefBehavior BasicAAResult::getModRefBehavior(ImmutableCallSite CS) { if (CS.doesNotAccessMemory()) // Can't do better than this. return FMRB_DoesNotAccessMemory; FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If the callsite knows it only reads memory, don't return worse // than that. if (CS.onlyReadsMemory()) Min = FMRB_OnlyReadsMemory; else if (CS.doesNotReadMemory()) Min = FMRB_DoesNotReadMemory; if (CS.onlyAccessesArgMemory()) Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees); else if (CS.onlyAccessesInaccessibleMemory()) Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleMem); else if (CS.onlyAccessesInaccessibleMemOrArgMem()) Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleOrArgMem); // If CS has operand bundles then aliasing attributes from the function it // calls do not directly apply to the CallSite. This can be made more // precise in the future. if (!CS.hasOperandBundles()) if (const Function *F = CS.getCalledFunction()) Min = FunctionModRefBehavior(Min & getBestAAResults().getModRefBehavior(F)); return Min; } /// Returns the behavior when calling the given function. For use when the call /// site is not known. FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) { // If the function declares it doesn't access memory, we can't do better. if (F->doesNotAccessMemory()) return FMRB_DoesNotAccessMemory; FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If the function declares it only reads memory, go with that. 
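// Editor's note: a standalone sketch of the idea behind the "Min &
// FMRB_..." pattern in getModRefBehavior above. Behaviors act as bitsets of
// "may read", "may write" and "which locations", so combining two sources of
// information is a bitwise AND, i.e. the intersection of what each source
// still allows. The bit values below are invented for the example and are not
// LLVM's FunctionModRefBehavior encoding.
#include <cassert>

enum Behavior : unsigned {
  NoAccess      = 0,
  MayRead       = 1 << 0,
  MayWrite      = 1 << 1,
  ArgMemory     = 1 << 2,
  OtherMemory   = 1 << 3,
  Unknown       = MayRead | MayWrite | ArgMemory | OtherMemory,
  OnlyReads     = Unknown & ~MayWrite,    // "readonly"
  OnlyArgMemory = Unknown & ~OtherMemory, // "argmemonly"
};

int main() {
  unsigned Min = Unknown;
  Min &= OnlyReads;     // the call is known to be read-only
  Min &= OnlyArgMemory; // and to touch only argument-pointed-to memory
  // The intersection: may read, but only through its pointer arguments.
  assert(Min == (MayRead | ArgMemory));
  return 0;
}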
if (F->onlyReadsMemory()) Min = FMRB_OnlyReadsMemory; else if (F->doesNotReadMemory()) Min = FMRB_DoesNotReadMemory; if (F->onlyAccessesArgMemory()) Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees); else if (F->onlyAccessesInaccessibleMemory()) Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleMem); else if (F->onlyAccessesInaccessibleMemOrArgMem()) Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesInaccessibleOrArgMem); return Min; } /// Returns true if this is a writeonly (i.e Mod only) parameter. static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx, const TargetLibraryInfo &TLI) { if (CS.paramHasAttr(ArgIdx, Attribute::WriteOnly)) return true; // We can bound the aliasing properties of memset_pattern16 just as we can // for memcpy/memset. This is particularly important because the // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16 // whenever possible. // FIXME Consider handling this in InferFunctionAttr.cpp together with other // attributes. LibFunc F; if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) && F == LibFunc_memset_pattern16 && TLI.has(F)) if (ArgIdx == 0) return true; // TODO: memset_pattern4, memset_pattern8 // TODO: _chk variants // TODO: strcmp, strcpy return false; } ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { // Checking for known builtin intrinsics and target library functions. if (isWriteOnlyParam(CS, ArgIdx, TLI)) return ModRefInfo::Mod; if (CS.paramHasAttr(ArgIdx, Attribute::ReadOnly)) return ModRefInfo::Ref; if (CS.paramHasAttr(ArgIdx, Attribute::ReadNone)) return ModRefInfo::NoModRef; return AAResultBase::getArgModRefInfo(CS, ArgIdx); } static bool isIntrinsicCall(ImmutableCallSite CS, Intrinsic::ID IID) { const IntrinsicInst *II = dyn_cast(CS.getInstruction()); return II && II->getIntrinsicID() == IID; } #ifndef NDEBUG static const Function *getParent(const Value *V) { if (const Instruction *inst = dyn_cast(V)) { if (!inst->getParent()) return nullptr; return inst->getParent()->getParent(); } if (const Argument *arg = dyn_cast(V)) return arg->getParent(); return nullptr; } static bool notDifferentParent(const Value *O1, const Value *O2) { const Function *F1 = getParent(O1); const Function *F2 = getParent(O2); return !F1 || !F2 || F1 == F2; } #endif AliasResult BasicAAResult::alias(const MemoryLocation &LocA, const MemoryLocation &LocB) { assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && "BasicAliasAnalysis doesn't support interprocedural queries."); // If we have a directly cached entry for these locations, we have recursed // through this once, so just return the cached results. Notably, when this // happens, we don't clear the cache. auto CacheIt = AliasCache.find(LocPair(LocA, LocB)); if (CacheIt != AliasCache.end()) return CacheIt->second; AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, LocB.Ptr, LocB.Size, LocB.AATags); // AliasCache rarely has more than 1 or 2 elements, always use // shrink_and_clear so it quickly returns to the inline capacity of the // SmallDenseMap if it ever grows larger. // FIXME: This should really be shrink_to_inline_capacity_and_clear(). AliasCache.shrink_and_clear(); VisitedPhiBBs.clear(); return Alias; } /// Checks to see if the specified callsite can clobber the specified memory /// object. /// /// Since we only look at local properties of this function, we really can't /// say much about this query. We do, however, use simple "address taken" /// analysis on local objects. 
ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
                                        const MemoryLocation &Loc) {
  assert(notDifferentParent(CS.getInstruction(), Loc.Ptr) &&
         "AliasAnalysis query involving multiple functions!");

  const Value *Object = GetUnderlyingObject(Loc.Ptr, DL);

-  // If this is a tail call and Loc.Ptr points to a stack location, we know that
-  // the tail call cannot access or modify the local stack.
-  // We cannot exclude byval arguments here; these belong to the caller of
-  // the current function not to the current function, and a tail callee
-  // may reference them.
+  // Calls marked 'tail' cannot read or write allocas from the current frame
+  // because the current frame might be destroyed by the time they run. However,
+  // a tail call may use an alloca with byval. Calling with byval copies the
+  // contents of the alloca into argument registers or stack slots, so there is
+  // no lifetime issue.
  if (isa<AllocaInst>(Object))
    if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction()))
-      if (CI->isTailCall())
+      if (CI->isTailCall() &&
+          !CI->getAttributes().hasAttrSomewhere(Attribute::ByVal))
        return ModRefInfo::NoModRef;

  // If the pointer is to a locally allocated object that does not escape,
  // then the call cannot mod/ref the pointer unless the call takes the pointer
  // as an argument, and itself doesn't capture it.
  if (!isa<Constant>(Object) && CS.getInstruction() != Object &&
      isNonEscapingLocalObject(Object)) {

    // Optimistically assume that call doesn't touch Object and check this
    // assumption in the following loop.
    ModRefInfo Result = ModRefInfo::NoModRef;
    bool IsMustAlias = true;

    unsigned OperandNo = 0;
    for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
         CI != CE; ++CI, ++OperandNo) {
      // Only look at the no-capture or byval pointer arguments. If this
      // pointer were passed to arguments that were neither of these, then it
      // couldn't be no-capture.
      if (!(*CI)->getType()->isPointerTy() ||
          (!CS.doesNotCapture(OperandNo) &&
           OperandNo < CS.getNumArgOperands() &&
           !CS.isByValArgument(OperandNo)))
        continue;

      // Call doesn't access memory through this operand, so we don't care
      // if it aliases with Object.
      if (CS.doesNotAccessMemory(OperandNo))
        continue;

      // If this is a no-capture pointer argument, see if we can tell that it
      // is impossible to alias the pointer we're checking.
      AliasResult AR =
          getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object));
      if (AR != MustAlias)
        IsMustAlias = false;
      // Operand doesn't alias 'Object'; continue looking for other aliases.
      if (AR == NoAlias)
        continue;
      // Operand aliases 'Object', but call doesn't modify it. Strengthen
      // initial assumption and keep looking in case there are more aliases.
      if (CS.onlyReadsMemory(OperandNo)) {
        Result = setRef(Result);
        continue;
      }
      // Operand aliases 'Object' but call only writes into it.
      if (CS.doesNotReadMemory(OperandNo)) {
        Result = setMod(Result);
        continue;
      }
      // This operand aliases 'Object' and call reads and writes into it.
      // Setting ModRef will not yield an early return below; MustAlias is not
      // used further.
      Result = ModRefInfo::ModRef;
      break;
    }

    // No operand aliases, reset Must bit. Add below if at least one aliases
    // and all aliases found are MustAlias.
    if (isNoModRef(Result))
      IsMustAlias = false;

    // Early return if we improved mod ref information
    if (!isModAndRefSet(Result)) {
      if (isNoModRef(Result))
        return ModRefInfo::NoModRef;
      return IsMustAlias ? setMust(Result) : clearMust(Result);
    }
  }

  // If the CallSite is to malloc or calloc, we can assume that it doesn't
  // modify any IR visible value.
This is only valid because we assume these // routines do not read values visible in the IR. TODO: Consider special // casing realloc and strdup routines which access only their arguments as // well. Or alternatively, replace all of this with inaccessiblememonly once // that's implemented fully. auto *Inst = CS.getInstruction(); if (isMallocOrCallocLikeFn(Inst, &TLI)) { // Be conservative if the accessed pointer may alias the allocation - // fallback to the generic handling below. if (getBestAAResults().alias(MemoryLocation(Inst), Loc) == NoAlias) return ModRefInfo::NoModRef; } // The semantics of memcpy intrinsics forbid overlap between their respective // operands, i.e., source and destination of any given memcpy must no-alias. // If Loc must-aliases either one of these two locations, then it necessarily // no-aliases the other. if (auto *Inst = dyn_cast(CS.getInstruction())) { AliasResult SrcAA, DestAA; if ((SrcAA = getBestAAResults().alias(MemoryLocation::getForSource(Inst), Loc)) == MustAlias) // Loc is exactly the memcpy source thus disjoint from memcpy dest. return ModRefInfo::Ref; if ((DestAA = getBestAAResults().alias(MemoryLocation::getForDest(Inst), Loc)) == MustAlias) // The converse case. return ModRefInfo::Mod; // It's also possible for Loc to alias both src and dest, or neither. ModRefInfo rv = ModRefInfo::NoModRef; if (SrcAA != NoAlias) rv = setRef(rv); if (DestAA != NoAlias) rv = setMod(rv); return rv; } // While the assume intrinsic is marked as arbitrarily writing so that // proper control dependencies will be maintained, it never aliases any // particular memory location. if (isIntrinsicCall(CS, Intrinsic::assume)) return ModRefInfo::NoModRef; // Like assumes, guard intrinsics are also marked as arbitrarily writing so // that proper control dependencies are maintained but they never mods any // particular memory location. // // *Unlike* assumes, guard intrinsics are modeled as reading memory since the // heap state at the point the guard is issued needs to be consistent in case // the guard invokes the "deopt" continuation. if (isIntrinsicCall(CS, Intrinsic::experimental_guard)) return ModRefInfo::Ref; // Like assumes, invariant.start intrinsics were also marked as arbitrarily // writing so that proper control dependencies are maintained but they never // mod any particular memory location visible to the IR. // *Unlike* assumes (which are now modeled as NoModRef), invariant.start // intrinsic is now modeled as reading memory. This prevents hoisting the // invariant.start intrinsic over stores. Consider: // *ptr = 40; // *ptr = 50; // invariant_start(ptr) // int val = *ptr; // print(val); // // This cannot be transformed to: // // *ptr = 40; // invariant_start(ptr) // *ptr = 50; // int val = *ptr; // print(val); // // The transformation will cause the second store to be ignored (based on // rules of invariant.start) and print 40, while the first program always // prints 50. if (isIntrinsicCall(CS, Intrinsic::invariant_start)) return ModRefInfo::Ref; // The AAResultBase base class has some smarts, lets use them. return AAResultBase::getModRefInfo(CS, Loc); } ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { // While the assume intrinsic is marked as arbitrarily writing so that // proper control dependencies will be maintained, it never aliases any // particular memory location. 
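// Editor's note: a standalone restatement of the memcpy reasoning in
// getModRefInfo(CS, Loc) above. The source and destination of a memcpy are
// guaranteed disjoint, so a location that must-alias one of them cannot also
// be the other; otherwise the result is assembled from whichever operands may
// overlap the location. Enum and parameter names here are simplified
// stand-ins, not LLVM's AliasResult/ModRefInfo.
#include <cassert>

enum Alias { NoAlias, MayAlias, MustAlias };
enum MR { NoModRef = 0, Ref = 1, Mod = 2, ModRef = 3 };

static MR memcpyModRef(Alias LocVsSrc, Alias LocVsDest) {
  if (LocVsSrc == MustAlias)
    return Ref; // exactly the source: only read, disjoint from the dest
  if (LocVsDest == MustAlias)
    return Mod; // exactly the destination: only written
  unsigned R = NoModRef;
  if (LocVsSrc != NoAlias)
    R |= Ref;   // may overlap the bytes that are read
  if (LocVsDest != NoAlias)
    R |= Mod;   // may overlap the bytes that are written
  return static_cast<MR>(R);
}

int main() {
  assert(memcpyModRef(MustAlias, NoAlias) == Ref);
  assert(memcpyModRef(NoAlias, MustAlias) == Mod);
  assert(memcpyModRef(NoAlias, NoAlias) == NoModRef);
  assert(memcpyModRef(MayAlias, MayAlias) == ModRef);
  return 0;
}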
if (isIntrinsicCall(CS1, Intrinsic::assume) || isIntrinsicCall(CS2, Intrinsic::assume)) return ModRefInfo::NoModRef; // Like assumes, guard intrinsics are also marked as arbitrarily writing so // that proper control dependencies are maintained but they never mod any // particular memory location. // // *Unlike* assumes, guard intrinsics are modeled as reading memory since the // heap state at the point the guard is issued needs to be consistent in case // the guard invokes the "deopt" continuation. // NB! This function is *not* commutative, so we specical case two // possibilities for guard intrinsics. if (isIntrinsicCall(CS1, Intrinsic::experimental_guard)) return isModSet(createModRefInfo(getModRefBehavior(CS2))) ? ModRefInfo::Ref : ModRefInfo::NoModRef; if (isIntrinsicCall(CS2, Intrinsic::experimental_guard)) return isModSet(createModRefInfo(getModRefBehavior(CS1))) ? ModRefInfo::Mod : ModRefInfo::NoModRef; // The AAResultBase base class has some smarts, lets use them. return AAResultBase::getModRefInfo(CS1, CS2); } /// Provide ad-hoc rules to disambiguate accesses through two GEP operators, /// both having the exact same pointer operand. static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, LocationSize V1Size, const GEPOperator *GEP2, LocationSize V2Size, const DataLayout &DL) { assert(GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() == GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() && GEP1->getPointerOperandType() == GEP2->getPointerOperandType() && "Expected GEPs with the same pointer operand"); // Try to determine whether GEP1 and GEP2 index through arrays, into structs, // such that the struct field accesses provably cannot alias. // We also need at least two indices (the pointer, and the struct field). if (GEP1->getNumIndices() != GEP2->getNumIndices() || GEP1->getNumIndices() < 2) return MayAlias; // If we don't know the size of the accesses through both GEPs, we can't // determine whether the struct fields accessed can't alias. if (V1Size == MemoryLocation::UnknownSize || V2Size == MemoryLocation::UnknownSize) return MayAlias; ConstantInt *C1 = dyn_cast(GEP1->getOperand(GEP1->getNumOperands() - 1)); ConstantInt *C2 = dyn_cast(GEP2->getOperand(GEP2->getNumOperands() - 1)); // If the last (struct) indices are constants and are equal, the other indices // might be also be dynamically equal, so the GEPs can alias. if (C1 && C2 && C1->getSExtValue() == C2->getSExtValue()) return MayAlias; // Find the last-indexed type of the GEP, i.e., the type you'd get if // you stripped the last index. // On the way, look at each indexed type. If there's something other // than an array, different indices can lead to different final types. SmallVector IntermediateIndices; // Insert the first index; we don't need to check the type indexed // through it as it only drops the pointer indirection. assert(GEP1->getNumIndices() > 1 && "Not enough GEP indices to examine"); IntermediateIndices.push_back(GEP1->getOperand(1)); // Insert all the remaining indices but the last one. // Also, check that they all index through arrays. 
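// Editor's note: a minimal sketch of the asymmetric experimental_guard
// handling in getModRefInfo(CS1, CS2) above, under the assumption that the
// query means "does the first call read or write memory accessed by the
// second". A guard behaves like a read of the whole heap, so the answer
// depends on which side of the query it sits on. Names and enums here are
// invented for the example.
#include <cassert>

enum MR { NoModRef, Ref, Mod, ModRef };

static MR guardModRef(bool FirstIsGuard, bool OtherMayWrite) {
  if (!OtherMayWrite)
    return NoModRef;            // two readers never interfere
  return FirstIsGuard ? Ref     // the guard reads memory the other call writes
                      : Mod;    // the other call writes memory the guard reads
}

int main() {
  assert(guardModRef(/*FirstIsGuard=*/true, /*OtherMayWrite=*/true) == Ref);
  assert(guardModRef(false, true) == Mod);
  assert(guardModRef(true, false) == NoModRef);
  return 0;
}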
for (unsigned i = 1, e = GEP1->getNumIndices() - 1; i != e; ++i) { if (!isa(GetElementPtrInst::getIndexedType( GEP1->getSourceElementType(), IntermediateIndices))) return MayAlias; IntermediateIndices.push_back(GEP1->getOperand(i + 1)); } auto *Ty = GetElementPtrInst::getIndexedType( GEP1->getSourceElementType(), IntermediateIndices); StructType *LastIndexedStruct = dyn_cast(Ty); if (isa(Ty)) { // We know that: // - both GEPs begin indexing from the exact same pointer; // - the last indices in both GEPs are constants, indexing into a sequential // type (array or pointer); // - both GEPs only index through arrays prior to that. // // Because array indices greater than the number of elements are valid in // GEPs, unless we know the intermediate indices are identical between // GEP1 and GEP2 we cannot guarantee that the last indexed arrays don't // partially overlap. We also need to check that the loaded size matches // the element size, otherwise we could still have overlap. const uint64_t ElementSize = DL.getTypeStoreSize(cast(Ty)->getElementType()); if (V1Size != ElementSize || V2Size != ElementSize) return MayAlias; for (unsigned i = 0, e = GEP1->getNumIndices() - 1; i != e; ++i) if (GEP1->getOperand(i + 1) != GEP2->getOperand(i + 1)) return MayAlias; // Now we know that the array/pointer that GEP1 indexes into and that // that GEP2 indexes into must either precisely overlap or be disjoint. // Because they cannot partially overlap and because fields in an array // cannot overlap, if we can prove the final indices are different between // GEP1 and GEP2, we can conclude GEP1 and GEP2 don't alias. // If the last indices are constants, we've already checked they don't // equal each other so we can exit early. if (C1 && C2) return NoAlias; { Value *GEP1LastIdx = GEP1->getOperand(GEP1->getNumOperands() - 1); Value *GEP2LastIdx = GEP2->getOperand(GEP2->getNumOperands() - 1); if (isa(GEP1LastIdx) || isa(GEP2LastIdx)) { // If one of the indices is a PHI node, be safe and only use // computeKnownBits so we don't make any assumptions about the // relationships between the two indices. This is important if we're // asking about values from different loop iterations. See PR32314. // TODO: We may be able to change the check so we only do this when // we definitely looked through a PHINode. if (GEP1LastIdx != GEP2LastIdx && GEP1LastIdx->getType() == GEP2LastIdx->getType()) { KnownBits Known1 = computeKnownBits(GEP1LastIdx, DL); KnownBits Known2 = computeKnownBits(GEP2LastIdx, DL); if (Known1.Zero.intersects(Known2.One) || Known1.One.intersects(Known2.Zero)) return NoAlias; } } else if (isKnownNonEqual(GEP1LastIdx, GEP2LastIdx, DL)) return NoAlias; } return MayAlias; } else if (!LastIndexedStruct || !C1 || !C2) { return MayAlias; } // We know that: // - both GEPs begin indexing from the exact same pointer; // - the last indices in both GEPs are constants, indexing into a struct; // - said indices are different, hence, the pointed-to fields are different; // - both GEPs only index through arrays prior to that. // // This lets us determine that the struct that GEP1 indexes into and the // struct that GEP2 indexes into must either precisely overlap or be // completely disjoint. Because they cannot partially overlap, indexing into // different non-overlapping fields of the struct will never alias. // Therefore, the only remaining thing needed to show that both GEPs can't // alias is that the fields are not overlapping. 
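// Editor's note: a worked, standalone version of the field-overlap test that
// follows. Two accesses at constant offsets into the same struct layout do
// not overlap if one ends before the other starts, allowing for the fact that
// consecutive elements of an array of that struct repeat every StructSize
// bytes. Plain integers here; the real code gets the offsets from
// DataLayout's StructLayout.
#include <cassert>
#include <cstdint>

static bool eltsDontOverlap(uint64_t StructSize, uint64_t V1Off,
                            uint64_t V1Size, uint64_t V2Off, uint64_t V2Size) {
  return V1Off < V2Off && V1Off + V1Size <= V2Off &&
         ((V2Off + V2Size <= StructSize) ||
          (V2Off + V2Size - StructSize <= V1Off));
}

int main() {
  // struct { int32_t a; int32_t b; }: 'a' at offset 0, 'b' at offset 4.
  const uint64_t StructSize = 8;
  // A 4-byte access to 'a' and a 4-byte access to 'b' never overlap.
  assert(eltsDontOverlap(StructSize, 0, 4, 4, 4));
  // An 8-byte access starting at 'a' does overlap 'b' (checked both ways,
  // as the real code does).
  assert(!eltsDontOverlap(StructSize, 0, 8, 4, 4) &&
         !eltsDontOverlap(StructSize, 4, 4, 0, 8));
  return 0;
}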
const StructLayout *SL = DL.getStructLayout(LastIndexedStruct); const uint64_t StructSize = SL->getSizeInBytes(); const uint64_t V1Off = SL->getElementOffset(C1->getZExtValue()); const uint64_t V2Off = SL->getElementOffset(C2->getZExtValue()); auto EltsDontOverlap = [StructSize](uint64_t V1Off, uint64_t V1Size, uint64_t V2Off, uint64_t V2Size) { return V1Off < V2Off && V1Off + V1Size <= V2Off && ((V2Off + V2Size <= StructSize) || (V2Off + V2Size - StructSize <= V1Off)); }; if (EltsDontOverlap(V1Off, V1Size, V2Off, V2Size) || EltsDontOverlap(V2Off, V2Size, V1Off, V1Size)) return NoAlias; return MayAlias; } // If a we have (a) a GEP and (b) a pointer based on an alloca, and the // beginning of the object the GEP points would have a negative offset with // repsect to the alloca, that means the GEP can not alias pointer (b). // Note that the pointer based on the alloca may not be a GEP. For // example, it may be the alloca itself. // The same applies if (b) is based on a GlobalVariable. Note that just being // based on isIdentifiedObject() is not enough - we need an identified object // that does not permit access to negative offsets. For example, a negative // offset from a noalias argument or call can be inbounds w.r.t the actual // underlying object. // // For example, consider: // // struct { int f0, int f1, ...} foo; // foo alloca; // foo* random = bar(alloca); // int *f0 = &alloca.f0 // int *f1 = &random->f1; // // Which is lowered, approximately, to: // // %alloca = alloca %struct.foo // %random = call %struct.foo* @random(%struct.foo* %alloca) // %f0 = getelementptr inbounds %struct, %struct.foo* %alloca, i32 0, i32 0 // %f1 = getelementptr inbounds %struct, %struct.foo* %random, i32 0, i32 1 // // Assume %f1 and %f0 alias. Then %f1 would point into the object allocated // by %alloca. Since the %f1 GEP is inbounds, that means %random must also // point into the same object. But since %f0 points to the beginning of %alloca, // the highest %f1 can be is (%alloca + 3). This means %random can not be higher // than (%alloca - 1), and so is not inbounds, a contradiction. bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp, const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject, LocationSize ObjectAccessSize) { // If the object access size is unknown, or the GEP isn't inbounds, bail. if (ObjectAccessSize == MemoryLocation::UnknownSize || !GEPOp->isInBounds()) return false; // We need the object to be an alloca or a globalvariable, and want to know // the offset of the pointer from the object precisely, so no variable // indices are allowed. if (!(isa(DecompObject.Base) || isa(DecompObject.Base)) || !DecompObject.VarIndices.empty()) return false; int64_t ObjectBaseOffset = DecompObject.StructOffset + DecompObject.OtherOffset; // If the GEP has no variable indices, we know the precise offset // from the base, then use it. If the GEP has variable indices, // we can't get exact GEP offset to identify pointer alias. So return // false in that case. if (!DecompGEP.VarIndices.empty()) return false; int64_t GEPBaseOffset = DecompGEP.StructOffset; GEPBaseOffset += DecompGEP.OtherOffset; return (GEPBaseOffset >= ObjectBaseOffset + (int64_t)ObjectAccessSize); } /// Provides a bunch of ad-hoc rules to disambiguate a GEP instruction against /// another pointer. /// /// We know that V1 is a GEP, but we don't know anything about V2. /// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for /// V2. 
AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, const AAMDNodes &V1AAInfo, const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo, const Value *UnderlyingV1, const Value *UnderlyingV2) { DecomposedGEP DecompGEP1, DecompGEP2; bool GEP1MaxLookupReached = DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT); bool GEP2MaxLookupReached = DecomposeGEPExpression(V2, DecompGEP2, DL, &AC, DT); int64_t GEP1BaseOffset = DecompGEP1.StructOffset + DecompGEP1.OtherOffset; int64_t GEP2BaseOffset = DecompGEP2.StructOffset + DecompGEP2.OtherOffset; assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 && "DecomposeGEPExpression returned a result different from " "GetUnderlyingObject"); // If the GEP's offset relative to its base is such that the base would // fall below the start of the object underlying V2, then the GEP and V2 // cannot alias. if (!GEP1MaxLookupReached && !GEP2MaxLookupReached && isGEPBaseAtNegativeOffset(GEP1, DecompGEP1, DecompGEP2, V2Size)) return NoAlias; // If we have two gep instructions with must-alias or not-alias'ing base // pointers, figure out if the indexes to the GEP tell us anything about the // derived pointer. if (const GEPOperator *GEP2 = dyn_cast(V2)) { // Check for the GEP base being at a negative offset, this time in the other // direction. if (!GEP1MaxLookupReached && !GEP2MaxLookupReached && isGEPBaseAtNegativeOffset(GEP2, DecompGEP2, DecompGEP1, V1Size)) return NoAlias; // Do the base pointers alias? AliasResult BaseAlias = aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, AAMDNodes(), UnderlyingV2, MemoryLocation::UnknownSize, AAMDNodes()); // Check for geps of non-aliasing underlying pointers where the offsets are // identical. if ((BaseAlias == MayAlias) && V1Size == V2Size) { // Do the base pointers alias assuming type and size. AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size, V1AAInfo, UnderlyingV2, V2Size, V2AAInfo); if (PreciseBaseAlias == NoAlias) { // See if the computed offset from the common pointer tells us about the // relation of the resulting pointer. // If the max search depth is reached the result is undefined if (GEP2MaxLookupReached || GEP1MaxLookupReached) return MayAlias; // Same offsets. if (GEP1BaseOffset == GEP2BaseOffset && DecompGEP1.VarIndices == DecompGEP2.VarIndices) return NoAlias; } } // If we get a No or May, then return it immediately, no amount of analysis // will improve this situation. if (BaseAlias != MustAlias) { assert(BaseAlias == NoAlias || BaseAlias == MayAlias); return BaseAlias; } // Otherwise, we have a MustAlias. Since the base pointers alias each other // exactly, see if the computed offset from the common pointer tells us // about the relation of the resulting pointer. // If we know the two GEPs are based off of the exact same pointer (and not // just the same underlying object), see if that tells us anything about // the resulting pointers. if (GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() == GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() && GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) { AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL); // If we couldn't find anything interesting, don't abandon just yet. if (R != MayAlias) return R; } // If the max search depth is reached, the result is undefined if (GEP2MaxLookupReached || GEP1MaxLookupReached) return MayAlias; // Subtract the GEP2 pointer from the GEP1 pointer to find out their // symbolic difference. 
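// Editor's note: a standalone numeric sketch of what the offset subtraction
// below buys us. Once both pointers reduce to a common base plus a constant
// offset (no remaining variable indices), the difference of the offsets
// against the two access sizes decides between MustAlias, PartialAlias and
// NoAlias. Simplified enum and names, not LLVM's; both sizes are assumed
// known.
#include <cassert>
#include <cstdint>

enum Alias { NoAlias, MayAlias, PartialAlias, MustAlias };

// Classify "Base + Off1" accessed with Size1 bytes against "Base + Off2"
// accessed with Size2 bytes.
static Alias classifyConstantOffsets(int64_t Off1, uint64_t Size1,
                                     int64_t Off2, uint64_t Size2) {
  int64_t Diff = Off1 - Off2; // GEP1BaseOffset after subtracting GEP2's
  if (Diff == 0)
    return MustAlias;         // identical addresses
  if (Diff > 0)
    return (uint64_t)Diff < Size2 ? PartialAlias : NoAlias;
  return (uint64_t)-Diff < Size1 ? PartialAlias : NoAlias;
}

int main() {
  // Two 4-byte accesses, 4 bytes apart: they touch disjoint bytes.
  assert(classifyConstantOffsets(4, 4, 0, 4) == NoAlias);
  // A 4-byte access starting 2 bytes into an 8-byte access: partial overlap.
  assert(classifyConstantOffsets(2, 4, 0, 8) == PartialAlias);
  // Same constant offset: same address.
  assert(classifyConstantOffsets(0, 4, 0, 4) == MustAlias);
  return 0;
}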
GEP1BaseOffset -= GEP2BaseOffset; GetIndexDifference(DecompGEP1.VarIndices, DecompGEP2.VarIndices); } else { // Check to see if these two pointers are related by the getelementptr // instruction. If one pointer is a GEP with a non-zero index of the other // pointer, we know they cannot alias. // If both accesses are unknown size, we can't do anything useful here. if (V1Size == MemoryLocation::UnknownSize && V2Size == MemoryLocation::UnknownSize) return MayAlias; AliasResult R = aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, AAMDNodes(), V2, MemoryLocation::UnknownSize, V2AAInfo, nullptr, UnderlyingV2); if (R != MustAlias) { // If V2 may alias GEP base pointer, conservatively returns MayAlias. // If V2 is known not to alias GEP base pointer, then the two values // cannot alias per GEP semantics: "Any memory access must be done through // a pointer value associated with an address range of the memory access, // otherwise the behavior is undefined.". assert(R == NoAlias || R == MayAlias); return R; } // If the max search depth is reached the result is undefined if (GEP1MaxLookupReached) return MayAlias; } // In the two GEP Case, if there is no difference in the offsets of the // computed pointers, the resultant pointers are a must alias. This // happens when we have two lexically identical GEP's (for example). // // In the other case, if we have getelementptr , 0, 0, 0, 0, ... and V2 // must aliases the GEP, the end result is a must alias also. if (GEP1BaseOffset == 0 && DecompGEP1.VarIndices.empty()) return MustAlias; // If there is a constant difference between the pointers, but the difference // is less than the size of the associated memory object, then we know // that the objects are partially overlapping. If the difference is // greater, we know they do not overlap. if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) { if (GEP1BaseOffset >= 0) { if (V2Size != MemoryLocation::UnknownSize) { if ((uint64_t)GEP1BaseOffset < V2Size) return PartialAlias; return NoAlias; } } else { // We have the situation where: // + + // | BaseOffset | // ---------------->| // |-->V1Size |-------> V2Size // GEP1 V2 // We need to know that V2Size is not unknown, otherwise we might have // stripped a gep with negative index ('gep , -1, ...). if (V1Size != MemoryLocation::UnknownSize && V2Size != MemoryLocation::UnknownSize) { if (-(uint64_t)GEP1BaseOffset < V1Size) return PartialAlias; return NoAlias; } } } if (!DecompGEP1.VarIndices.empty()) { uint64_t Modulo = 0; bool AllPositive = true; for (unsigned i = 0, e = DecompGEP1.VarIndices.size(); i != e; ++i) { // Try to distinguish something like &A[i][1] against &A[42][0]. // Grab the least significant bit set in any of the scales. We // don't need std::abs here (even if the scale's negative) as we'll // be ^'ing Modulo with itself later. Modulo |= (uint64_t)DecompGEP1.VarIndices[i].Scale; if (AllPositive) { // If the Value could change between cycles, then any reasoning about // the Value this cycle may not hold in the next cycle. We'll just // give up if we can't determine conditions that hold for every cycle: const Value *V = DecompGEP1.VarIndices[i].V; KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT); bool SignKnownZero = Known.isNonNegative(); bool SignKnownOne = Known.isNegative(); // Zero-extension widens the variable, and so forces the sign // bit to zero. 
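// Editor's note: a standalone sketch of the "Modulo" argument built up above
// and applied just below. Keeping only the lowest set bit of all variable
// scales OR-ed together gives a power of two that divides every symbolic
// term, so the two addresses can only differ by the constant offset modulo
// that stride; if that residue already clears both access sizes, the accesses
// cannot overlap. Plain integers rather than APInt/KnownBits; the sample
// numbers assume the &A[i][1] vs. &A[42][0] example from the comment above
// with 4-byte ints and 4 ints per row (row stride 16).
#include <cassert>
#include <cstdint>

static uint64_t lowestSetBit(uint64_t X) { return X ^ (X & (X - 1)); }

static bool provablyDisjoint(uint64_t OredScales, int64_t BaseOffset,
                             uint64_t V1Size, uint64_t V2Size) {
  uint64_t Modulo = lowestSetBit(OredScales);
  uint64_t ModOffset = (uint64_t)BaseOffset & (Modulo - 1);
  return ModOffset >= V2Size && V1Size <= Modulo - ModOffset;
}

int main() {
  assert(lowestSetBit(16 | 4) == 4);
  // &A[i][1] = A + 16*i + 4 and &A[42][0] = A + 672: the constant difference
  // 4 - 672 = -668 is 4 modulo the 16-byte row stride, so two 4-byte accesses
  // can never overlap, whatever i is.
  assert(provablyDisjoint(/*OredScales=*/16, /*BaseOffset=*/-668, 4, 4));
  // &A[i][1] vs. &A[42][1] differ by a multiple of 16, and i == 42 makes them
  // equal, so nothing can be proven.
  assert(!provablyDisjoint(16, -672, 4, 4));
  return 0;
}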
bool IsZExt = DecompGEP1.VarIndices[i].ZExtBits > 0 || isa(V); SignKnownZero |= IsZExt; SignKnownOne &= !IsZExt; // If the variable begins with a zero then we know it's // positive, regardless of whether the value is signed or // unsigned. int64_t Scale = DecompGEP1.VarIndices[i].Scale; AllPositive = (SignKnownZero && Scale >= 0) || (SignKnownOne && Scale < 0); } } Modulo = Modulo ^ (Modulo & (Modulo - 1)); // We can compute the difference between the two addresses // mod Modulo. Check whether that difference guarantees that the // two locations do not alias. uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1); if (V1Size != MemoryLocation::UnknownSize && V2Size != MemoryLocation::UnknownSize && ModOffset >= V2Size && V1Size <= Modulo - ModOffset) return NoAlias; // If we know all the variables are positive, then GEP1 >= GEP1BasePtr. // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr. if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t)GEP1BaseOffset) return NoAlias; if (constantOffsetHeuristic(DecompGEP1.VarIndices, V1Size, V2Size, GEP1BaseOffset, &AC, DT)) return NoAlias; } // Statically, we can see that the base objects are the same, but the // pointers have dynamic offsets which we can't resolve. And none of our // little tricks above worked. return MayAlias; } static AliasResult MergeAliasResults(AliasResult A, AliasResult B) { // If the results agree, take it. if (A == B) return A; // A mix of PartialAlias and MustAlias is PartialAlias. if ((A == PartialAlias && B == MustAlias) || (B == PartialAlias && A == MustAlias)) return PartialAlias; // Otherwise, we don't know anything. return MayAlias; } /// Provides a bunch of ad-hoc rules to disambiguate a Select instruction /// against another. AliasResult BasicAAResult::aliasSelect(const SelectInst *SI, LocationSize SISize, const AAMDNodes &SIAAInfo, const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo, const Value *UnderV2) { // If the values are Selects with the same condition, we can do a more precise // check: just check for aliases between the values on corresponding arms. if (const SelectInst *SI2 = dyn_cast(V2)) if (SI->getCondition() == SI2->getCondition()) { AliasResult Alias = aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, SI2->getTrueValue(), V2Size, V2AAInfo); if (Alias == MayAlias) return MayAlias; AliasResult ThisAlias = aliasCheck(SI->getFalseValue(), SISize, SIAAInfo, SI2->getFalseValue(), V2Size, V2AAInfo); return MergeAliasResults(ThisAlias, Alias); } // If both arms of the Select node NoAlias or MustAlias V2, then returns // NoAlias / MustAlias. Otherwise, returns MayAlias. AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(), SISize, SIAAInfo, UnderV2); if (Alias == MayAlias) return MayAlias; AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(), SISize, SIAAInfo, UnderV2); return MergeAliasResults(ThisAlias, Alias); } /// Provide a bunch of ad-hoc rules to disambiguate a PHI instruction against /// another. AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, const AAMDNodes &PNAAInfo, const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo, const Value *UnderV2) { // Track phi nodes we have visited. We use this information when we determine // value equivalence. 
VisitedPhiBBs.insert(PN->getParent()); // If the values are PHIs in the same block, we can do a more precise // as well as efficient check: just check for aliases between the values // on corresponding edges. if (const PHINode *PN2 = dyn_cast(V2)) if (PN2->getParent() == PN->getParent()) { LocPair Locs(MemoryLocation(PN, PNSize, PNAAInfo), MemoryLocation(V2, V2Size, V2AAInfo)); if (PN > V2) std::swap(Locs.first, Locs.second); // Analyse the PHIs' inputs under the assumption that the PHIs are // NoAlias. // If the PHIs are May/MustAlias there must be (recursively) an input // operand from outside the PHIs' cycle that is MayAlias/MustAlias or // there must be an operation on the PHIs within the PHIs' value cycle // that causes a MayAlias. // Pretend the phis do not alias. AliasResult Alias = NoAlias; assert(AliasCache.count(Locs) && "There must exist an entry for the phi node"); AliasResult OrigAliasResult = AliasCache[Locs]; AliasCache[Locs] = NoAlias; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { AliasResult ThisAlias = aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo, PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)), V2Size, V2AAInfo); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == MayAlias) break; } // Reset if speculation failed. if (Alias != NoAlias) AliasCache[Locs] = OrigAliasResult; return Alias; } SmallVector V1Srcs; bool isRecursive = false; if (PV) { // If we have PhiValues then use it to get the underlying phi values. const PhiValues::ValueSet &PhiValueSet = PV->getValuesForPhi(PN); // If we have more phi values than the search depth then return MayAlias // conservatively to avoid compile time explosion. The worst possible case // is if both sides are PHI nodes. In which case, this is O(m x n) time // where 'm' and 'n' are the number of PHI sources. if (PhiValueSet.size() > MaxLookupSearchDepth) return MayAlias; // Add the values to V1Srcs for (Value *PV1 : PhiValueSet) { if (EnableRecPhiAnalysis) { if (GEPOperator *PV1GEP = dyn_cast(PV1)) { // Check whether the incoming value is a GEP that advances the pointer // result of this PHI node (e.g. in a loop). If this is the case, we // would recurse and always get a MayAlias. Handle this case specially // below. if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && isa(PV1GEP->idx_begin())) { isRecursive = true; continue; } } } V1Srcs.push_back(PV1); } } else { // If we don't have PhiInfo then just look at the operands of the phi itself // FIXME: Remove this once we can guarantee that we have PhiInfo always SmallPtrSet UniqueSrc; for (Value *PV1 : PN->incoming_values()) { if (isa(PV1)) // If any of the source itself is a PHI, return MayAlias conservatively // to avoid compile time explosion. The worst possible case is if both // sides are PHI nodes. In which case, this is O(m x n) time where 'm' // and 'n' are the number of PHI sources. return MayAlias; if (EnableRecPhiAnalysis) if (GEPOperator *PV1GEP = dyn_cast(PV1)) { // Check whether the incoming value is a GEP that advances the pointer // result of this PHI node (e.g. in a loop). If this is the case, we // would recurse and always get a MayAlias. Handle this case specially // below. if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && isa(PV1GEP->idx_begin())) { isRecursive = true; continue; } } if (UniqueSrc.insert(PV1).second) V1Srcs.push_back(PV1); } } // If V1Srcs is empty then that means that the phi has no underlying non-phi // value. 
This should only be possible in blocks unreachable from the entry // block, but return MayAlias just in case. if (V1Srcs.empty()) return MayAlias; // If this PHI node is recursive, set the size of the accessed memory to // unknown to represent all the possible values the GEP could advance the // pointer to. if (isRecursive) PNSize = MemoryLocation::UnknownSize; AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0], PNSize, PNAAInfo, UnderV2); // Early exit if the check of the first PHI source against V2 is MayAlias. // Other results are not possible. if (Alias == MayAlias) return MayAlias; // If all sources of the PHI node NoAlias or MustAlias V2, then returns // NoAlias / MustAlias. Otherwise, returns MayAlias. for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) { Value *V = V1Srcs[i]; AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo, V, PNSize, PNAAInfo, UnderV2); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == MayAlias) break; } return Alias; } /// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as /// array references. AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, AAMDNodes V1AAInfo, const Value *V2, LocationSize V2Size, AAMDNodes V2AAInfo, const Value *O1, const Value *O2) { // If either of the memory references is empty, it doesn't matter what the // pointer values are. if (V1Size == 0 || V2Size == 0) return NoAlias; // Strip off any casts if they exist. V1 = V1->stripPointerCastsAndInvariantGroups(); V2 = V2->stripPointerCastsAndInvariantGroups(); // If V1 or V2 is undef, the result is NoAlias because we can always pick a // value for undef that aliases nothing in the program. if (isa(V1) || isa(V2)) return NoAlias; // Are we checking for alias of the same value? // Because we look 'through' phi nodes, we could look at "Value" pointers from // different iterations. We must therefore make sure that this is not the // case. The function isValueEqualInPotentialCycles ensures that this cannot // happen by looking at the visited phi nodes and making sure they cannot // reach the value. if (isValueEqualInPotentialCycles(V1, V2)) return MustAlias; if (!V1->getType()->isPointerTy() || !V2->getType()->isPointerTy()) return NoAlias; // Scalars cannot alias each other // Figure out what objects these things are pointing to if we can. if (O1 == nullptr) O1 = GetUnderlyingObject(V1, DL, MaxLookupSearchDepth); if (O2 == nullptr) O2 = GetUnderlyingObject(V2, DL, MaxLookupSearchDepth); // Null values in the default address space don't point to any object, so they // don't alias any other pointer. if (const ConstantPointerNull *CPN = dyn_cast(O1)) if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace())) return NoAlias; if (const ConstantPointerNull *CPN = dyn_cast(O2)) if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace())) return NoAlias; if (O1 != O2) { // If V1/V2 point to two different objects, we know that we have no alias. if (isIdentifiedObject(O1) && isIdentifiedObject(O2)) return NoAlias; // Constant pointers can't alias with non-const isIdentifiedObject objects. if ((isa(O1) && isIdentifiedObject(O2) && !isa(O2)) || (isa(O2) && isIdentifiedObject(O1) && !isa(O1))) return NoAlias; // Function arguments can't alias with things that are known to be // unambigously identified at the function level. 
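// Editor's note: a standalone restatement of MergeAliasResults above, the
// meet used when folding per-arm results for selects and per-incoming-value
// results for PHIs. Agreement is kept, PartialAlias absorbs MustAlias, and
// any other disagreement degrades to MayAlias. Plain enum here, not LLVM's
// AliasResult.
#include <cassert>

enum Alias { NoAlias, MayAlias, PartialAlias, MustAlias };

static Alias mergeAliasResults(Alias A, Alias B) {
  if (A == B)
    return A;
  if ((A == PartialAlias && B == MustAlias) ||
      (B == PartialAlias && A == MustAlias))
    return PartialAlias;
  return MayAlias;
}

int main() {
  // Both arms of a select no-alias the other pointer, so the select does too.
  assert(mergeAliasResults(NoAlias, NoAlias) == NoAlias);
  // One arm overlaps exactly, the other partially: only partial is certain.
  assert(mergeAliasResults(MustAlias, PartialAlias) == PartialAlias);
  // NoAlias on one arm and MustAlias on the other tells us nothing overall.
  assert(mergeAliasResults(NoAlias, MustAlias) == MayAlias);
  return 0;
}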
if ((isa(O1) && isIdentifiedFunctionLocal(O2)) || (isa(O2) && isIdentifiedFunctionLocal(O1))) return NoAlias; // If one pointer is the result of a call/invoke or load and the other is a // non-escaping local object within the same function, then we know the // object couldn't escape to a point where the call could return it. // // Note that if the pointers are in different functions, there are a // variety of complications. A call with a nocapture argument may still // temporary store the nocapture argument's value in a temporary memory // location if that memory location doesn't escape. Or it may pass a // nocapture value to other functions as long as they don't capture it. if (isEscapeSource(O1) && isNonEscapingLocalObject(O2)) return NoAlias; if (isEscapeSource(O2) && isNonEscapingLocalObject(O1)) return NoAlias; } // If the size of one access is larger than the entire object on the other // side, then we know such behavior is undefined and can assume no alias. bool NullIsValidLocation = NullPointerIsDefined(&F); if ((V1Size != MemoryLocation::UnknownSize && isObjectSmallerThan(O2, V1Size, DL, TLI, NullIsValidLocation)) || (V2Size != MemoryLocation::UnknownSize && isObjectSmallerThan(O1, V2Size, DL, TLI, NullIsValidLocation))) return NoAlias; // Check the cache before climbing up use-def chains. This also terminates // otherwise infinitely recursive queries. LocPair Locs(MemoryLocation(V1, V1Size, V1AAInfo), MemoryLocation(V2, V2Size, V2AAInfo)); if (V1 > V2) std::swap(Locs.first, Locs.second); std::pair Pair = AliasCache.insert(std::make_pair(Locs, MayAlias)); if (!Pair.second) return Pair.first->second; // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the // GEP can't simplify, we don't even look at the PHI cases. if (!isa(V1) && isa(V2)) { std::swap(V1, V2); std::swap(V1Size, V2Size); std::swap(O1, O2); std::swap(V1AAInfo, V2AAInfo); } if (const GEPOperator *GV1 = dyn_cast(V1)) { AliasResult Result = aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2); if (Result != MayAlias) return AliasCache[Locs] = Result; } if (isa(V2) && !isa(V1)) { std::swap(V1, V2); std::swap(O1, O2); std::swap(V1Size, V2Size); std::swap(V1AAInfo, V2AAInfo); } if (const PHINode *PN = dyn_cast(V1)) { AliasResult Result = aliasPHI(PN, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O2); if (Result != MayAlias) return AliasCache[Locs] = Result; } if (isa(V2) && !isa(V1)) { std::swap(V1, V2); std::swap(O1, O2); std::swap(V1Size, V2Size); std::swap(V1AAInfo, V2AAInfo); } if (const SelectInst *S1 = dyn_cast(V1)) { AliasResult Result = aliasSelect(S1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O2); if (Result != MayAlias) return AliasCache[Locs] = Result; } // If both pointers are pointing into the same object and one of them // accesses the entire object, then the accesses must overlap in some way. if (O1 == O2) if (V1Size != MemoryLocation::UnknownSize && V2Size != MemoryLocation::UnknownSize && (isObjectSize(O1, V1Size, DL, TLI, NullIsValidLocation) || isObjectSize(O2, V2Size, DL, TLI, NullIsValidLocation))) return AliasCache[Locs] = PartialAlias; // Recurse back into the best AA results we have, potentially with refined // memory locations. We have already ensured that BasicAA has a MayAlias // cache result for these, so any recursion back into BasicAA won't loop. AliasResult Result = getBestAAResults().alias(Locs.first, Locs.second); return AliasCache[Locs] = Result; } /// Check whether two Values can be considered equivalent. 
/// /// In addition to pointer equivalence of \p V1 and \p V2 this checks whether /// they can not be part of a cycle in the value graph by looking at all /// visited phi nodes an making sure that the phis cannot reach the value. We /// have to do this because we are looking through phi nodes (That is we say /// noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB). bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V, const Value *V2) { if (V != V2) return false; const Instruction *Inst = dyn_cast(V); if (!Inst) return true; if (VisitedPhiBBs.empty()) return true; if (VisitedPhiBBs.size() > MaxNumPhiBBsValueReachabilityCheck) return false; // Make sure that the visited phis cannot reach the Value. This ensures that // the Values cannot come from different iterations of a potential cycle the // phi nodes could be involved in. for (auto *P : VisitedPhiBBs) if (isPotentiallyReachable(&P->front(), Inst, DT, LI)) return false; return true; } /// Computes the symbolic difference between two de-composed GEPs. /// /// Dest and Src are the variable indices from two decomposed GetElementPtr /// instructions GEP1 and GEP2 which have common base pointers. void BasicAAResult::GetIndexDifference( SmallVectorImpl &Dest, const SmallVectorImpl &Src) { if (Src.empty()) return; for (unsigned i = 0, e = Src.size(); i != e; ++i) { const Value *V = Src[i].V; unsigned ZExtBits = Src[i].ZExtBits, SExtBits = Src[i].SExtBits; int64_t Scale = Src[i].Scale; // Find V in Dest. This is N^2, but pointer indices almost never have more // than a few variable indexes. for (unsigned j = 0, e = Dest.size(); j != e; ++j) { if (!isValueEqualInPotentialCycles(Dest[j].V, V) || Dest[j].ZExtBits != ZExtBits || Dest[j].SExtBits != SExtBits) continue; // If we found it, subtract off Scale V's from the entry in Dest. If it // goes to zero, remove the entry. if (Dest[j].Scale != Scale) Dest[j].Scale -= Scale; else Dest.erase(Dest.begin() + j); Scale = 0; break; } // If we didn't consume this entry, add it to the end of the Dest list. if (Scale) { VariableGEPIndex Entry = {V, ZExtBits, SExtBits, -Scale}; Dest.push_back(Entry); } } } bool BasicAAResult::constantOffsetHeuristic( const SmallVectorImpl &VarIndices, LocationSize V1Size, LocationSize V2Size, int64_t BaseOffset, AssumptionCache *AC, DominatorTree *DT) { if (VarIndices.size() != 2 || V1Size == MemoryLocation::UnknownSize || V2Size == MemoryLocation::UnknownSize) return false; const VariableGEPIndex &Var0 = VarIndices[0], &Var1 = VarIndices[1]; if (Var0.ZExtBits != Var1.ZExtBits || Var0.SExtBits != Var1.SExtBits || Var0.Scale != -Var1.Scale) return false; unsigned Width = Var1.V->getType()->getIntegerBitWidth(); // We'll strip off the Extensions of Var0 and Var1 and do another round // of GetLinearExpression decomposition. In the example above, if Var0 // is zext(%x + 1) we should get V1 == %x and V1Offset == 1. APInt V0Scale(Width, 0), V0Offset(Width, 0), V1Scale(Width, 0), V1Offset(Width, 0); bool NSW = true, NUW = true; unsigned V0ZExtBits = 0, V0SExtBits = 0, V1ZExtBits = 0, V1SExtBits = 0; const Value *V0 = GetLinearExpression(Var0.V, V0Scale, V0Offset, V0ZExtBits, V0SExtBits, DL, 0, AC, DT, NSW, NUW); NSW = true; NUW = true; const Value *V1 = GetLinearExpression(Var1.V, V1Scale, V1Offset, V1ZExtBits, V1SExtBits, DL, 0, AC, DT, NSW, NUW); if (V0Scale != V1Scale || V0ZExtBits != V1ZExtBits || V0SExtBits != V1SExtBits || !isValueEqualInPotentialCycles(V0, V1)) return false; // We have a hit - Var0 and Var1 only differ by a constant offset! 
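// Editor's note: a standalone version of the wrapped "minimum distance"
// computation that follows in constantOffsetHeuristic. In w-bit arithmetic
// the distance between x and x + c is the smaller of c mod 2^w and
// 2^w - (c mod 2^w); the i3 example from the comment gives a minimum distance
// of 3 between %i and %i + 5. Plain uint64_t math stands in for APInt here.
#include <cassert>
#include <cstdint>

static uint64_t minWrappedDistance(uint64_t OffA, uint64_t OffB,
                                   unsigned Bits) {
  uint64_t Mask = (Bits == 64) ? ~0ULL : ((1ULL << Bits) - 1);
  uint64_t Diff = (OffA - OffB) & Mask; // MinDiff = V0Offset - V1Offset
  uint64_t Wrapped = (0 - Diff) & Mask; // Wrapped = -MinDiff
  return Diff < Wrapped ? Diff : Wrapped; // APIntOps::umin
}

int main() {
  // "add i3 %i, 5": if %i == 7 then (7 + 5) mod 8 == 4, so %i and %i + 5 can
  // be as little as 3 apart.
  assert(minWrappedDistance(5, 0, 3) == 3);
  // With plenty of headroom the distance is just the offset difference.
  assert(minWrappedDistance(2, 0, 32) == 2);
  return 0;
}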
  // If we've been sext'ed then zext'd, the maximum difference between Var0 and
  // Var1 is possible to calculate, but we're just interested in the absolute
  // minimum difference between the two. The minimum distance may occur due to
  // wrapping; consider "add i3 %i, 5": if %i == 7 then 7 + 5 mod 8 == 4, and so
  // the minimum distance between %i and %i + 5 is 3.
  APInt MinDiff = V0Offset - V1Offset, Wrapped = -MinDiff;
  MinDiff = APIntOps::umin(MinDiff, Wrapped);
  uint64_t MinDiffBytes = MinDiff.getZExtValue() * std::abs(Var0.Scale);

  // We can't definitely say whether GEP1 is before or after V2 due to wrapping
  // arithmetic (i.e. for some values of GEP1 and V2 GEP1 < V2, and for other
  // values GEP1 > V2). We'll therefore only declare NoAlias if both V1Size and
  // V2Size can fit in the MinDiffBytes gap.
  return V1Size + std::abs(BaseOffset) <= MinDiffBytes &&
         V2Size + std::abs(BaseOffset) <= MinDiffBytes;
}

//===----------------------------------------------------------------------===//
// BasicAliasAnalysis Pass
//===----------------------------------------------------------------------===//

AnalysisKey BasicAA::Key;

BasicAAResult BasicAA::run(Function &F, FunctionAnalysisManager &AM) {
  return BasicAAResult(F.getParent()->getDataLayout(),
                       F,
                       AM.getResult<TargetLibraryAnalysis>(F),
                       AM.getResult<AssumptionAnalysis>(F),
                       &AM.getResult<DominatorTreeAnalysis>(F),
                       AM.getCachedResult<LoopAnalysis>(F),
                       AM.getCachedResult<PhiValuesAnalysis>(F));
}

BasicAAWrapperPass::BasicAAWrapperPass() : FunctionPass(ID) {
  initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry());
}

char BasicAAWrapperPass::ID = 0;

void BasicAAWrapperPass::anchor() {}

INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basicaa",
                      "Basic Alias Analysis (stateless AA impl)", false, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(BasicAAWrapperPass, "basicaa",
                    "Basic Alias Analysis (stateless AA impl)", false, true)

FunctionPass *llvm::createBasicAAWrapperPass() {
  return new BasicAAWrapperPass();
}

bool BasicAAWrapperPass::runOnFunction(Function &F) {
  auto &ACT = getAnalysis<AssumptionCacheTracker>();
  auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
  auto &DTWP = getAnalysis<DominatorTreeWrapperPass>();
  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
  auto *PVWP = getAnalysisIfAvailable<PhiValuesWrapperPass>();

  Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), F,
                                 TLIWP.getTLI(), ACT.getAssumptionCache(F),
                                 &DTWP.getDomTree(),
                                 LIWP ? &LIWP->getLoopInfo() : nullptr,
                                 PVWP ? &PVWP->getResult() : nullptr));

  return false;
}

void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<TargetLibraryInfoWrapperPass>();
  AU.addUsedIfAvailable<PhiValuesWrapperPass>();
}

BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) {
  return BasicAAResult(
      F.getParent()->getDataLayout(), F,
      P.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
      P.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
}
Index: vendor/llvm/dist-release_70/lib/Analysis/InstructionSimplify.cpp
===================================================================
--- vendor/llvm/dist-release_70/lib/Analysis/InstructionSimplify.cpp (revision 337999)
+++ vendor/llvm/dist-release_70/lib/Analysis/InstructionSimplify.cpp (revision 338000)
@@ -1,5181 +1,5181 @@
//===- InstructionSimplify.cpp - Fold instruction operands ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements routines for folding instructions into simpler forms
// that do not require creating new instructions. This does constant folding
// ("add i32 1, 1" -> "2") but can also handle non-constant operands, either
// returning a constant ("and i32 %x, 0" -> "0") or an already existing value
// ("and i32 %x, %x" -> "%x"). All operands are assumed to have already been
// simplified: this is usually true, and assuming it simplifies the logic (if
// they have not been simplified then the results are correct but maybe
// suboptimal).
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/KnownBits.h"
#include <algorithm>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "instsimplify"

enum { RecursionLimit = 3 };

STATISTIC(NumExpand,  "Number of expansions");
STATISTIC(NumReassoc, "Number of reassociations");

static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &,
                              unsigned);
static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &,
                            unsigned);
static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &,
                              const SimplifyQuery &, unsigned);
static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &,
                              unsigned);
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                               const SimplifyQuery &Q, unsigned MaxRecurse);
static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyCastInst(unsigned, Value *, Type *, const SimplifyQuery &,
                               unsigned);
static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, const SimplifyQuery &,
                              unsigned);

static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal,
                                     Value *FalseVal) {
  BinaryOperator::BinaryOps BinOpCode;
  if (auto *BO = dyn_cast<BinaryOperator>(Cond))
    BinOpCode = BO->getOpcode();
  else
    return nullptr;

  CmpInst::Predicate ExpectedPred, Pred1, Pred2;
  if (BinOpCode == BinaryOperator::Or) {
    ExpectedPred = ICmpInst::ICMP_NE;
  } else if (BinOpCode == BinaryOperator::And) {
    ExpectedPred = ICmpInst::ICMP_EQ;
  } else
    return nullptr;

  // %A = icmp eq %TV, %FV
  // %B = icmp eq %X, %Y (and one of these is a select operand)
  // %C = and %A, %B
  // %D = select %C, %TV, %FV
  // -->
  // %FV

  // %A = icmp ne %TV, %FV
  // %B = icmp ne %X, %Y (and one of these is a select operand)
  // %C = or %A, %B
  // %D = select %C, %TV, %FV
  // -->
  // %TV
  Value *X, *Y;
  if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal),
                                      m_Specific(FalseVal)),
                             m_ICmp(Pred2, m_Value(X), m_Value(Y)))) ||
      Pred1 != Pred2 || Pred1 != ExpectedPred)
    return nullptr;

  if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal)
    return BinOpCode == BinaryOperator::Or ? TrueVal : FalseVal;

  return nullptr;
}
/// For a boolean type or a vector of boolean type, return false or a vector
/// with every element false.
static Constant *getFalse(Type *Ty) {
  return ConstantInt::getFalse(Ty);
}

/// For a boolean type or a vector of boolean type, return true or a vector
/// with every element true.
static Constant *getTrue(Type *Ty) {
  return ConstantInt::getTrue(Ty);
}

/// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"?
static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS,
                          Value *RHS) {
  CmpInst *Cmp = dyn_cast<CmpInst>(V);
  if (!Cmp)
    return false;
  CmpInst::Predicate CPred = Cmp->getPredicate();
  Value *CLHS = Cmp->getOperand(0), *CRHS = Cmp->getOperand(1);
  if (CPred == Pred && CLHS == LHS && CRHS == RHS)
    return true;
  return CPred == CmpInst::getSwappedPredicate(Pred) && CLHS == RHS &&
         CRHS == LHS;
}

/// Does the given value dominate the specified phi node?
static bool valueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    // Arguments and constants dominate all instructions.
    return true;

  // If we are processing instructions (and/or basic blocks) that have not been
  // fully added to a function, the parent nodes may still be null. Simply
  // return the conservative answer in these cases.
  if (!I->getParent() || !P->getParent() || !I->getFunction())
    return false;

  // If we have a DominatorTree then do a precise test.
  if (DT)
    return DT->dominates(I, P);

  // Otherwise, if the instruction is in the entry block and is not an invoke,
  // then it obviously dominates all phi nodes.
  if (I->getParent() == &I->getFunction()->getEntryBlock() &&
      !isa<InvokeInst>(I))
    return true;

  return false;
}

/// Simplify "A op (B op' C)" by distributing op over op', turning it into
/// "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is
/// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS.
/// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
/// Returns the simplified value, or null if no simplification was performed.
static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
                          Instruction::BinaryOps OpcodeToExpand,
                          const SimplifyQuery &Q, unsigned MaxRecurse) {
  // Recursion is always used, so bail out at once if we already hit the limit.
  if (!MaxRecurse--)
    return nullptr;

  // Check whether the expression has the form "(A op' B) op C".
  if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
    if (Op0->getOpcode() == OpcodeToExpand) {
      // It does!  Try turning it into "(A op C) op' (B op C)".
      Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
      // Do "A op C" and "B op C" both simplify?
      if (Value *L = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse))
        if (Value *R = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) {
          // They do! Return "L op' R" if it simplifies or is already available.
          // If "L op' R" equals "A op' B" then "L op' R" is just the LHS.
          if ((L == A && R == B) ||
              (Instruction::isCommutative(OpcodeToExpand) && L == B &&
               R == A)) {
            ++NumExpand;
            return LHS;
          }
          // Otherwise return "L op' R" if it simplifies.
          if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) {
            ++NumExpand;
            return V;
          }
        }
    }

  // Check whether the expression has the form "A op (B op' C)".
  if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
    if (Op1->getOpcode() == OpcodeToExpand) {
      // It does!  Try turning it into "(A op B) op' (A op C)".
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); // Do "A op B" and "A op C" both simplify? if (Value *L = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) if (Value *R = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. // If "L op' R" equals "B op' C" then "L op' R" is just the RHS. if ((L == B && R == C) || (Instruction::isCommutative(OpcodeToExpand) && L == C && R == B)) { ++NumExpand; return RHS; } // Otherwise return "L op' R" if it simplifies. if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } } } return nullptr; } /// Generic simplifications for associative binary operations. /// Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { assert(Instruction::isAssociative(Opcode) && "Not an associative operation!"); // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); // Transform: "(A op B) op C" ==> "A op (B op C)" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { Value *A = Op0->getOperand(0); Value *B = Op0->getOperand(1); Value *C = RHS; // Does "B op C" simplify? if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // It does! Return "A op V" if it simplifies or is already available. // If V equals B then "A op V" is just the LHS. if (V == B) return LHS; // Otherwise return "A op V" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // Transform: "A op (B op C)" ==> "(A op B) op C" if it simplifies completely. if (Op1 && Op1->getOpcode() == Opcode) { Value *A = LHS; Value *B = Op1->getOperand(0); Value *C = Op1->getOperand(1); // Does "A op B" simplify? if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) { // It does! Return "V op C" if it simplifies or is already available. // If V equals B then "V op C" is just the RHS. if (V == B) return RHS; // Otherwise return "V op C" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // The remaining transforms require commutativity as well as associativity. if (!Instruction::isCommutative(Opcode)) return nullptr; // Transform: "(A op B) op C" ==> "(C op A) op B" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { Value *A = Op0->getOperand(0); Value *B = Op0->getOperand(1); Value *C = RHS; // Does "C op A" simplify? if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "V op B" if it simplifies or is already available. // If V equals A then "V op B" is just the LHS. if (V == A) return LHS; // Otherwise return "V op B" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // Transform: "A op (B op C)" ==> "B op (C op A)" if it simplifies completely. if (Op1 && Op1->getOpcode() == Opcode) { Value *A = LHS; Value *B = Op1->getOperand(0); Value *C = Op1->getOperand(1); // Does "C op A" simplify? if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "B op V" if it simplifies or is already available. // If V equals C then "B op V" is just the RHS. if (V == C) return RHS; // Otherwise return "B op V" if it simplifies. 
if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) { ++NumReassoc; return W; } } } return nullptr; } /// In the case of a binary operation with a select instruction as an operand, /// try to simplify the binop by seeing whether evaluating it on both branches /// of the select results in the same value. Returns the common value if so, /// otherwise returns null. static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; SelectInst *SI; if (isa(LHS)) { SI = cast(LHS); } else { assert(isa(RHS) && "No select instruction operand!"); SI = cast(RHS); } // Evaluate the BinOp on the true and false branches of the select. Value *TV; Value *FV; if (SI == LHS) { TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse); FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse); } else { TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse); FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse); } // If they simplified to the same value, then return the common value. // If they both failed to simplify then return null. if (TV == FV) return TV; // If one branch simplified to undef, return the other one. if (TV && isa(TV)) return FV; if (FV && isa(FV)) return TV; // If applying the operation did not change the true and false select values, // then the result of the binop is the select itself. if (TV == SI->getTrueValue() && FV == SI->getFalseValue()) return SI; // If one branch simplified and the other did not, and the simplified // value is equal to the unsimplified one, return the simplified value. // For example, select (cond, X, X & Z) & Z -> X & Z. if ((FV && !TV) || (TV && !FV)) { // Check that the simplified value has the form "X op Y" where "op" is the // same as the original operation. Instruction *Simplified = dyn_cast(FV ? FV : TV); if (Simplified && Simplified->getOpcode() == unsigned(Opcode)) { // The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS". // We already know that "op" is the same as for the simplified value. See // if the operands match too. If so, return the simplified value. Value *UnsimplifiedBranch = FV ? SI->getTrueValue() : SI->getFalseValue(); Value *UnsimplifiedLHS = SI == LHS ? UnsimplifiedBranch : LHS; Value *UnsimplifiedRHS = SI == LHS ? RHS : UnsimplifiedBranch; if (Simplified->getOperand(0) == UnsimplifiedLHS && Simplified->getOperand(1) == UnsimplifiedRHS) return Simplified; if (Simplified->isCommutative() && Simplified->getOperand(1) == UnsimplifiedLHS && Simplified->getOperand(0) == UnsimplifiedRHS) return Simplified; } } return nullptr; } /// In the case of a comparison with a select instruction, try to simplify the /// comparison by seeing whether both branches of the select result in the same /// value. Returns the common value if so, otherwise returns null. static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Make sure the select is on the LHS. 
if (!isa(LHS)) { std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } assert(isa(LHS) && "Not comparing with a select instruction!"); SelectInst *SI = cast(LHS); Value *Cond = SI->getCondition(); Value *TV = SI->getTrueValue(); Value *FV = SI->getFalseValue(); // Now that we have "cmp select(Cond, TV, FV), RHS", analyse it. // Does "cmp TV, RHS" simplify? Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, Q, MaxRecurse); if (TCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'true'. TCmp = getTrue(Cond->getType()); } else if (!TCmp) { // It didn't simplify. However if "cmp TV, RHS" is equal to the select // condition then we can replace it with 'true'. Otherwise give up. if (!isSameCompare(Cond, Pred, TV, RHS)) return nullptr; TCmp = getTrue(Cond->getType()); } // Does "cmp FV, RHS" simplify? Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, Q, MaxRecurse); if (FCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'false'. FCmp = getFalse(Cond->getType()); } else if (!FCmp) { // It didn't simplify. However if "cmp FV, RHS" is equal to the select // condition then we can replace it with 'false'. Otherwise give up. if (!isSameCompare(Cond, Pred, FV, RHS)) return nullptr; FCmp = getFalse(Cond->getType()); } // If both sides simplified to the same value, then use it as the result of // the original comparison. if (TCmp == FCmp) return TCmp; // The remaining cases only make sense if the select condition has the same // type as the result of the comparison, so bail out if this is not so. if (Cond->getType()->isVectorTy() != RHS->getType()->isVectorTy()) return nullptr; // If the false value simplified to false, then the result of the compare // is equal to "Cond && TCmp". This also catches the case when the false // value simplified to false and the true value to true, returning "Cond". if (match(FCmp, m_Zero())) if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse)) return V; // If the true value simplified to true, then the result of the compare // is equal to "Cond || FCmp". if (match(TCmp, m_One())) if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse)) return V; // Finally, if the false value simplified to true and the true value to // false, then the result of the compare is equal to "!Cond". if (match(FCmp, m_One()) && match(TCmp, m_Zero())) if (Value *V = SimplifyXorInst(Cond, Constant::getAllOnesValue(Cond->getType()), Q, MaxRecurse)) return V; return nullptr; } /// In the case of a binary operation with an operand that is a PHI instruction, /// try to simplify the binop by seeing whether evaluating it on the incoming /// phi values yields the same result for every value. If so returns the common /// value, otherwise returns null. static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; PHINode *PI; if (isa(LHS)) { PI = cast(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. if (!valueDominatesPHI(RHS, PI, Q.DT)) return nullptr; } else { assert(isa(RHS) && "No PHI instruction operand!"); PI = cast(RHS); // Bail out if LHS and the phi may be mutually interdependent due to a loop. if (!valueDominatesPHI(LHS, PI, Q.DT)) return nullptr; } // Evaluate the BinOp on the incoming phi values. 
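  // As an illustration of the case the loop below is after (a sketch with
  // made-up names, not taken from this file): if %x is a function argument and
  //   %p = phi i32 [ %x, %bb.a ], [ 0, %bb.b ]
  //   %r = or i32 %p, %x
  // then the per-edge evaluations are "or %x, %x" -> %x and "or 0, %x" -> %x,
  // so every incoming value yields the same result and %r folds to %x.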
Value *CommonValue = nullptr; for (Value *Incoming : PI->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PI) continue; Value *V = PI == LHS ? SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) : SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) return nullptr; CommonValue = V; } return CommonValue; } /// In the case of a comparison with a PHI instruction, try to simplify the /// comparison by seeing whether comparing with all of the incoming phi values /// yields the same result every time. If so returns the common result, /// otherwise returns null. static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Make sure the phi is on the LHS. if (!isa(LHS)) { std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } assert(isa(LHS) && "Not comparing with a phi instruction!"); PHINode *PI = cast(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. if (!valueDominatesPHI(RHS, PI, Q.DT)) return nullptr; // Evaluate the BinOp on the incoming phi values. Value *CommonValue = nullptr; for (Value *Incoming : PI->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PI) continue; Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) return nullptr; CommonValue = V; } return CommonValue; } static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode, Value *&Op0, Value *&Op1, const SimplifyQuery &Q) { if (auto *CLHS = dyn_cast(Op0)) { if (auto *CRHS = dyn_cast(Op1)) return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL); // Canonicalize the constant to the RHS if this is a commutative operation. if (Instruction::isCommutative(Opcode)) std::swap(Op0, Op1); } return nullptr; } /// Given operands for an Add, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q)) return C; // X + undef -> undef if (match(Op1, m_Undef())) return Op1; // X + 0 -> X if (match(Op1, m_Zero())) return Op0; // If two operands are negative, return 0. if (isKnownNegation(Op0, Op1)) return Constant::getNullValue(Op0->getType()); // X + (Y - X) -> Y // (Y - X) + X -> Y // Eg: X + -X -> 0 Value *Y = nullptr; if (match(Op1, m_Sub(m_Value(Y), m_Specific(Op0))) || match(Op0, m_Sub(m_Value(Y), m_Specific(Op1)))) return Y; // X + ~X -> -1 since ~X = -X-1 Type *Ty = Op0->getType(); if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Ty); // add nsw/nuw (xor Y, signmask), signmask --> Y // The no-wrapping add guarantees that the top bit will be set by the add. // Therefore, the xor must be clearing the already set sign bit of Y. if ((IsNSW || IsNUW) && match(Op1, m_SignMask()) && match(Op0, m_Xor(m_Value(Y), m_SignMask()))) return Y; // add nuw %x, -1 -> -1, because %x can only be 0. 
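  // Spelling out the reasoning above with a concrete width (an illustrative
  // aside, not additional code): for "add nuw i8 %x, 255", any nonzero %x
  // would make the unsigned sum exceed 255 and wrap, violating the nuw flag;
  // the only value %x may take is 0, so the result is always 255, i.e. -1.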
if (IsNUW && match(Op1, m_AllOnes())) return Op1; // Which is -1. /// i1 add -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q, MaxRecurse)) return V; // Threading Add over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A + select(cond, B, C)" means evaluating // "A+B" and "A+C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A+B" and "A+C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Query) { return ::SimplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit); } /// Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and /// accumulates the total constant offset applied in the returned constant. It /// returns 0 if V is not a pointer, and returns the constant '0' if there are /// no constant offsets applied. /// /// This is very similar to GetPointerBaseWithConstantOffset except it doesn't /// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc. /// folding. static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, bool AllowNonInbounds = false) { assert(V->getType()->isPtrOrPtrVectorTy()); Type *IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType(); APInt Offset = APInt::getNullValue(IntPtrTy->getIntegerBitWidth()); // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet Visited; Visited.insert(V); do { if (GEPOperator *GEP = dyn_cast(V)) { if ((!AllowNonInbounds && !GEP->isInBounds()) || !GEP->accumulateConstantOffset(DL, Offset)) break; V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->isInterposable()) break; V = GA->getAliasee(); } else { if (auto CS = CallSite(V)) if (Value *RV = CS.getReturnedArgOperand()) { V = RV; continue; } break; } assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!"); } while (Visited.insert(V).second); Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset); if (V->getType()->isVectorTy()) return ConstantVector::getSplat(V->getType()->getVectorNumElements(), OffsetIntPtr); return OffsetIntPtr; } /// Compute the constant difference between two pointer values. /// If the difference is not a constant, returns zero. static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, Value *RHS) { Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are not related via constant offsets to the same base // value, there is nothing we can do here. 
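  // A concrete sketch of the related case (names made up for illustration):
  // for
  //   %p = getelementptr inbounds i8, i8* %base, i64 4
  //   %q = getelementptr inbounds i8, i8* %base, i64 12
  // stripAndComputeConstantOffsets rewrites both LHS and RHS to %base with
  // accumulated offsets 4 and 12, so the check below passes and the
  // difference folds to the constant -8 (LHSOffset - RHSOffset).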
if (LHS != RHS) return nullptr; // Otherwise, the difference of LHS - RHS can be computed as: // LHS - RHS // = (LHSOffset + Base) - (RHSOffset + Base) // = LHSOffset - RHSOffset return ConstantExpr::getSub(LHSOffset, RHSOffset); } /// Given operands for a Sub, see if we can fold the result. /// If not, this returns null. static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q)) return C; // X - undef -> undef // undef - X -> undef if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return UndefValue::get(Op0->getType()); // X - 0 -> X if (match(Op1, m_Zero())) return Op0; // X - X -> 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // Is this a negation? if (match(Op0, m_Zero())) { // 0 - X -> 0 if the sub is NUW. if (isNUW) return Constant::getNullValue(Op0->getType()); KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (Known.Zero.isMaxSignedValue()) { // Op1 is either 0 or the minimum signed value. If the sub is NSW, then // Op1 must be 0 because negating the minimum signed value is undefined. if (isNSW) return Constant::getNullValue(Op0->getType()); // 0 - X -> X if X is 0 or the minimum signed value. return Op1; } } // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies. // For example, (X + Y) - Y -> X; (Y + X) - Y -> X Value *X = nullptr, *Y = nullptr, *Z = Op1; if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z // See if "V === Y - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1)) // It does! Now see if "X + V" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "Y + V" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } } // X - (Y + Z) -> (X - Y) - Z or (X - Z) - Y if everything simplifies. // For example, X - (X + 1) -> -1 X = Op0; if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z) // See if "V === X - Y" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "V - Z" simplifies. if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "V - Y" simplifies. if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } } // Z - (X - Y) -> (Z - X) + Y if everything simplifies. // For example, X - (X - Y) -> Y. Z = Op0; if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y) // See if "V === Z - X" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1)) // It does! Now see if "V + Y" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // trunc(X) - trunc(Y) -> trunc(X - Y) if everything simplifies. 
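  // For example (an illustrative sketch, names made up): with two separate
  // truncs of the same value
  //   %t0 = trunc i64 %a to i32
  //   %t1 = trunc i64 %a to i32
  //   %d  = sub i32 %t0, %t1
  // "X - Y" is "%a - %a", which simplifies to 0, and "trunc 0" simplifies to
  // 0, so %d folds to 0 even though %t0 and %t1 are distinct instructions.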
if (MaxRecurse && match(Op0, m_Trunc(m_Value(X))) && match(Op1, m_Trunc(m_Value(Y)))) if (X->getType() == Y->getType()) // See if "V === X - Y" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "trunc V" simplifies. if (Value *W = SimplifyCastInst(Instruction::Trunc, V, Op0->getType(), Q, MaxRecurse - 1)) // It does, return the simplified "trunc V". return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) if (Constant *Result = computePointerDifference(Q.DL, X, Y)) return ConstantExpr::getIntegerCast(Result, Op0->getType(), true); // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Threading Sub over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A - select(cond, B, C)" means evaluating // "A-B" and "A-C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A-B" and "A-C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for a Mul, see if we can fold the result. /// If not, this returns null. static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q)) return C; // X * undef -> 0 // X * 0 -> 0 if (match(Op1, m_CombineOr(m_Undef(), m_Zero()))) return Constant::getNullValue(Op0->getType()); // X * 1 -> X if (match(Op1, m_One())) return Op0; // (X / Y) * Y -> X if the division is exact. Value *X = nullptr; if (match(Op0, m_Exact(m_IDiv(m_Value(X), m_Specific(Op1)))) || // (X / Y) * Y match(Op1, m_Exact(m_IDiv(m_Value(X), m_Specific(Op0))))) // Y * (X / Y) return X; // i1 mul -> and. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // Mul distributes over Add. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit); } /// Check for common or similar folds of integer division or integer remainder. 
/// This applies to all 4 opcodes (sdiv/udiv/srem/urem). static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) { Type *Ty = Op0->getType(); // X / undef -> undef // X % undef -> undef if (match(Op1, m_Undef())) return Op1; // X / 0 -> undef // X % 0 -> undef // We don't need to preserve faults! if (match(Op1, m_Zero())) return UndefValue::get(Ty); // If any element of a constant divisor vector is zero or undef, the whole op // is undef. auto *Op1C = dyn_cast(Op1); if (Op1C && Ty->isVectorTy()) { unsigned NumElts = Ty->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = Op1C->getAggregateElement(i); if (Elt && (Elt->isNullValue() || isa(Elt))) return UndefValue::get(Ty); } } // undef / X -> 0 // undef % X -> 0 if (match(Op0, m_Undef())) return Constant::getNullValue(Ty); // 0 / X -> 0 // 0 % X -> 0 if (match(Op0, m_Zero())) return Constant::getNullValue(Op0->getType()); // X / X -> 1 // X % X -> 0 if (Op0 == Op1) return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty); // X / 1 -> X // X % 1 -> 0 // If this is a boolean op (single-bit element type), we can't have // division-by-zero or remainder-by-zero, so assume the divisor is 1. // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1. Value *X; if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1) || (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))) return IsDiv ? Op0 : Constant::getNullValue(Ty); return nullptr; } /// Given a predicate and two operands, return true if the comparison is true. /// This is a helper for div/rem simplification where we return some other value /// when we can prove a relationship between the operands. static bool isICmpTrue(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Value *V = SimplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse); Constant *C = dyn_cast_or_null(V); return (C && C->isAllOnesValue()); } /// Return true if we can simplify X / Y to 0. Remainder can adapt that answer /// to simplify X % Y to X. static bool isDivZero(Value *X, Value *Y, const SimplifyQuery &Q, unsigned MaxRecurse, bool IsSigned) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return false; if (IsSigned) { // |X| / |Y| --> 0 // // We require that 1 operand is a simple constant. That could be extended to // 2 variables if we computed the sign bit for each. // // Make sure that a constant is not the minimum signed value because taking // the abs() of that is undefined. Type *Ty = X->getType(); const APInt *C; if (match(X, m_APInt(C)) && !C->isMinSignedValue()) { // Is the variable divisor magnitude always greater than the constant // dividend magnitude? // |Y| > |C| --> Y < -abs(C) or Y > abs(C) Constant *PosDividendC = ConstantInt::get(Ty, C->abs()); Constant *NegDividendC = ConstantInt::get(Ty, -C->abs()); if (isICmpTrue(CmpInst::ICMP_SLT, Y, NegDividendC, Q, MaxRecurse) || isICmpTrue(CmpInst::ICMP_SGT, Y, PosDividendC, Q, MaxRecurse)) return true; } if (match(Y, m_APInt(C))) { // Special-case: we can't take the abs() of a minimum signed value. If // that's the divisor, then all we have to do is prove that the dividend // is also not the minimum signed value. if (C->isMinSignedValue()) return isICmpTrue(CmpInst::ICMP_NE, X, Y, Q, MaxRecurse); // Is the variable dividend magnitude always less than the constant // divisor magnitude? 
// |X| < |C| --> X > -abs(C) and X < abs(C) Constant *PosDivisorC = ConstantInt::get(Ty, C->abs()); Constant *NegDivisorC = ConstantInt::get(Ty, -C->abs()); if (isICmpTrue(CmpInst::ICMP_SGT, X, NegDivisorC, Q, MaxRecurse) && isICmpTrue(CmpInst::ICMP_SLT, X, PosDivisorC, Q, MaxRecurse)) return true; } return false; } // IsSigned == false. // Is the dividend unsigned less than the divisor? return isICmpTrue(ICmpInst::ICMP_ULT, X, Y, Q, MaxRecurse); } /// These are simplifications common to SDiv and UDiv. static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; if (Value *V = simplifyDivRem(Op0, Op1, true)) return V; bool IsSigned = Opcode == Instruction::SDiv; // (X * Y) / Y -> X if the multiplication does not overflow. Value *X; if (match(Op0, m_c_Mul(m_Value(X), m_Specific(Op1)))) { auto *Mul = cast(Op0); // If the Mul does not overflow, then we are good to go. if ((IsSigned && Mul->hasNoSignedWrap()) || (!IsSigned && Mul->hasNoUnsignedWrap())) return X; // If X has the form X = A / Y, then X * Y cannot overflow. if ((IsSigned && match(X, m_SDiv(m_Value(), m_Specific(Op1)))) || (!IsSigned && match(X, m_UDiv(m_Value(), m_Specific(Op1))))) return X; } // (X rem Y) / Y -> 0 if ((IsSigned && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) || (!IsSigned && match(Op0, m_URem(m_Value(), m_Specific(Op1))))) return Constant::getNullValue(Op0->getType()); // (X /u C1) /u C2 -> 0 if C1 * C2 overflow ConstantInt *C1, *C2; if (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_ConstantInt(C1))) && match(Op1, m_ConstantInt(C2))) { bool Overflow; (void)C1->getValue().umul_ov(C2->getValue(), Overflow); if (Overflow) return Constant::getNullValue(Op0->getType()); } // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; if (isDivZero(Op0, Op1, Q, MaxRecurse, IsSigned)) return Constant::getNullValue(Op0->getType()); return nullptr; } /// These are simplifications common to SRem and URem. static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; if (Value *V = simplifyDivRem(Op0, Op1, false)) return V; // (X % Y) % Y -> X % Y if ((Opcode == Instruction::SRem && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) || (Opcode == Instruction::URem && match(Op0, m_URem(m_Value(), m_Specific(Op1))))) return Op0; // (X << Y) % X -> 0 if ((Opcode == Instruction::SRem && match(Op0, m_NSWShl(m_Specific(Op1), m_Value()))) || (Opcode == Instruction::URem && match(Op0, m_NUWShl(m_Specific(Op1), m_Value())))) return Constant::getNullValue(Op0->getType()); // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. 
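  // For illustration (a sketch with made-up names): for
  //   %s = select i1 %c, i32 8, i32 12
  //   %r = urem i32 %s, 4
  // the two arms evaluate to "urem 8, 4" and "urem 12, 4", both of which fold
  // to 0, so threading over the select below returns 0 for %r.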
if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If X / Y == 0, then X % Y == X. if (isDivZero(Op0, Op1, Q, MaxRecurse, Opcode == Instruction::SRem)) return Op0; return nullptr; } /// Given operands for an SDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If two operands are negated and no signed overflow, return -1. if (isKnownNegation(Op0, Op1, /*NeedNSW=*/true)) return Constant::getAllOnesValue(Op0->getType()); return simplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a UDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for an SRem, see if we can fold the result. /// If not, this returns null. static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If the divisor is 0, the result is undefined, so assume the divisor is -1. // srem Op0, (sext i1 X) --> srem Op0, -1 --> 0 Value *X; if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return ConstantInt::getNullValue(Op0->getType()); // If the two operands are negated, return 0. if (isKnownNegation(Op0, Op1)) return ConstantInt::getNullValue(Op0->getType()); return simplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a URem, see if we can fold the result. /// If not, this returns null. static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit); } /// Returns true if a shift by \c Amount always yields undef. static bool isUndefShift(Value *Amount) { Constant *C = dyn_cast(Amount); if (!C) return false; // X shift by undef -> undef because it may shift by the bitwidth. if (isa(C)) return true; // Shifting by the bitwidth or more is undefined. if (ConstantInt *CI = dyn_cast(C)) if (CI->getValue().getLimitedValue() >= CI->getType()->getScalarSizeInBits()) return true; // If all lanes of a vector shift are undefined the whole shift is. if (isa(C) || isa(C)) { for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I) if (!isUndefShift(C->getAggregateElement(I))) return false; return true; } return false; } /// Given operands for an Shl, LShr or AShr, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; // 0 shift by X -> 0 if (match(Op0, m_Zero())) return Constant::getNullValue(Op0->getType()); // X shift by 0 -> X // Shift-by-sign-extended bool must be shift-by-0 because shift-by-all-ones // would be poison. Value *X; if (match(Op1, m_Zero()) || (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))) return Op0; // Fold undefined shifts. if (isUndefShift(Op1)) return UndefValue::get(Op0->getType()); // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If any bits in the shift amount make that value greater than or equal to // the number of bits in the type, the shift is undefined. KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (Known.One.getLimitedValue() >= Known.getBitWidth()) return UndefValue::get(Op0->getType()); // If all valid bits in the shift amount are known zero, the first operand is // unchanged. unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth()); if (Known.countMinTrailingZeros() >= NumValidShiftBits) return Op0; return nullptr; } /// Given operands for an Shl, LShr or AShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // X >> X -> 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // undef >> X -> 0 // undef >> X -> undef (if it's exact) if (match(Op0, m_Undef())) return isExact ? Op0 : Constant::getNullValue(Op0->getType()); // The low bit cannot be shifted out of an exact shift if it is set. if (isExact) { KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (Op0Known.One[0]) return Op0; } return nullptr; } /// Given operands for an Shl, see if we can fold the result. /// If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) return V; // undef << X -> 0 // undef << X -> undef if (if it's NSW/NUW) if (match(Op0, m_Undef())) return isNSW || isNUW ? Op0 : Constant::getNullValue(Op0->getType()); // (X >> A) << A -> X Value *X; if (match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1))))) return X; // shl nuw i8 C, %x -> C iff C has sign bit set. if (isNUW && match(Op0, m_Negative())) return Op0; // NOTE: could use computeKnownBits() / LazyValueInfo, // but the cost-benefit analysis suggests it isn't worth it. return nullptr; } Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for an LShr, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; // (X << A) >> A -> X Value *X; if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1)))) return X; // ((X << A) | Y) >> A -> X if effective width of Y is not larger than A. // We can return X as we do in the above case since OR alters no bits in X. // SimplifyDemandedBits in InstCombine can do more general optimization for // bit manipulation. This pattern aims to provide opportunities for other // optimizers by supporting a simple but common case in InstSimplify. Value *Y; const APInt *ShRAmt, *ShLAmt; if (match(Op1, m_APInt(ShRAmt)) && match(Op0, m_c_Or(m_NUWShl(m_Value(X), m_APInt(ShLAmt)), m_Value(Y))) && *ShRAmt == *ShLAmt) { const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned Width = Op0->getType()->getScalarSizeInBits(); const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros(); - if (EffWidthY <= ShRAmt->getZExtValue()) + if (ShRAmt->uge(EffWidthY)) return X; } return nullptr; } Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Given operands for an AShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; // all ones >>a X -> -1 // Do not return Op0 because it may contain undef elements if it's a vector. if (match(Op0, m_AllOnes())) return Constant::getAllOnesValue(Op0->getType()); // (X << A) >> A -> X Value *X; if (match(Op0, m_NSWShl(m_Value(X), m_Specific(Op1)))) return X; // Arithmetic shifting an all-sign-bit value is a no-op. unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (NumSignBits == Op0->getType()->getScalarSizeInBits()) return Op0; return nullptr; } Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, ICmpInst *UnsignedICmp, bool IsAnd) { Value *X, *Y; ICmpInst::Predicate EqPred; if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(Y), m_Zero())) || !ICmpInst::isEquality(EqPred)) return nullptr; ICmpInst::Predicate UnsignedPred; if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Value(X), m_Specific(Y))) && ICmpInst::isUnsigned(UnsignedPred)) ; else if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Specific(Y), m_Value(X))) && ICmpInst::isUnsigned(UnsignedPred)) UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); else return nullptr; // X < Y && Y != 0 --> X < Y // X < Y || Y != 0 --> Y != 0 if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE) return IsAnd ? 
UnsignedICmp : ZeroICmp; // X >= Y || Y != 0 --> true // X >= Y || Y == 0 --> X >= Y if (UnsignedPred == ICmpInst::ICMP_UGE && !IsAnd) { if (EqPred == ICmpInst::ICMP_NE) return getTrue(UnsignedICmp->getType()); return UnsignedICmp; } // X < Y && Y == 0 --> false if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ && IsAnd) return getFalse(UnsignedICmp->getType()); return nullptr; } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; Value *A ,*B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; // We have (icmp Pred0, A, B) & (icmp Pred1, A, B). // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we // can eliminate Op1 from this 'and'. if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1)) return Op0; // Check for any combination of predicates that are guaranteed to be disjoint. if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) || (Pred0 == ICmpInst::ICMP_EQ && ICmpInst::isFalseWhenEqual(Pred1)) || (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT) || (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT)) return getFalse(Op0->getType()); return nullptr; } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; Value *A ,*B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; // We have (icmp Pred0, A, B) | (icmp Pred1, A, B). // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we // can eliminate Op0 from this 'or'. if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1)) return Op1; // Check for any combination of predicates that cover the entire range of // possibilities. if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) || (Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) || (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) || (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE)) return getTrue(Op0->getType()); return nullptr; } /// Test if a pair of compares with a shared operand and 2 constants has an /// empty set intersection, full set union, or if one compare is a superset of /// the other. static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd) { // Look for this pattern: {and/or} (icmp X, C0), (icmp X, C1)). if (Cmp0->getOperand(0) != Cmp1->getOperand(0)) return nullptr; const APInt *C0, *C1; if (!match(Cmp0->getOperand(1), m_APInt(C0)) || !match(Cmp1->getOperand(1), m_APInt(C1))) return nullptr; auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0); auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1); // For and-of-compares, check if the intersection is empty: // (icmp X, C0) && (icmp X, C1) --> empty set --> false if (IsAnd && Range0.intersectWith(Range1).isEmptySet()) return getFalse(Cmp0->getType()); // For or-of-compares, check if the union is full: // (icmp X, C0) || (icmp X, C1) --> full set --> true if (!IsAnd && Range0.unionWith(Range1).isFullSet()) return getTrue(Cmp0->getType()); // Is one range a superset of the other? 
// If this is and-of-compares, take the smaller set: // (icmp sgt X, 4) && (icmp sgt X, 42) --> icmp sgt X, 42 // If this is or-of-compares, take the larger set: // (icmp sgt X, 4) || (icmp sgt X, 42) --> icmp sgt X, 4 if (Range0.contains(Range1)) return IsAnd ? Cmp1 : Cmp0; if (Range1.contains(Range0)) return IsAnd ? Cmp0 : Cmp1; return nullptr; } static Value *simplifyAndOrOfICmpsWithZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd) { ICmpInst::Predicate P0 = Cmp0->getPredicate(), P1 = Cmp1->getPredicate(); if (!match(Cmp0->getOperand(1), m_Zero()) || !match(Cmp1->getOperand(1), m_Zero()) || P0 != P1) return nullptr; if ((IsAnd && P0 != ICmpInst::ICMP_NE) || (!IsAnd && P1 != ICmpInst::ICMP_EQ)) return nullptr; // We have either "(X == 0 || Y == 0)" or "(X != 0 && Y != 0)". Value *X = Cmp0->getOperand(0); Value *Y = Cmp1->getOperand(0); // If one of the compares is a masked version of a (not) null check, then // that compare implies the other, so we eliminate the other. Optionally, look // through a pointer-to-int cast to match a null check of a pointer type. // (X == 0) || (([ptrtoint] X & ?) == 0) --> ([ptrtoint] X & ?) == 0 // (X == 0) || ((? & [ptrtoint] X) == 0) --> (? & [ptrtoint] X) == 0 // (X != 0) && (([ptrtoint] X & ?) != 0) --> ([ptrtoint] X & ?) != 0 // (X != 0) && ((? & [ptrtoint] X) != 0) --> (? & [ptrtoint] X) != 0 if (match(Y, m_c_And(m_Specific(X), m_Value())) || match(Y, m_c_And(m_PtrToInt(m_Specific(X)), m_Value()))) return Cmp1; // (([ptrtoint] Y & ?) == 0) || (Y == 0) --> ([ptrtoint] Y & ?) == 0 // ((? & [ptrtoint] Y) == 0) || (Y == 0) --> (? & [ptrtoint] Y) == 0 // (([ptrtoint] Y & ?) != 0) && (Y != 0) --> ([ptrtoint] Y & ?) != 0 // ((? & [ptrtoint] Y) != 0) && (Y != 0) --> (? & [ptrtoint] Y) != 0 if (match(X, m_c_And(m_Specific(Y), m_Value())) || match(X, m_c_And(m_PtrToInt(m_Specific(Y)), m_Value()))) return Cmp0; return nullptr; } static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) & (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1)))) return nullptr; if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value()))) return nullptr; auto *AddInst = cast(Op0->getOperand(0)); if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; Type *ITy = Op0->getType(); bool isNSW = AddInst->hasNoSignedWrap(); bool isNUW = AddInst->hasNoUnsignedWrap(); const APInt Delta = *C1 - *C0; if (C0->isStrictlyPositive()) { if (Delta == 2) { if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_SGT) return getFalse(ITy); if (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT && isNSW) return getFalse(ITy); } if (Delta == 1) { if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_SGT) return getFalse(ITy); if (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGT && isNSW) return getFalse(ITy); } } if (C0->getBoolValue() && isNUW) { if (Delta == 2) if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Delta == 1) if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGT) return getFalse(ITy); } return nullptr; } static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) return X; if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0)) return X; if (Value 
*X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) return X; if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0)) return X; return nullptr; } static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) | (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1)))) return nullptr; if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value()))) return nullptr; auto *AddInst = cast(Op0->getOperand(0)); if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; Type *ITy = Op0->getType(); bool isNSW = AddInst->hasNoSignedWrap(); bool isNUW = AddInst->hasNoUnsignedWrap(); const APInt Delta = *C1 - *C0; if (C0->isStrictlyPositive()) { if (Delta == 2) { if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE) return getTrue(ITy); if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW) return getTrue(ITy); } if (Delta == 1) { if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE) return getTrue(ITy); if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW) return getTrue(ITy); } } if (C0->getBoolValue() && isNUW) { if (Delta == 2) if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE) return getTrue(ITy); if (Delta == 1) if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE) return getTrue(ITy); } return nullptr; } static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) return X; if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false)) return X; if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) return X; if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0)) return X; if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false)) return X; if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false)) return X; if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1)) return X; if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0)) return X; return nullptr; } static Value *simplifyAndOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) { Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); if (LHS0->getType() != RHS0->getType()) return nullptr; FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) || (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) { // (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y // (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X // (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y // (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X // (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y // (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X // (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y // (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X if ((isKnownNeverNaN(LHS0) && (LHS1 == RHS0 || LHS1 == RHS1)) || (isKnownNeverNaN(LHS1) && (LHS0 == RHS0 || LHS0 == RHS1))) return RHS; // (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y // (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X // (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y // (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X // (fcmp uno X, 
Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y // (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X // (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y // (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X if ((isKnownNeverNaN(RHS0) && (RHS1 == LHS0 || RHS1 == LHS1)) || (isKnownNeverNaN(RHS1) && (RHS0 == LHS0 || RHS0 == LHS1))) return LHS; } return nullptr; } static Value *simplifyAndOrOfCmps(Value *Op0, Value *Op1, bool IsAnd) { // Look through casts of the 'and' operands to find compares. auto *Cast0 = dyn_cast(Op0); auto *Cast1 = dyn_cast(Op1); if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() && Cast0->getSrcTy() == Cast1->getSrcTy()) { Op0 = Cast0->getOperand(0); Op1 = Cast1->getOperand(0); } Value *V = nullptr; auto *ICmp0 = dyn_cast(Op0); auto *ICmp1 = dyn_cast(Op1); if (ICmp0 && ICmp1) V = IsAnd ? simplifyAndOfICmps(ICmp0, ICmp1) : simplifyOrOfICmps(ICmp0, ICmp1); auto *FCmp0 = dyn_cast(Op0); auto *FCmp1 = dyn_cast(Op1); if (FCmp0 && FCmp1) V = simplifyAndOrOfFCmps(FCmp0, FCmp1, IsAnd); if (!V) return nullptr; if (!Cast0) return V; // If we looked through casts, we can only handle a constant simplification // because we are not allowed to create a cast instruction here. if (auto *C = dyn_cast(V)) return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType()); return nullptr; } /// Given operands for an And, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q)) return C; // X & undef -> 0 if (match(Op1, m_Undef())) return Constant::getNullValue(Op0->getType()); // X & X = X if (Op0 == Op1) return Op0; // X & 0 = 0 if (match(Op1, m_Zero())) return Constant::getNullValue(Op0->getType()); // X & -1 = X if (match(Op1, m_AllOnes())) return Op0; // A & ~A = ~A & A = 0 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getNullValue(Op0->getType()); // (A | ?) & A = A if (match(Op0, m_c_Or(m_Specific(Op1), m_Value()))) return Op1; // A & (A | ?) = A if (match(Op1, m_c_Or(m_Specific(Op0), m_Value()))) return Op0; // A mask that only clears known zeros of a shifted value is a no-op. Value *X; const APInt *Mask; const APInt *ShAmt; if (match(Op1, m_APInt(Mask))) { // If all bits in the inverted and shifted mask are clear: // and (shl X, ShAmt), Mask --> shl X, ShAmt if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) && (~(*Mask)).lshr(*ShAmt).isNullValue()) return Op0; // If all bits in the inverted and shifted mask are clear: // and (lshr X, ShAmt), Mask --> lshr X, ShAmt if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) && (~(*Mask)).shl(*ShAmt).isNullValue()) return Op0; } // A & (-A) = A if A is a power of two or zero. if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { if (isKnownToBeAPowerOfTwo(Op0, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Op0; if (isKnownToBeAPowerOfTwo(Op1, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Op1; } if (Value *V = simplifyAndOrOfCmps(Op0, Op1, true)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // And distributes over Or. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Or, Q, MaxRecurse)) return V; // And distributes over Xor. Try some generic simplifications based on this. 
  if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Xor, Q,
                             MaxRecurse))
    return V;

  // If the operation is with the result of a select instruction, check whether
  // operating on either branch of the select always yields the same value.
  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
    if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q,
                                         MaxRecurse))
      return V;

  // If the operation is with the result of a phi instruction, check whether
  // operating on all incoming values of the phi always yields the same value.
  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
    if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q,
                                      MaxRecurse))
      return V;

  // Assuming the effective width of Y is not larger than A, i.e. all bits
  // from X and Y are disjoint in (X << A) | Y,
  // if the mask of this AND op covers all bits of X or Y, while it covers
  // no bits from the other, we can bypass this AND op. E.g.,
  // ((X << A) | Y) & Mask -> Y,
  //     if Mask = ((1 << effective_width_of(Y)) - 1)
  // ((X << A) | Y) & Mask -> X << A,
  //     if Mask = ((1 << effective_width_of(X)) - 1) << A
  // SimplifyDemandedBits in InstCombine can optimize the general case.
  // This pattern aims to help other passes for a common case.
  Value *Y, *XShifted;
  if (match(Op1, m_APInt(Mask)) &&
      match(Op0, m_c_Or(m_CombineAnd(m_NUWShl(m_Value(X), m_APInt(ShAmt)),
                                     m_Value(XShifted)),
                        m_Value(Y)))) {
-    const unsigned ShftCnt = ShAmt->getZExtValue();
-    const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
    const unsigned Width = Op0->getType()->getScalarSizeInBits();
+    const unsigned ShftCnt = ShAmt->getLimitedValue(Width);
+    const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
    const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros();
    if (EffWidthY <= ShftCnt) {
      const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI,
                                                Q.DT);
      const unsigned EffWidthX = Width - XKnown.countMinLeadingZeros();
      const APInt EffBitsY = APInt::getLowBitsSet(Width, EffWidthY);
      const APInt EffBitsX = APInt::getLowBitsSet(Width, EffWidthX) << ShftCnt;
      // If the mask is extracting all bits from X or Y as is, we can skip
      // this AND op.
      if (EffBitsY.isSubsetOf(*Mask) && !EffBitsX.intersects(*Mask))
        return Y;
      if (EffBitsX.isSubsetOf(*Mask) && !EffBitsY.intersects(*Mask))
        return XShifted;
    }
  }

  return nullptr;
}

Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
  return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit);
}

/// Given operands for an Or, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
                             unsigned MaxRecurse) {
  if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q))
    return C;

  // X | undef -> -1
  // X | -1 = -1
  // Do not return Op1 because it may contain undef elements if it's a vector.
  if (match(Op1, m_Undef()) || match(Op1, m_AllOnes()))
    return Constant::getAllOnesValue(Op0->getType());

  // X | X = X
  // X | 0 = X
  if (Op0 == Op1 || match(Op1, m_Zero()))
    return Op0;

  // A | ~A = ~A | A = -1
  if (match(Op0, m_Not(m_Specific(Op1))) ||
      match(Op1, m_Not(m_Specific(Op0))))
    return Constant::getAllOnesValue(Op0->getType());

  // (A & ?) | A = A
  if (match(Op0, m_c_And(m_Specific(Op1), m_Value())))
    return Op1;

  // A | (A & ?) = A
  if (match(Op1, m_c_And(m_Specific(Op0), m_Value())))
    return Op0;

  // ~(A & ?) | A = -1
  if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value()))))
    return Constant::getAllOnesValue(Op1->getType());

  // A | ~(A & ?)
= -1 if (match(Op1, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op0->getType()); Value *A, *B; // (A & ~B) | (A ^ B) -> (A ^ B) // (~B & A) | (A ^ B) -> (A ^ B) // (A & ~B) | (B ^ A) -> (B ^ A) // (~B & A) | (B ^ A) -> (B ^ A) if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op1; // Commute the 'or' operands. // (A ^ B) | (A & ~B) -> (A ^ B) // (A ^ B) | (~B & A) -> (A ^ B) // (B ^ A) | (A & ~B) -> (B ^ A) // (B ^ A) | (~B & A) -> (B ^ A) if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; // (A & B) | (~A ^ B) -> (~A ^ B) // (B & A) | (~A ^ B) -> (~A ^ B) // (A & B) | (B ^ ~A) -> (B ^ ~A) // (B & A) | (B ^ ~A) -> (B ^ ~A) if (match(Op0, m_And(m_Value(A), m_Value(B))) && (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) return Op1; // (~A ^ B) | (A & B) -> (~A ^ B) // (~A ^ B) | (B & A) -> (~A ^ B) // (B ^ ~A) | (A & B) -> (B ^ ~A) // (B ^ ~A) | (B & A) -> (B ^ ~A) if (match(Op1, m_And(m_Value(A), m_Value(B))) && (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; if (Value *V = simplifyAndOrOfCmps(Op0, Op1, false)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // Or distributes over And. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // (A & C1)|(B & C2) const APInt *C1, *C2; if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && match(Op1, m_And(m_Value(B), m_APInt(C2)))) { if (*C1 == ~*C2) { // (A & C1)|(B & C2) // If we have: ((V + N) & C1) | (V & C2) // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 // replace with V+N. Value *N; if (C2->isMask() && // C2 == 0+1+ match(A, m_c_Add(m_Specific(B), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return A; } // Or commutes, try both ways. if (C1->isMask() && match(B, m_c_Add(m_Specific(A), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return B; } } } // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a Xor, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q)) return C; // A ^ undef -> undef if (match(Op1, m_Undef())) return Op1; // A ^ 0 = A if (match(Op1, m_Zero())) return Op0; // A ^ A = 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // A ^ ~A = ~A ^ A = -1 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q, MaxRecurse)) return V; // Threading Xor over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A ^ select(cond, B, C)" means evaluating // "A^B" and "A^C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A^B" and "A^C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit); } static Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } /// Rummage around inside V looking for something equivalent to the comparison /// "LHS Pred RHS". Return such a value if found, otherwise return null. /// Helper function for analyzing max/min idioms. static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast(V); if (!SI) return nullptr; CmpInst *Cmp = dyn_cast(SI->getCondition()); if (!Cmp) return nullptr; Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1); if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS) return Cmp; if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) && LHS == CmpRHS && RHS == CmpLHS) return Cmp; return nullptr; } // A significant optimization not implemented here is assuming that alloca // addresses are not equal to incoming argument values. They don't *alias*, // as we say, but that doesn't mean they aren't equal, so we take a // conservative approach. // // This is inspired in part by C++11 5.10p1: // "Two pointers of the same type compare equal if and only if they are both // null, both point to the same function, or both represent the same // address." // // This is pretty permissive. // // It's also partly due to C11 6.5.9p6: // "Two pointers compare equal if and only if both are null pointers, both are // pointers to the same object (including a pointer to an object and a // subobject at its beginning) or function, both are pointers to one past the // last element of the same array object, or one is a pointer to one past the // end of one array object and the other is a pointer to the start of a // different array object that happens to immediately follow the first array // object in the address space.) // // C11's version is more restrictive, however there's no reason why an argument // couldn't be a one-past-the-end value for a stack object in the caller and be // equal to the beginning of a stack object in the callee. 
// // If the C and C++ standards are ever made sufficiently restrictive in this // area, it may be possible to update LLVM's semantics accordingly and reinstate // this optimization. static Constant * computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, CmpInst::Predicate Pred, AssumptionCache *AC, const Instruction *CxtI, Value *LHS, Value *RHS) { // First, skip past any trivial no-ops. LHS = LHS->stripPointerCasts(); RHS = RHS->stripPointerCasts(); // A non-null pointer is not equal to a null pointer. if (llvm::isKnownNonZero(LHS, DL) && isa(RHS) && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // We can only fold certain predicates on pointer comparisons. switch (Pred) { default: return nullptr; // Equality comaprisons are easy to fold. case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: break; // We can only handle unsigned relational comparisons because 'inbounds' on // a GEP only protects against unsigned wrapping. case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: // However, we have to switch them to their signed variants to handle // negative indices from the base pointer. Pred = ICmpInst::getSignedPredicate(Pred); break; } // Strip off any constant offsets so that we can reason about them. // It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets // here and compare base addresses like AliasAnalysis does, however there are // numerous hazards. AliasAnalysis and its utilities rely on special rules // governing loads and stores which don't apply to icmps. Also, AliasAnalysis // doesn't need to guarantee pointer inequality when it says NoAlias. Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are related via constant offsets to the same base // value, we can replace it with an icmp which just compares the offsets. if (LHS == RHS) return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); // Various optimizations for (in)equality comparisons. if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { // Different non-empty allocations that exist at the same time have // different addresses (if the program can tell). Global variables always // exist, so they always exist during the lifetime of each other and all // allocas. Two different allocas usually have different addresses... // // However, if there's an @llvm.stackrestore dynamically in between two // allocas, they may have the same address. It's tempting to reduce the // scope of the problem by only looking at *static* allocas here. That would // cover the majority of allocas while significantly reducing the likelihood // of having an @llvm.stackrestore pop up in the middle. However, it's not // actually impossible for an @llvm.stackrestore to pop up in the middle of // an entry block. Also, if we have a block that's not attached to a // function, we can't tell if it's "static" under the current definition. // Theoretically, this problem could be fixed by creating a new kind of // instruction kind specifically for static allocas. Such a new instruction // could be required to be at the top of the entry block, thus preventing it // from being subject to a @llvm.stackrestore. Instcombine could even // convert regular allocas into these special allocas. It'd be nifty. // However, until then, this problem remains open. 
// // So, we'll assume that two non-empty allocas have different addresses // for now. // // With all that, if the offsets are within the bounds of their allocations // (and not one-past-the-end! so we can't use inbounds!), and their // allocations aren't the same, the pointers are not equal. // // Note that it's not necessary to check for LHS being a global variable // address, due to canonicalization and constant folding. if (isa(LHS) && (isa(RHS) || isa(RHS))) { ConstantInt *LHSOffsetCI = dyn_cast(LHSOffset); ConstantInt *RHSOffsetCI = dyn_cast(RHSOffset); uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; Opts.NullIsUnknownSize = NullPointerIsDefined(cast(LHS)->getFunction()); if (LHSOffsetCI && RHSOffsetCI && getObjectSize(LHS, LHSSize, DL, TLI, Opts) && getObjectSize(RHS, RHSSize, DL, TLI, Opts)) { const APInt &LHSOffsetValue = LHSOffsetCI->getValue(); const APInt &RHSOffsetValue = RHSOffsetCI->getValue(); if (!LHSOffsetValue.isNegative() && !RHSOffsetValue.isNegative() && LHSOffsetValue.ult(LHSSize) && RHSOffsetValue.ult(RHSSize)) { return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } } // Repeat the above check but this time without depending on DataLayout // or being able to compute a precise size. if (!cast(LHS->getType())->isEmptyTy() && !cast(RHS->getType())->isEmptyTy() && LHSOffset->isNullValue() && RHSOffset->isNullValue()) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } // Even if an non-inbounds GEP occurs along the path we can still optimize // equality comparisons concerning the result. We avoid walking the whole // chain again by starting where the last calls to // stripAndComputeConstantOffsets left off and accumulate the offsets. Constant *LHSNoBound = stripAndComputeConstantOffsets(DL, LHS, true); Constant *RHSNoBound = stripAndComputeConstantOffsets(DL, RHS, true); if (LHS == RHS) return ConstantExpr::getICmp(Pred, ConstantExpr::getAdd(LHSOffset, LHSNoBound), ConstantExpr::getAdd(RHSOffset, RHSNoBound)); // If one side of the equality comparison must come from a noalias call // (meaning a system memory allocation function), and the other side must // come from a pointer that cannot overlap with dynamically-allocated // memory within the lifetime of the current function (allocas, byval // arguments, globals), then determine the comparison result here. SmallVector LHSUObjs, RHSUObjs; GetUnderlyingObjects(LHS, LHSUObjs, DL); GetUnderlyingObjects(RHS, RHSUObjs, DL); // Is the set of underlying objects all noalias calls? auto IsNAC = [](ArrayRef Objects) { return all_of(Objects, isNoAliasCall); }; // Is the set of underlying objects all things which must be disjoint from // noalias calls. For allocas, we consider only static ones (dynamic // allocas might be transformed into calls to malloc not simultaneously // live with the compared-to allocation). For globals, we exclude symbols // that might be resolve lazily to symbols in another dynamically-loaded // library (and, thus, could be malloc'ed by the implementation). 
auto IsAllocDisjoint = [](ArrayRef Objects) { return all_of(Objects, [](Value *V) { if (const AllocaInst *AI = dyn_cast(V)) return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); if (const GlobalValue *GV = dyn_cast(V)) return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && !GV->isThreadLocal(); if (const Argument *A = dyn_cast(V)) return A->hasByValAttr(); return false; }); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || (IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs))) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // Fold comparisons for non-escaping pointer even if the allocation call // cannot be elided. We cannot fold malloc comparison to null. Also, the // dynamic allocation call could be either of the operands. Value *MI = nullptr; if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, DL, 0, nullptr, CxtI, DT)) MI = LHS; else if (isAllocLikeFn(RHS, TLI) && llvm::isKnownNonZero(LHS, DL, 0, nullptr, CxtI, DT)) MI = RHS; // FIXME: We should also fold the compare when the pointer escapes, but the // compare dominates the pointer escape if (MI && !PointerMayBeCaptured(MI, true, true)) return ConstantInt::get(GetCompareTy(LHS), CmpInst::isFalseWhenEqual(Pred)); } // Otherwise, fail. return nullptr; } /// Fold an icmp when its operands have i1 scalar type. static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { Type *ITy = GetCompareTy(LHS); // The return type. Type *OpTy = LHS->getType(); // The operand type. if (!OpTy->isIntOrIntVectorTy(1)) return nullptr; // A boolean compared to true/false can be simplified in 14 out of the 20 // (10 predicates * 2 constants) possible combinations. Cases not handled here // require a 'not' of the LHS, so those must be transformed in InstCombine. if (match(RHS, m_Zero())) { switch (Pred) { case CmpInst::ICMP_NE: // X != 0 -> X case CmpInst::ICMP_UGT: // X >u 0 -> X case CmpInst::ICMP_SLT: // X X return LHS; case CmpInst::ICMP_ULT: // X false case CmpInst::ICMP_SGT: // X >s 0 -> false return getFalse(ITy); case CmpInst::ICMP_UGE: // X >=u 0 -> true case CmpInst::ICMP_SLE: // X <=s 0 -> true return getTrue(ITy); default: break; } } else if (match(RHS, m_One())) { switch (Pred) { case CmpInst::ICMP_EQ: // X == 1 -> X case CmpInst::ICMP_UGE: // X >=u 1 -> X case CmpInst::ICMP_SLE: // X <=s -1 -> X return LHS; case CmpInst::ICMP_UGT: // X >u 1 -> false case CmpInst::ICMP_SLT: // X false return getFalse(ITy); case CmpInst::ICMP_ULE: // X <=u 1 -> true case CmpInst::ICMP_SGE: // X >=s -1 -> true return getTrue(ITy); default: break; } } switch (Pred) { default: break; case ICmpInst::ICMP_UGE: if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; case ICmpInst::ICMP_SGE: /// For signed comparison, the values for an i1 are 0 and -1 /// respectively. This maps into a truth table of: /// LHS | RHS | LHS >=s RHS | LHS implies RHS /// 0 | 0 | 1 (0 >= 0) | 1 /// 0 | 1 | 1 (0 >= -1) | 1 /// 1 | 0 | 0 (-1 >= 0) | 0 /// 1 | 1 | 1 (-1 >= -1) | 1 if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; case ICmpInst::ICMP_ULE: if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; } return nullptr; } /// Try hard to fold icmp with zero RHS because this is a common case. 
static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { if (!match(RHS, m_Zero())) return nullptr; Type *ITy = GetCompareTy(LHS); // The return type. switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_ULT: return getFalse(ITy); case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; case ICmpInst::ICMP_SLT: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getTrue(ITy); if (LHSKnown.isNonNegative()) return getFalse(ITy); break; } case ICmpInst::ICMP_SLE: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getTrue(ITy); if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; } case ICmpInst::ICMP_SGE: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getFalse(ITy); if (LHSKnown.isNonNegative()) return getTrue(ITy); break; } case ICmpInst::ICMP_SGT: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getFalse(ITy); if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; } } return nullptr; } /// Many binary operators with a constant operand have an easy-to-compute /// range of outputs. This can be used to fold a comparison to always true or /// always false. static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper) { unsigned Width = Lower.getBitWidth(); const APInt *C; switch (BO.getOpcode()) { case Instruction::Add: if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) { // FIXME: If we have both nuw and nsw, we should reduce the range further. if (BO.hasNoUnsignedWrap()) { // 'add nuw x, C' produces [C, UINT_MAX]. Lower = *C; } else if (BO.hasNoSignedWrap()) { if (C->isNegative()) { // 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C]. Lower = APInt::getSignedMinValue(Width); Upper = APInt::getSignedMaxValue(Width) + *C + 1; } else { // 'add nsw x, +C' produces [SINT_MIN + C, SINT_MAX]. Lower = APInt::getSignedMinValue(Width) + *C; Upper = APInt::getSignedMaxValue(Width) + 1; } } } break; case Instruction::And: if (match(BO.getOperand(1), m_APInt(C))) // 'and x, C' produces [0, C]. Upper = *C + 1; break; case Instruction::Or: if (match(BO.getOperand(1), m_APInt(C))) // 'or x, C' produces [C, UINT_MAX]. Lower = *C; break; case Instruction::AShr: if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) { // 'ashr x, C' produces [INT_MIN >> C, INT_MAX >> C]. Lower = APInt::getSignedMinValue(Width).ashr(*C); Upper = APInt::getSignedMaxValue(Width).ashr(*C) + 1; } else if (match(BO.getOperand(0), m_APInt(C))) { unsigned ShiftAmount = Width - 1; if (!C->isNullValue() && BO.isExact()) ShiftAmount = C->countTrailingZeros(); if (C->isNegative()) { // 'ashr C, x' produces [C, C >> (Width-1)] Lower = *C; Upper = C->ashr(ShiftAmount) + 1; } else { // 'ashr C, x' produces [C >> (Width-1), C] Lower = C->ashr(ShiftAmount); Upper = *C + 1; } } break; case Instruction::LShr: if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) { // 'lshr x, C' produces [0, UINT_MAX >> C]. 
Upper = APInt::getAllOnesValue(Width).lshr(*C) + 1; } else if (match(BO.getOperand(0), m_APInt(C))) { // 'lshr C, x' produces [C >> (Width-1), C]. unsigned ShiftAmount = Width - 1; if (!C->isNullValue() && BO.isExact()) ShiftAmount = C->countTrailingZeros(); Lower = C->lshr(ShiftAmount); Upper = *C + 1; } break; case Instruction::Shl: if (match(BO.getOperand(0), m_APInt(C))) { if (BO.hasNoUnsignedWrap()) { // 'shl nuw C, x' produces [C, C << CLZ(C)] Lower = *C; Upper = Lower.shl(Lower.countLeadingZeros()) + 1; } else if (BO.hasNoSignedWrap()) { // TODO: What if both nuw+nsw? if (C->isNegative()) { // 'shl nsw C, x' produces [C << CLO(C)-1, C] unsigned ShiftAmount = C->countLeadingOnes() - 1; Lower = C->shl(ShiftAmount); Upper = *C + 1; } else { // 'shl nsw C, x' produces [C, C << CLZ(C)-1] unsigned ShiftAmount = C->countLeadingZeros() - 1; Lower = *C; Upper = C->shl(ShiftAmount) + 1; } } } break; case Instruction::SDiv: if (match(BO.getOperand(1), m_APInt(C))) { APInt IntMin = APInt::getSignedMinValue(Width); APInt IntMax = APInt::getSignedMaxValue(Width); if (C->isAllOnesValue()) { // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX] // where C != -1 and C != 0 and C != 1 Lower = IntMin + 1; Upper = IntMax + 1; } else if (C->countLeadingZeros() < Width - 1) { // 'sdiv x, C' produces [INT_MIN / C, INT_MAX / C] // where C != -1 and C != 0 and C != 1 Lower = IntMin.sdiv(*C); Upper = IntMax.sdiv(*C); if (Lower.sgt(Upper)) std::swap(Lower, Upper); Upper = Upper + 1; assert(Upper != Lower && "Upper part of range has wrapped!"); } } else if (match(BO.getOperand(0), m_APInt(C))) { if (C->isMinSignedValue()) { // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2]. Lower = *C; Upper = Lower.lshr(1) + 1; } else { // 'sdiv C, x' produces [-|C|, |C|]. Upper = C->abs() + 1; Lower = (-Upper) + 1; } } break; case Instruction::UDiv: if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) { // 'udiv x, C' produces [0, UINT_MAX / C]. Upper = APInt::getMaxValue(Width).udiv(*C) + 1; } else if (match(BO.getOperand(0), m_APInt(C))) { // 'udiv C, x' produces [0, C]. Upper = *C + 1; } break; case Instruction::SRem: if (match(BO.getOperand(1), m_APInt(C))) { // 'srem x, C' produces (-|C|, |C|). Upper = C->abs(); Lower = (-Upper) + 1; } break; case Instruction::URem: if (match(BO.getOperand(1), m_APInt(C))) // 'urem x, C' produces [0, C). Upper = *C; break; default: break; } } static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, Value *RHS) { Type *ITy = GetCompareTy(RHS); // The return type. Value *X; // Sign-bit checks can be optimized to true/false after unsigned // floating-point casts: // icmp slt (bitcast (uitofp X)), 0 --> false // icmp sgt (bitcast (uitofp X)), -1 --> true if (match(LHS, m_BitCast(m_UIToFP(m_Value(X))))) { if (Pred == ICmpInst::ICMP_SLT && match(RHS, m_Zero())) return ConstantInt::getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT && match(RHS, m_AllOnes())) return ConstantInt::getTrue(ITy); } const APInt *C; if (!match(RHS, m_APInt(C))) return nullptr; // Rule out tautological comparisons (eg., ult 0 or uge 0). ConstantRange RHS_CR = ConstantRange::makeExactICmpRegion(Pred, *C); if (RHS_CR.isEmptySet()) return ConstantInt::getFalse(ITy); if (RHS_CR.isFullSet()) return ConstantInt::getTrue(ITy); // Find the range of possible values for binary operators. unsigned Width = C->getBitWidth(); APInt Lower = APInt(Width, 0); APInt Upper = APInt(Width, 0); if (auto *BO = dyn_cast(LHS)) setLimitsForBinOp(*BO, Lower, Upper); ConstantRange LHS_CR = Lower != Upper ? 
ConstantRange(Lower, Upper) : ConstantRange(Width, true); if (auto *I = dyn_cast(LHS)) if (auto *Ranges = I->getMetadata(LLVMContext::MD_range)) LHS_CR = LHS_CR.intersectWith(getConstantRangeFromMetadata(*Ranges)); if (!LHS_CR.isFullSet()) { if (RHS_CR.contains(LHS_CR)) return ConstantInt::getTrue(ITy); if (RHS_CR.inverse().contains(LHS_CR)) return ConstantInt::getFalse(ITy); } return nullptr; } /// TODO: A large part of this logic is duplicated in InstCombine's /// foldICmpBinOp(). We should be able to share that and avoid the code /// duplication. static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. BinaryOperator *LBO = dyn_cast(LHS); BinaryOperator *RBO = dyn_cast(RHS); if (MaxRecurse && (LBO || RBO)) { // Analyze the case when either LHS or RHS is an add instruction. Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // LHS = A + B (or A and B are null); RHS = C + D (or C and D are null). bool NoLHSWrapProblem = false, NoRHSWrapProblem = false; if (LBO && LBO->getOpcode() == Instruction::Add) { A = LBO->getOperand(0); B = LBO->getOperand(1); NoLHSWrapProblem = ICmpInst::isEquality(Pred) || (CmpInst::isUnsigned(Pred) && LBO->hasNoUnsignedWrap()) || (CmpInst::isSigned(Pred) && LBO->hasNoSignedWrap()); } if (RBO && RBO->getOpcode() == Instruction::Add) { C = RBO->getOperand(0); D = RBO->getOperand(1); NoRHSWrapProblem = ICmpInst::isEquality(Pred) || (CmpInst::isUnsigned(Pred) && RBO->hasNoUnsignedWrap()) || (CmpInst::isSigned(Pred) && RBO->hasNoSignedWrap()); } // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. if ((A == RHS || B == RHS) && NoLHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A, Constant::getNullValue(RHS->getType()), Q, MaxRecurse - 1)) return V; // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. if ((C == LHS || D == LHS) && NoRHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()), C == LHS ? D : C, Q, MaxRecurse - 1)) return V; // icmp (X+Y), (X+Z) -> icmp Y,Z for equalities or if there is no overflow. if (A && C && (A == C || A == D || B == C || B == D) && NoLHSWrapProblem && NoRHSWrapProblem) { // Determine Y and Z in the form icmp (X+Y), (X+Z). Value *Y, *Z; if (A == C) { // C + B == C + D -> B == D Y = B; Z = D; } else if (A == D) { // D + B == C + D -> B == C Y = B; Z = C; } else if (B == C) { // A + C == C + D -> A == D Y = A; Z = D; } else { assert(B == D); // A + D == C + D -> A == C Y = A; Z = C; } if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse - 1)) return V; } } { Value *Y = nullptr; // icmp pred (or X, Y), X if (LBO && match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (RHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); if (RHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SLT ? 
getFalse(ITy) : getTrue(ITy); } } // icmp pred X, (or X, Y) if (RBO && match(RBO, m_c_Or(m_Value(Y), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy); if (LHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy); } } } // icmp pred (and X, Y), X if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // icmp pred X, (and X, Y) if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); } // 0 - (zext X) pred C if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) { if (ConstantInt *RHSC = dyn_cast(RHS)) { if (RHSC->getValue().isStrictlyPositive()) { if (Pred == ICmpInst::ICMP_SLT) return ConstantInt::getTrue(RHSC->getContext()); if (Pred == ICmpInst::ICMP_SGE) return ConstantInt::getFalse(RHSC->getContext()); if (Pred == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(RHSC->getContext()); if (Pred == ICmpInst::ICMP_NE) return ConstantInt::getTrue(RHSC->getContext()); } if (RHSC->getValue().isNonNegative()) { if (Pred == ICmpInst::ICMP_SLE) return ConstantInt::getTrue(RHSC->getContext()); if (Pred == ICmpInst::ICMP_SGT) return ConstantInt::getFalse(RHSC->getContext()); } } } // icmp pred (urem X, Y), Y if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { switch (Pred) { default: break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: { KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getFalse(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return getTrue(ITy); } } // icmp pred X, (urem Y, X) if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { switch (Pred) { default: break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: { KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return getFalse(ITy); } } // x >> y <=u x // x udiv y <=u x. if (LBO && (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || match(LBO, m_UDiv(m_Specific(RHS), m_Value())))) { // icmp pred (X op Y), X if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // x >=u x >> y // x >=u x udiv y. 
if (RBO && (match(RBO, m_LShr(m_Specific(LHS), m_Value())) || match(RBO, m_UDiv(m_Specific(LHS), m_Value())))) { // icmp pred X, (X op Y) if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); } // handle: // CI2 << X == CI // CI2 << X != CI // // where CI2 is a power of 2 and CI isn't if (auto *CI = dyn_cast(RHS)) { const APInt *CI2Val, *CIVal = &CI->getValue(); if (LBO && match(LBO, m_Shl(m_APInt(CI2Val), m_Value())) && CI2Val->isPowerOf2()) { if (!CIVal->isPowerOf2()) { // CI2 << X can equal zero in some circumstances, // this simplification is unsafe if CI is zero. // // We know it is safe if: // - The shift is nsw, we can't shift out the one bit. // - The shift is nuw, we can't shift out the one bit. // - CI2 is one // - CI isn't zero if (LBO->hasNoSignedWrap() || LBO->hasNoUnsignedWrap() || CI2Val->isOneValue() || !CI->isZero()) { if (Pred == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(RHS->getContext()); if (Pred == ICmpInst::ICMP_NE) return ConstantInt::getTrue(RHS->getContext()); } } if (CIVal->isSignMask() && CI2Val->isOneValue()) { if (Pred == ICmpInst::ICMP_UGT) return ConstantInt::getFalse(RHS->getContext()); if (Pred == ICmpInst::ICMP_ULE) return ConstantInt::getTrue(RHS->getContext()); } } } if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() && LBO->getOperand(1) == RBO->getOperand(1)) { switch (LBO->getOpcode()) { default: break; case Instruction::UDiv: case Instruction::LShr: if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::SDiv: if (!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::AShr: if (!LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::Shl: { bool NUW = LBO->hasNoUnsignedWrap() && RBO->hasNoUnsignedWrap(); bool NSW = LBO->hasNoSignedWrap() && RBO->hasNoSignedWrap(); if (!NUW && !NSW) break; if (!NSW && ICmpInst::isSigned(Pred)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; } } } return nullptr; } /// Simplify integer comparisons where at least one operand of the compare /// matches an integer min/max idiom. static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. Value *A, *B; CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE; CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B". // Signed variants on "max(a,b)>=a -> true". if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // smax(A, B) pred A. EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". // We analyze this as smax(A, B) pred A. P = Pred; } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred smax(A, B). EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". // We analyze this as smax(A, B) swapped-pred A. 
P = CmpInst::getSwappedPredicate(Pred); } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // smin(A, B) pred A. EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". // We analyze this as smax(-A, -B) swapped-pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = CmpInst::getSwappedPredicate(Pred); } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred smin(A, B). EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". // We analyze this as smax(-A, -B) pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = Pred; } if (P != CmpInst::BAD_ICMP_PREDICATE) { // Cases correspond to "max(A, B) p A". switch (P) { default: break; case CmpInst::ICMP_EQ: case CmpInst::ICMP_SLE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: case CmpInst::ICMP_SGT: { CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } case CmpInst::ICMP_SGE: // Always true. return getTrue(ITy); case CmpInst::ICMP_SLT: // Always false. return getFalse(ITy); } } // Unsigned variants on "max(a,b)>=a -> true". P = CmpInst::BAD_ICMP_PREDICATE; if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // umax(A, B) pred A. EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". // We analyze this as umax(A, B) pred A. P = Pred; } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred umax(A, B). EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". // We analyze this as umax(A, B) swapped-pred A. P = CmpInst::getSwappedPredicate(Pred); } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // umin(A, B) pred A. EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". // We analyze this as umax(-A, -B) swapped-pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = CmpInst::getSwappedPredicate(Pred); } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred umin(A, B). EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". // We analyze this as umax(-A, -B) pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = Pred; } if (P != CmpInst::BAD_ICMP_PREDICATE) { // Cases correspond to "max(A, B) p A". switch (P) { default: break; case CmpInst::ICMP_EQ: case CmpInst::ICMP_ULE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. 
if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT: { CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } case CmpInst::ICMP_UGE: // Always true. return getTrue(ITy); case CmpInst::ICMP_ULT: // Always false. return getFalse(ITy); } } // Variants on "max(x,y) >= min(x,z)". Value *C, *D; if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && match(RHS, m_SMin(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // max(x, ?) pred min(x, ?). if (Pred == CmpInst::ICMP_SGE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_SLT) // Always false. return getFalse(ITy); } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && match(RHS, m_SMax(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // min(x, ?) pred max(x, ?). if (Pred == CmpInst::ICMP_SLE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_SGT) // Always false. return getFalse(ITy); } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && match(RHS, m_UMin(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // max(x, ?) pred min(x, ?). if (Pred == CmpInst::ICMP_UGE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_ULT) // Always false. return getFalse(ITy); } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && match(RHS, m_UMax(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // min(x, ?) pred max(x, ?). if (Pred == CmpInst::ICMP_ULE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_UGT) // Always false. return getFalse(ITy); } return nullptr; } /// Given operands for an ICmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } Type *ITy = GetCompareTy(LHS); // The return type. // icmp X, X -> true/false // icmp X, undef -> true/false because undef could be X. if (LHS == RHS || isa(RHS)) return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred)); if (Value *V = simplifyICmpOfBools(Pred, LHS, RHS, Q)) return V; if (Value *V = simplifyICmpWithZero(Pred, LHS, RHS, Q)) return V; if (Value *V = simplifyICmpWithConstant(Pred, LHS, RHS)) return V; // If both operands have range metadata, use the metadata // to simplify the comparison. 
if (isa(RHS) && isa(LHS)) { auto RHS_Instr = cast(RHS); auto LHS_Instr = cast(LHS); if (RHS_Instr->getMetadata(LLVMContext::MD_range) && LHS_Instr->getMetadata(LLVMContext::MD_range)) { auto RHS_CR = getConstantRangeFromMetadata( *RHS_Instr->getMetadata(LLVMContext::MD_range)); auto LHS_CR = getConstantRangeFromMetadata( *LHS_Instr->getMetadata(LLVMContext::MD_range)); auto Satisfied_CR = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS_CR); if (Satisfied_CR.contains(LHS_CR)) return ConstantInt::getTrue(RHS->getContext()); auto InversedSatisfied_CR = ConstantRange::makeSatisfyingICmpRegion( CmpInst::getInversePredicate(Pred), RHS_CR); if (InversedSatisfied_CR.contains(LHS_CR)) return ConstantInt::getFalse(RHS->getContext()); } } // Compare of cast, for example (zext X) != 0 -> X != 0 if (isa(LHS) && (isa(RHS) || isa(RHS))) { Instruction *LI = cast(LHS); Value *SrcOp = LI->getOperand(0); Type *SrcTy = SrcOp->getType(); Type *DstTy = LI->getType(); // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input // if the integer type is the same size as the pointer type. if (MaxRecurse && isa(LI) && Q.DL.getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) { if (Constant *RHSC = dyn_cast(RHS)) { // Transfer the cast to the constant. if (Value *V = SimplifyICmpInst(Pred, SrcOp, ConstantExpr::getIntToPtr(RHSC, SrcTy), Q, MaxRecurse-1)) return V; } else if (PtrToIntInst *RI = dyn_cast(RHS)) { if (RI->getOperand(0)->getType() == SrcTy) // Compare without the cast. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } } if (isa(LHS)) { // Turn icmp (zext X), (zext Y) into a compare of X and Y if they have the // same type. if (ZExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that signed predicates become unsigned. if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } // Turn icmp (zext X), Cst into a compare of X and Cst if Cst is extended // too. If not, then try to deduce the result of the comparison. else if (ConstantInt *CI = dyn_cast(RHS)) { // Compute the constant that would happen if we truncated to SrcTy then // reextended to DstTy. Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy); Constant *RExt = ConstantExpr::getCast(CastInst::ZExt, Trunc, DstTy); // If the re-extended constant didn't change then this is effectively // also a case of comparing two zero-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are zero while RHS has a non-zero bit // there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); // LHS getContext()); case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return ConstantInt::getTrue(CI->getContext()); // LHS is non-negative. If RHS is negative then LHS >s LHS. If RHS // is non-negative then LHS getValue().isNegative() ? ConstantInt::getTrue(CI->getContext()) : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return CI->getValue().isNegative() ? ConstantInt::getFalse(CI->getContext()) : ConstantInt::getTrue(CI->getContext()); } } } } if (isa(LHS)) { // Turn icmp (sext X), (sext Y) into a compare of X and Y if they have the // same type. 
if (SExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that the predicate does not change. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } // Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended // too. If not, then try to deduce the result of the comparison. else if (ConstantInt *CI = dyn_cast(RHS)) { // Compute the constant that would happen if we truncated to SrcTy then // reextended to DstTy. Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy); Constant *RExt = ConstantExpr::getCast(CastInst::SExt, Trunc, DstTy); // If the re-extended constant didn't change then this is effectively // also a case of comparing two sign-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are all equal, while RHS has varying // bits there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_EQ: return ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_NE: return ConstantInt::getTrue(CI->getContext()); // If RHS is non-negative then LHS s RHS. case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: return CI->getValue().isNegative() ? ConstantInt::getTrue(CI->getContext()) : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return CI->getValue().isNegative() ? ConstantInt::getFalse(CI->getContext()) : ConstantInt::getTrue(CI->getContext()); // If LHS is non-negative then LHS u RHS. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: // Comparison is true iff the LHS =s 0. if (MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp, Constant::getNullValue(SrcTy), Q, MaxRecurse-1)) return V; break; } } } } } // icmp eq|ne X, Y -> false|true if X != Y if (ICmpInst::isEquality(Pred) && isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT)) { return Pred == ICmpInst::ICMP_NE ? getTrue(ITy) : getFalse(ITy); } if (Value *V = simplifyICmpWithBinOp(Pred, LHS, RHS, Q, MaxRecurse)) return V; if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse)) return V; // Simplify comparisons of related pointers using a powerful, recursive // GEP-walk when we have target data available.. if (LHS->getType()->isPointerTy()) if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI, LHS, RHS)) return C; if (auto *CLHS = dyn_cast(LHS)) if (auto *CRHS = dyn_cast(RHS)) if (Q.DL.getTypeSizeInBits(CLHS->getPointerOperandType()) == Q.DL.getTypeSizeInBits(CLHS->getType()) && Q.DL.getTypeSizeInBits(CRHS->getPointerOperandType()) == Q.DL.getTypeSizeInBits(CRHS->getType())) if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI, CLHS->getPointerOperand(), CRHS->getPointerOperand())) return C; if (GetElementPtrInst *GLHS = dyn_cast(LHS)) { if (GEPOperator *GRHS = dyn_cast(RHS)) { if (GLHS->getPointerOperand() == GRHS->getPointerOperand() && GLHS->hasAllConstantIndices() && GRHS->hasAllConstantIndices() && (ICmpInst::isEquality(Pred) || (GLHS->isInBounds() && GRHS->isInBounds() && Pred == ICmpInst::getSignedPredicate(Pred)))) { // The bases are equal and the indices are constant. Build a constant // expression GEP with the same indices and a null base pointer to see // what constant folding can make out of it. 
Constant *Null = Constant::getNullValue(GLHS->getPointerOperandType()); SmallVector IndicesLHS(GLHS->idx_begin(), GLHS->idx_end()); Constant *NewLHS = ConstantExpr::getGetElementPtr( GLHS->getSourceElementType(), Null, IndicesLHS); SmallVector IndicesRHS(GRHS->idx_begin(), GRHS->idx_end()); Constant *NewRHS = ConstantExpr::getGetElementPtr( GLHS->getSourceElementType(), Null, IndicesRHS); return ConstantExpr::getICmp(Pred, NewLHS, NewRHS); } } } // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } /// Given operands for an FCmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!"); if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } // Fold trivial predicates. Type *RetTy = GetCompareTy(LHS); if (Pred == FCmpInst::FCMP_FALSE) return getFalse(RetTy); if (Pred == FCmpInst::FCMP_TRUE) return getTrue(RetTy); // UNO/ORD predicates can be trivially folded if NaNs are ignored. if (FMF.noNaNs()) { if (Pred == FCmpInst::FCMP_UNO) return getFalse(RetTy); if (Pred == FCmpInst::FCMP_ORD) return getTrue(RetTy); } // NaN is unordered; NaN is not ordered. assert((FCmpInst::isOrdered(Pred) || FCmpInst::isUnordered(Pred)) && "Comparison must be either ordered or unordered"); if (match(RHS, m_NaN())) return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred)); // fcmp pred x, undef and fcmp pred undef, x // fold to true if unordered, false if ordered if (isa(LHS) || isa(RHS)) { // Choosing NaN for the undef will always make unordered comparison succeed // and ordered comparison fail. return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred)); } // fcmp x,x -> true/false. Not all compares are foldable. if (LHS == RHS) { if (CmpInst::isTrueWhenEqual(Pred)) return getTrue(RetTy); if (CmpInst::isFalseWhenEqual(Pred)) return getFalse(RetTy); } // Handle fcmp with constant RHS. const APFloat *C; if (match(RHS, m_APFloat(C))) { // Check whether the constant is an infinity. if (C->isInfinity()) { if (C->isNegative()) { switch (Pred) { case FCmpInst::FCMP_OLT: // No value is ordered and less than negative infinity. return getFalse(RetTy); case FCmpInst::FCMP_UGE: // All values are unordered with or at least negative infinity. return getTrue(RetTy); default: break; } } else { switch (Pred) { case FCmpInst::FCMP_OGT: // No value is ordered and greater than infinity. return getFalse(RetTy); case FCmpInst::FCMP_ULE: // All values are unordered with and at most infinity. 
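        // Illustrative example, not from the original source:
        //   "fcmp ule double %x, 0x7FF0000000000000" (+infinity) holds for
        //   every %x, NaN included, so it folds to true.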
return getTrue(RetTy); default: break; } } } if (C->isZero()) { switch (Pred) { case FCmpInst::FCMP_UGE: if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getTrue(RetTy); break; case FCmpInst::FCMP_OLT: // X < 0 if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getFalse(RetTy); break; default: break; } } else if (C->isNegative()) { assert(!C->isNaN() && "Unexpected NaN constant!"); // TODO: We can catch more cases by using a range check rather than // relying on CannotBeOrderedLessThanZero. switch (Pred) { case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_UGT: case FCmpInst::FCMP_UNE: // (X >= 0) implies (X > C) when (C < 0) if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getTrue(RetTy); break; case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_OLE: case FCmpInst::FCMP_OLT: // (X >= 0) implies !(X < C) when (C < 0) if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getFalse(RetTy); break; default: break; } } } // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } /// See if V simplifies when its operand Op is replaced with RepOp. static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, unsigned MaxRecurse) { // Trivial replacement. if (V == Op) return RepOp; // We cannot replace a constant, and shouldn't even try. if (isa(Op)) return nullptr; auto *I = dyn_cast(V); if (!I) return nullptr; // If this is a binary operator, try to simplify it with the replaced op. if (auto *B = dyn_cast(I)) { // Consider: // %cmp = icmp eq i32 %x, 2147483647 // %add = add nsw i32 %x, 1 // %sel = select i1 %cmp, i32 -2147483648, i32 %add // // We can't replace %sel with %add unless we strip away the flags. if (isa(B)) if (B->hasNoSignedWrap() || B->hasNoUnsignedWrap()) return nullptr; if (isa(B)) if (B->isExact()) return nullptr; if (MaxRecurse) { if (B->getOperand(0) == Op) return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), Q, MaxRecurse - 1); if (B->getOperand(1) == Op) return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, Q, MaxRecurse - 1); } } // Same for CmpInsts. if (CmpInst *C = dyn_cast(I)) { if (MaxRecurse) { if (C->getOperand(0) == Op) return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), Q, MaxRecurse - 1); if (C->getOperand(1) == Op) return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, Q, MaxRecurse - 1); } } // Same for GEPs. if (auto *GEP = dyn_cast(I)) { if (MaxRecurse) { SmallVector NewOps(GEP->getNumOperands()); transform(GEP->operands(), NewOps.begin(), [&](Value *V) { return V == Op ? RepOp : V; }); return SimplifyGEPInst(GEP->getSourceElementType(), NewOps, Q, MaxRecurse - 1); } } // TODO: We could hand off more cases to instsimplify here. // If all operands are constant after substituting Op for RepOp then we can // constant fold the instruction. if (Constant *CRepOp = dyn_cast(RepOp)) { // Build a list of all constant operands. 
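    // Illustrative example, not from the original source: for
    // V = "trunc i32 %x to i8" with Op = %x and RepOp = i32 300, every
    // operand is constant after the substitution, so the trunc constant
    // folds to i8 44.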
SmallVector ConstOps; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { if (I->getOperand(i) == Op) ConstOps.push_back(CRepOp); else if (Constant *COp = dyn_cast(I->getOperand(i))) ConstOps.push_back(COp); else break; } // All operands were constants, fold it. if (ConstOps.size() == I->getNumOperands()) { if (CmpInst *C = dyn_cast(I)) return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], ConstOps[1], Q.DL, Q.TLI); if (LoadInst *LI = dyn_cast(I)) if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(ConstOps[0], LI->getType(), Q.DL); return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI); } } return nullptr; } /// Try to simplify a select instruction when its condition operand is an /// integer comparison where one operand of the compare is a constant. static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, const APInt *Y, bool TrueWhenUnset) { const APInt *C; // (X & Y) == 0 ? X & ~Y : X --> X // (X & Y) != 0 ? X & ~Y : X --> X & ~Y if (FalseVal == X && match(TrueVal, m_And(m_Specific(X), m_APInt(C))) && *Y == ~*C) return TrueWhenUnset ? FalseVal : TrueVal; // (X & Y) == 0 ? X : X & ~Y --> X & ~Y // (X & Y) != 0 ? X : X & ~Y --> X if (TrueVal == X && match(FalseVal, m_And(m_Specific(X), m_APInt(C))) && *Y == ~*C) return TrueWhenUnset ? FalseVal : TrueVal; if (Y->isPowerOf2()) { // (X & Y) == 0 ? X | Y : X --> X | Y // (X & Y) != 0 ? X | Y : X --> X if (FalseVal == X && match(TrueVal, m_Or(m_Specific(X), m_APInt(C))) && *Y == *C) return TrueWhenUnset ? TrueVal : FalseVal; // (X & Y) == 0 ? X : X | Y --> X // (X & Y) != 0 ? X : X | Y --> X | Y if (TrueVal == X && match(FalseVal, m_Or(m_Specific(X), m_APInt(C))) && *Y == *C) return TrueWhenUnset ? TrueVal : FalseVal; } return nullptr; } /// An alternative way to test if a bit is set or not uses sgt/slt instead of /// eq/ne. static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, ICmpInst::Predicate Pred, Value *TrueVal, Value *FalseVal) { Value *X; APInt Mask; if (!decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask)) return nullptr; return simplifySelectBitTest(TrueVal, FalseVal, X, &Mask, Pred == ICmpInst::ICMP_EQ); } /// Try to simplify a select instruction when its condition operand is an /// integer comparison. static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { ICmpInst::Predicate Pred; Value *CmpLHS, *CmpRHS; if (!match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) return nullptr; if (ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero())) { Value *X; const APInt *Y; if (match(CmpLHS, m_And(m_Value(X), m_APInt(Y)))) if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y, Pred == ICmpInst::ICMP_EQ)) return V; } // Check for other compares that behave like bit test. if (Value *V = simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred, TrueVal, FalseVal)) return V; // If we have an equality comparison, then we know the value in one of the // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. 
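  // Illustrative example, not from the original source:
  //   %cmp = icmp eq i32 %x, 0
  //   %and = and i32 %x, 7
  //   %sel = select i1 %cmp, i32 0, i32 %and
  // Substituting 0 for %x in the false arm simplifies %and to 0, which is the
  // true arm's value, so %sel simplifies to %and.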
if (Pred == ICmpInst::ICMP_EQ) { if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == TrueVal) return FalseVal; if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return FalseVal; } else if (Pred == ICmpInst::ICMP_NE) { if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return TrueVal; if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == TrueVal) return TrueVal; } return nullptr; } /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *CondC = dyn_cast(Cond)) { if (auto *TrueC = dyn_cast(TrueVal)) if (auto *FalseC = dyn_cast(FalseVal)) return ConstantFoldSelectInstruction(CondC, TrueC, FalseC); // select undef, X, Y -> X or Y if (isa(CondC)) return isa(FalseVal) ? FalseVal : TrueVal; // TODO: Vector constants with undef elements don't simplify. // select true, X, Y -> X if (CondC->isAllOnesValue()) return TrueVal; // select false, X, Y -> Y if (CondC->isNullValue()) return FalseVal; } // select ?, X, X -> X if (TrueVal == FalseVal) return TrueVal; if (isa(TrueVal)) // select ?, undef, X -> X return FalseVal; if (isa(FalseVal)) // select ?, X, undef -> X return TrueVal; if (Value *V = simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse)) return V; if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal)) return V; return nullptr; } Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q) { return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit); } /// Given operands for an GetElementPtrInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q, unsigned) { // The type of the GEP pointer operand. unsigned AS = cast(Ops[0]->getType()->getScalarType())->getAddressSpace(); // getelementptr P -> P. if (Ops.size() == 1) return Ops[0]; // Compute the (pointer) type returned by the GEP instruction. Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1)); Type *GEPTy = PointerType::get(LastType, AS); if (VectorType *VT = dyn_cast(Ops[0]->getType())) GEPTy = VectorType::get(GEPTy, VT->getNumElements()); else if (VectorType *VT = dyn_cast(Ops[1]->getType())) GEPTy = VectorType::get(GEPTy, VT->getNumElements()); if (isa(Ops[0])) return UndefValue::get(GEPTy); if (Ops.size() == 2) { // getelementptr P, 0 -> P. if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy) return Ops[0]; Type *Ty = SrcTy; if (Ty->isSized()) { Value *P; uint64_t C; uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty); // getelementptr P, N -> P if P points to a type of zero size. if (TyAllocSize == 0 && Ops[0]->getType() == GEPTy) return Ops[0]; // The following transforms are only safe if the ptrtoint cast // doesn't truncate the pointers. 
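      // Illustrative example, not from the original source (assumes the index
      // type is as wide as a pointer):
      //   %pi = ptrtoint i8* %P to i64
      //   %vi = ptrtoint i8* %V to i64
      //   %d  = sub i64 %pi, %vi
      //   %g  = getelementptr i8, i8* %V, i64 %d   ; simplifies to %P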
if (Ops[1]->getType()->getScalarSizeInBits() == Q.DL.getIndexSizeInBits(AS)) { auto PtrToIntOrZero = [GEPTy](Value *P) -> Value * { if (match(P, m_Zero())) return Constant::getNullValue(GEPTy); Value *Temp; if (match(P, m_PtrToInt(m_Value(Temp)))) if (Temp->getType() == GEPTy) return Temp; return nullptr; }; // getelementptr V, (sub P, V) -> P if P points to a type of size 1. if (TyAllocSize == 1 && match(Ops[1], m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))))) if (Value *R = PtrToIntOrZero(P)) return R; // getelementptr V, (ashr (sub P, V), C) -> Q // if P points to a type of size 1 << C. if (match(Ops[1], m_AShr(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))), m_ConstantInt(C))) && TyAllocSize == 1ULL << C) if (Value *R = PtrToIntOrZero(P)) return R; // getelementptr V, (sdiv (sub P, V), C) -> Q // if P points to a type of size C. if (match(Ops[1], m_SDiv(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))), m_SpecificInt(TyAllocSize)))) if (Value *R = PtrToIntOrZero(P)) return R; } } } if (Q.DL.getTypeAllocSize(LastType) == 1 && all_of(Ops.slice(1).drop_back(1), [](Value *Idx) { return match(Idx, m_Zero()); })) { unsigned IdxWidth = Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace()); if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) { APInt BasePtrOffset(IdxWidth, 0); Value *StrippedBasePtr = Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL, BasePtrOffset); // gep (gep V, C), (sub 0, V) -> C if (match(Ops.back(), m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr))))) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset); return ConstantExpr::getIntToPtr(CI, GEPTy); } // gep (gep V, C), (xor V, -1) -> C-1 if (match(Ops.back(), m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes()))) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1); return ConstantExpr::getIntToPtr(CI, GEPTy); } } } // Check to see if this is constant foldable. if (!all_of(Ops, [](Value *V) { return isa(V); })) return nullptr; auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast(Ops[0]), Ops.slice(1)); if (auto *CEFolded = ConstantFoldConstant(CE, Q.DL)) return CEFolded; return CE; } Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q) { return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit); } /// Given operands for an InsertValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q, unsigned) { if (Constant *CAgg = dyn_cast(Agg)) if (Constant *CVal = dyn_cast(Val)) return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs); // insertvalue x, undef, n -> x if (match(Val, m_Undef())) return Agg; // insertvalue x, (extractvalue y, n), n if (ExtractValueInst *EV = dyn_cast(Val)) if (EV->getAggregateOperand()->getType() == Agg->getType() && EV->getIndices() == Idxs) { // insertvalue undef, (extractvalue y, n), n -> y if (match(Agg, m_Undef())) return EV->getAggregateOperand(); // insertvalue y, (extractvalue y, n), n -> y if (Agg == EV->getAggregateOperand()) return Agg; } return nullptr; } Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q) { return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit); } Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, const SimplifyQuery &Q) { // Try to constant fold. 
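  // Illustrative example, not from the original source:
  //   insertelement <2 x i32> <i32 1, i32 2>, i32 7, i32 0
  // constant folds to <i32 7, i32 2>.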
auto *VecC = dyn_cast(Vec); auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); // Fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { uint64_t NumElements = cast(Vec->getType())->getNumElements(); if (CI->uge(NumElements)) return UndefValue::get(Vec->getType()); } // If index is undef, it might be out of bounds (see above case) if (isa(Idx)) return UndefValue::get(Vec->getType()); return nullptr; } /// Given operands for an ExtractValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &, unsigned) { if (auto *CAgg = dyn_cast(Agg)) return ConstantFoldExtractValueInstruction(CAgg, Idxs); // extractvalue x, (insertvalue y, elt, n), n -> elt unsigned NumIdxs = Idxs.size(); for (auto *IVI = dyn_cast(Agg); IVI != nullptr; IVI = dyn_cast(IVI->getAggregateOperand())) { ArrayRef InsertValueIdxs = IVI->getIndices(); unsigned NumInsertValueIdxs = InsertValueIdxs.size(); unsigned NumCommonIdxs = std::min(NumInsertValueIdxs, NumIdxs); if (InsertValueIdxs.slice(0, NumCommonIdxs) == Idxs.slice(0, NumCommonIdxs)) { if (NumIdxs == NumInsertValueIdxs) return IVI->getInsertedValueOperand(); break; } } return nullptr; } Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &Q) { return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit); } /// Given operands for an ExtractElementInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &, unsigned) { if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) return ConstantFoldExtractElementInstruction(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) return Splat; if (isa(Vec)) return UndefValue::get(Vec->getType()->getVectorElementType()); } // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. if (auto *IdxC = dyn_cast(Idx)) { if (IdxC->getValue().uge(Vec->getType()->getVectorNumElements())) // definitely out of bounds, thus undefined result return UndefValue::get(Vec->getType()->getVectorElementType()); if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) return Elt; } // An undef extract index can be arbitrarily chosen to be an out-of-range // index value, which would result in the instruction being undef. if (isa(Idx)) return UndefValue::get(Vec->getType()->getVectorElementType()); return nullptr; } Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q) { return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit); } /// See if we can fold the given phi. If not, returns null. static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. Value *CommonValue = nullptr; bool HasUndefInput = false; for (Value *Incoming : PN->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PN) continue; if (isa(Incoming)) { // Remember that we saw an undef value, but otherwise ignore them. HasUndefInput = true; continue; } if (CommonValue && Incoming != CommonValue) return nullptr; // Not the same, bail out. 
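    // Illustrative example, not from the original source:
    //   phi i32 [ %x, %a ], [ %x, %b ]    simplifies to %x, while
    //   phi i32 [ %x, %a ], [ undef, %b ] does so only if %x dominates the
    // phi's block (checked below).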
CommonValue = Incoming; } // If CommonValue is null then all of the incoming values were either undef or // equal to the phi node itself. if (!CommonValue) return UndefValue::get(PN->getType()); // If we have a PHI node like phi(X, undef, X), where X is defined by some // instruction, we cannot return X as the result of the PHI node unless it // dominates the PHI block. if (HasUndefInput) return valueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : nullptr; return CommonValue; } static Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *C = dyn_cast(Op)) return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL); if (auto *CI = dyn_cast(Op)) { auto *Src = CI->getOperand(0); Type *SrcTy = Src->getType(); Type *MidTy = CI->getType(); Type *DstTy = Ty; if (Src->getType() == Ty) { auto FirstOp = static_cast(CI->getOpcode()); auto SecondOp = static_cast(CastOpc); Type *SrcIntPtrTy = SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr; Type *MidIntPtrTy = MidTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(MidTy) : nullptr; Type *DstIntPtrTy = DstTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(DstTy) : nullptr; if (CastInst::isEliminableCastPair(FirstOp, SecondOp, SrcTy, MidTy, DstTy, SrcIntPtrTy, MidIntPtrTy, DstIntPtrTy) == Instruction::BitCast) return Src; } } // bitcast x -> x if (CastOpc == Instruction::BitCast) if (Op->getType() == Ty) return Op; return nullptr; } Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q) { return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); } /// For the given destination element of a shuffle, peek through shuffles to /// match a root vector source operand that contains that element in the same /// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, int MaskVal, Value *RootVec, unsigned MaxRecurse) { if (!MaxRecurse--) return nullptr; // Bail out if any mask value is undefined. That kind of shuffle may be // simplified further based on demanded bits or other folds. if (MaskVal == -1) return nullptr; // The mask value chooses which source operand we need to look at next. int InVecNumElts = Op0->getType()->getVectorNumElements(); int RootElt = MaskVal; Value *SourceOp = Op0; if (MaskVal >= InVecNumElts) { RootElt = MaskVal - InVecNumElts; SourceOp = Op1; } // If the source operand is a shuffle itself, look through it to find the // matching root vector. if (auto *SourceShuf = dyn_cast(SourceOp)) { return foldIdentityShuffles( DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1), SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse); } // TODO: Look through bitcasts? What if the bitcast changes the vector element // size? // The source operand is not a shuffle. Initialize the root vector value for // this shuffle if that has not been done yet. if (!RootVec) RootVec = SourceOp; // Give up as soon as a source operand does not match the existing root value. if (RootVec != SourceOp) return nullptr; // The element must be coming from the same lane in the source vector // (although it may have crossed lanes in intermediate shuffles). 
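  // Illustrative example, not from the original source: two reversing
  // shuffles cancel out:
  //   %r1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  //   %r2 = shufflevector <4 x i32> %r1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  // Every lane of %r2 maps back to the same lane of %v, so %r2 simplifies to %v.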
if (RootElt != DestElt) return nullptr; return RootVec; } static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q, unsigned MaxRecurse) { if (isa(Mask)) return UndefValue::get(RetTy); Type *InVecTy = Op0->getType(); unsigned MaskNumElts = Mask->getType()->getVectorNumElements(); unsigned InVecNumElts = InVecTy->getVectorNumElements(); SmallVector Indices; ShuffleVectorInst::getShuffleMask(Mask, Indices); assert(MaskNumElts == Indices.size() && "Size of Indices not same as number of mask elements?"); // Canonicalization: If mask does not select elements from an input vector, // replace that input vector with undef. bool MaskSelects0 = false, MaskSelects1 = false; for (unsigned i = 0; i != MaskNumElts; ++i) { if (Indices[i] == -1) continue; if ((unsigned)Indices[i] < InVecNumElts) MaskSelects0 = true; else MaskSelects1 = true; } if (!MaskSelects0) Op0 = UndefValue::get(InVecTy); if (!MaskSelects1) Op1 = UndefValue::get(InVecTy); auto *Op0Const = dyn_cast(Op0); auto *Op1Const = dyn_cast(Op1); // If all operands are constant, constant fold the shuffle. if (Op0Const && Op1Const) return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. if (Op0Const && !Op1Const) { std::swap(Op0, Op1); ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts); } // A shuffle of a splat is always the splat itself. Legal if the shuffle's // value type is same as the input vectors' type. if (auto *OpShuf = dyn_cast(Op0)) if (isa(Op1) && RetTy == InVecTy && OpShuf->getMask()->getSplatValue()) return Op0; // Don't fold a shuffle with undef mask elements. This may get folded in a // better way using demanded bits or other analysis. // TODO: Should we allow this? if (find(Indices, -1) != Indices.end()) return nullptr; // Check if every element of this shuffle can be mapped back to the // corresponding element of a single root vector. If so, we don't need this // shuffle. This handles simple identity shuffles as well as chains of // shuffles that may widen/narrow and/or move elements across lanes and back. Value *RootVec = nullptr; for (unsigned i = 0; i != MaskNumElts; ++i) { // Note that recursion is limited for each vector element, so if any element // exceeds the limit, this will fail to simplify. RootVec = foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse); // We can't replace a widening/narrowing shuffle with one of its operands. if (!RootVec || RootVec->getType() != RetTy) return nullptr; } return RootVec; } /// Given operands for a ShuffleVectorInst, fold the result or return null. Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q) { return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit); } static Constant *propagateNaN(Constant *In) { // If the input is a vector with undef elements, just return a default NaN. if (!In->isNaN()) return ConstantFP::getNaN(In->getType()); // Propagate the existing NaN constant when possible. // TODO: Should we quiet a signaling NaN? return In; } static Constant *simplifyFPBinop(Value *Op0, Value *Op1) { if (isa(Op0) || isa(Op1)) return ConstantFP::getNaN(Op0->getType()); if (match(Op0, m_NaN())) return propagateNaN(cast(Op0)); if (match(Op1, m_NaN())) return propagateNaN(cast(Op1)); return nullptr; } /// Given operands for an FAdd, see if we can fold the result. If not, this /// returns null. 
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fadd X, -0 ==> X if (match(Op1, m_NegZeroFP())) return Op0; // fadd X, 0 ==> X, when we know X is not -0 if (match(Op1, m_PosZeroFP()) && (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; // With nnan: (+/-0.0 - X) + X --> 0.0 (and commuted variant) // We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN. // Negative zeros are allowed because we always end up with positive zero: // X = -0.0: (-0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = -0.0: ( 0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = 0.0: (-0.0 - ( 0.0)) + ( 0.0) == (-0.0) + ( 0.0) == 0.0 // X = 0.0: ( 0.0 - ( 0.0)) + ( 0.0) == ( 0.0) + ( 0.0) == 0.0 if (FMF.noNaNs() && (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) || match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0))))) return ConstantFP::getNullValue(Op0->getType()); return nullptr; } /// Given operands for an FSub, see if we can fold the result. If not, this /// returns null. static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fsub X, +0 ==> X if (match(Op1, m_PosZeroFP())) return Op0; // fsub X, -0 ==> X, when we know X is not -0 if (match(Op1, m_NegZeroFP()) && (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; // fsub -0.0, (fsub -0.0, X) ==> X Value *X; if (match(Op0, m_NegZeroFP()) && match(Op1, m_FSub(m_NegZeroFP(), m_Value(X)))) return X; // fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored. if (FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()) && match(Op1, m_FSub(m_AnyZeroFP(), m_Value(X)))) return X; // fsub nnan x, x ==> 0.0 if (FMF.noNaNs() && Op0 == Op1) return Constant::getNullValue(Op0->getType()); return nullptr; } /// Given the operands for an FMul, see if we can fold the result static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fmul X, 1.0 ==> X if (match(Op1, m_FPOne())) return Op0; // fmul nnan nsz X, 0 ==> 0 if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); // sqrt(X) * sqrt(X) --> X, if we can: // 1. Remove the intermediate rounding (reassociate). // 2. Ignore non-zero negative numbers because sqrt would produce NAN. // 3. Ignore -0.0 because sqrt(-0.0) == -0.0, but -0.0 * -0.0 == 0.0. 
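  // Illustrative example, not from the original source:
  //   %s = call double @llvm.sqrt.f64(double %x)
  //   %m = fmul reassoc nnan nsz double %s, %s   ; simplifies to %x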
Value *X; if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X))) && FMF.allowReassoc() && FMF.noNaNs() && FMF.noSignedZeros()) return X; return nullptr; } Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit); } Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit); } Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit); } static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // X / 1.0 -> X if (match(Op1, m_FPOne())) return Op0; // 0 / X -> 0 // Requires that NaNs are off (X could be zero) and signed zeroes are // ignored (X could be positive or negative, so the output sign is unknown). if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); if (FMF.noNaNs()) { // X / X -> 1.0 is legal when NaNs are ignored. // We can ignore infinities because INF/INF is NaN. if (Op0 == Op1) return ConstantFP::get(Op0->getType(), 1.0); // (X * Y) / Y --> X if we can reassociate to the above form. Value *X; if (FMF.allowReassoc() && match(Op0, m_c_FMul(m_Value(X), m_Specific(Op1)))) return X; // -X / X -> -1.0 and // X / -X -> -1.0 are legal when NaNs are ignored. // We can ignore signed zeros because +-0.0/+-0.0 is NaN and ignored. if ((BinaryOperator::isFNeg(Op0, /*IgnoreZeroSign=*/true) && BinaryOperator::getFNegArgument(Op0) == Op1) || (BinaryOperator::isFNeg(Op1, /*IgnoreZeroSign=*/true) && BinaryOperator::getFNegArgument(Op1) == Op0)) return ConstantFP::get(Op0->getType(), -1.0); } return nullptr; } Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit); } static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // Unlike fdiv, the result of frem always matches the sign of the dividend. // The constant match may include undef elements in a vector, so return a full // zero constant as the result. if (FMF.noNaNs()) { // +0 % X -> 0 if (match(Op0, m_PosZeroFP())) return ConstantFP::getNullValue(Op0->getType()); // -0 % X -> -0 if (match(Op0, m_NegZeroFP())) return ConstantFP::getNegativeZero(Op0->getType()); } return nullptr; } Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit); } //=== Helper functions for higher up the class hierarchy. /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::Add: return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Sub: return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Mul: return SimplifyMulInst(LHS, RHS, Q, MaxRecurse); case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse); case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse); case Instruction::Shl: return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::LShr: return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::AShr: return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse); case Instruction::Or: return SimplifyOrInst(LHS, RHS, Q, MaxRecurse); case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse); case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FSub: return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FMul: return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); default: llvm_unreachable("Unexpected opcode"); } } /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FSub: return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FMul: return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse); default: return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse); } } Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); } Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); } /// Given operands for a CmpInst, see if we can fold the result. 
static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); } Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } static bool IsIdempotent(Intrinsic::ID ID) { switch (ID) { default: return false; // Unary idempotent: f(f(x)) = f(x) case Intrinsic::fabs: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: case Intrinsic::round: case Intrinsic::canonicalize: return true; } } static Value *SimplifyRelativeLoad(Constant *Ptr, Constant *Offset, const DataLayout &DL) { GlobalValue *PtrSym; APInt PtrOffset; if (!IsConstantOffsetFromGlobal(Ptr, PtrSym, PtrOffset, DL)) return nullptr; Type *Int8PtrTy = Type::getInt8PtrTy(Ptr->getContext()); Type *Int32Ty = Type::getInt32Ty(Ptr->getContext()); Type *Int32PtrTy = Int32Ty->getPointerTo(); Type *Int64Ty = Type::getInt64Ty(Ptr->getContext()); auto *OffsetConstInt = dyn_cast(Offset); if (!OffsetConstInt || OffsetConstInt->getType()->getBitWidth() > 64) return nullptr; uint64_t OffsetInt = OffsetConstInt->getSExtValue(); if (OffsetInt % 4 != 0) return nullptr; Constant *C = ConstantExpr::getGetElementPtr( Int32Ty, ConstantExpr::getBitCast(Ptr, Int32PtrTy), ConstantInt::get(Int64Ty, OffsetInt / 4)); Constant *Loaded = ConstantFoldLoadFromConstPtr(C, Int32Ty, DL); if (!Loaded) return nullptr; auto *LoadedCE = dyn_cast(Loaded); if (!LoadedCE) return nullptr; if (LoadedCE->getOpcode() == Instruction::Trunc) { LoadedCE = dyn_cast(LoadedCE->getOperand(0)); if (!LoadedCE) return nullptr; } if (LoadedCE->getOpcode() != Instruction::Sub) return nullptr; auto *LoadedLHS = dyn_cast(LoadedCE->getOperand(0)); if (!LoadedLHS || LoadedLHS->getOpcode() != Instruction::PtrToInt) return nullptr; auto *LoadedLHSPtr = LoadedLHS->getOperand(0); Constant *LoadedRHS = LoadedCE->getOperand(1); GlobalValue *LoadedRHSSym; APInt LoadedRHSOffset; if (!IsConstantOffsetFromGlobal(LoadedRHS, LoadedRHSSym, LoadedRHSOffset, DL) || PtrSym != LoadedRHSSym || PtrOffset != LoadedRHSOffset) return nullptr; return ConstantExpr::getBitCast(LoadedLHSPtr, Int8PtrTy); } static bool maskIsAllZeroOrUndef(Value *Mask) { auto *ConstMask = dyn_cast(Mask); if (!ConstMask) return false; if (ConstMask->isNullValue() || isa(ConstMask)) return true; for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E; ++I) { if (auto *MaskElt = ConstMask->getAggregateElement(I)) if (MaskElt->isNullValue() || isa(MaskElt)) continue; return false; } return true; } static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, const SimplifyQuery &Q) { // Idempotent functions return the same result when called repeatedly. 
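  // Illustrative example, not from the original source:
  //   %a = call double @llvm.fabs.f64(double %x)
  //   %b = call double @llvm.fabs.f64(double %a)   ; simplifies to %a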
Intrinsic::ID IID = F->getIntrinsicID(); if (IsIdempotent(IID)) if (auto *II = dyn_cast(Op0)) if (II->getIntrinsicID() == IID) return II; Value *X; switch (IID) { case Intrinsic::fabs: if (SignBitMustBeZero(Op0, Q.TLI)) return Op0; break; case Intrinsic::bswap: // bswap(bswap(x)) -> x if (match(Op0, m_BSwap(m_Value(X)))) return X; break; case Intrinsic::bitreverse: // bitreverse(bitreverse(x)) -> x if (match(Op0, m_BitReverse(m_Value(X)))) return X; break; case Intrinsic::exp: // exp(log(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::exp2: // exp2(log2(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::log: // log(exp(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::log2: // log2(exp2(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; default: break; } return nullptr; } static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, const SimplifyQuery &Q) { Intrinsic::ID IID = F->getIntrinsicID(); Type *ReturnType = F->getReturnType(); switch (IID) { case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: // X - X -> { 0, false } if (Op0 == Op1) return Constant::getNullValue(ReturnType); // X - undef -> undef // undef - X -> undef if (isa(Op0) || isa(Op1)) return UndefValue::get(ReturnType); break; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: // X + undef -> undef if (isa(Op0) || isa(Op1)) return UndefValue::get(ReturnType); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: // 0 * X -> { 0, false } // X * 0 -> { 0, false } if (match(Op0, m_Zero()) || match(Op1, m_Zero())) return Constant::getNullValue(ReturnType); // undef * X -> { 0, false } // X * undef -> { 0, false } if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return Constant::getNullValue(ReturnType); break; case Intrinsic::load_relative: if (auto *C0 = dyn_cast(Op0)) if (auto *C1 = dyn_cast(Op1)) return SimplifyRelativeLoad(C0, C1, Q.DL); break; case Intrinsic::powi: if (auto *Power = dyn_cast(Op1)) { // powi(x, 0) -> 1.0 if (Power->isZero()) return ConstantFP::get(Op0->getType(), 1.0); // powi(x, 1) -> x if (Power->isOne()) return Op0; } break; case Intrinsic::maxnum: case Intrinsic::minnum: // If one argument is NaN, return the other argument. if (match(Op0, m_NaN())) return Op1; if (match(Op1, m_NaN())) return Op0; break; default: break; } return nullptr; } template static Value *simplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, const SimplifyQuery &Q) { // Intrinsics with no operands have some kind of side effect. Don't simplify. unsigned NumOperands = std::distance(ArgBegin, ArgEnd); if (NumOperands == 0) return nullptr; Intrinsic::ID IID = F->getIntrinsicID(); if (NumOperands == 1) return simplifyUnaryIntrinsic(F, ArgBegin[0], Q); if (NumOperands == 2) return simplifyBinaryIntrinsic(F, ArgBegin[0], ArgBegin[1], Q); // Handle intrinsics with 3 or more arguments. switch (IID) { case Intrinsic::masked_load: { Value *MaskArg = ArgBegin[2]; Value *PassthruArg = ArgBegin[3]; // If the mask is all zeros or undef, the "passthru" argument is the result. 
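    // Illustrative example, not from the original source (intrinsic name
    // mangling shown as assumed for <4 x i32>):
    //   %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
    //            <4 x i32>* %p, i32 4, <4 x i1> zeroinitializer,
    //            <4 x i32> %pt)   ; simplifies to %pt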
if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; return nullptr; } case Intrinsic::fshl: case Intrinsic::fshr: { Value *ShAmtArg = ArgBegin[2]; const APInt *ShAmtC; if (match(ShAmtArg, m_APInt(ShAmtC))) { // If there's effectively no shift, return the 1st arg or 2nd arg. // TODO: For vectors, we could check each element of a non-splat constant. APInt BitWidth = APInt(ShAmtC->getBitWidth(), ShAmtC->getBitWidth()); if (ShAmtC->urem(BitWidth).isNullValue()) return ArgBegin[IID == Intrinsic::fshl ? 0 : 1]; } return nullptr; } default: return nullptr; } } template static Value *SimplifyCall(ImmutableCallSite CS, Value *V, IterTy ArgBegin, IterTy ArgEnd, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *Ty = V->getType(); if (PointerType *PTy = dyn_cast(Ty)) Ty = PTy->getElementType(); FunctionType *FTy = cast(Ty); // call undef -> undef // call null -> undef if (isa(V) || isa(V)) return UndefValue::get(FTy->getReturnType()); Function *F = dyn_cast(V); if (!F) return nullptr; if (F->isIntrinsic()) if (Value *Ret = simplifyIntrinsic(F, ArgBegin, ArgEnd, Q)) return Ret; if (!canConstantFoldCallTo(CS, F)) return nullptr; SmallVector ConstantArgs; ConstantArgs.reserve(ArgEnd - ArgBegin); for (IterTy I = ArgBegin, E = ArgEnd; I != E; ++I) { Constant *C = dyn_cast(*I); if (!C) return nullptr; ConstantArgs.push_back(C); } return ConstantFoldCall(CS, F, ConstantArgs, Q.TLI); } Value *llvm::SimplifyCall(ImmutableCallSite CS, Value *V, User::op_iterator ArgBegin, User::op_iterator ArgEnd, const SimplifyQuery &Q) { return ::SimplifyCall(CS, V, ArgBegin, ArgEnd, Q, RecursionLimit); } Value *llvm::SimplifyCall(ImmutableCallSite CS, Value *V, ArrayRef Args, const SimplifyQuery &Q) { return ::SimplifyCall(CS, V, Args.begin(), Args.end(), Q, RecursionLimit); } Value *llvm::SimplifyCall(ImmutableCallSite ICS, const SimplifyQuery &Q) { CallSite CS(const_cast(ICS.getInstruction())); return ::SimplifyCall(CS, CS.getCalledValue(), CS.arg_begin(), CS.arg_end(), Q, RecursionLimit); } /// See if we can compute a simplified version of this instruction. /// If not, this returns null. Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, OptimizationRemarkEmitter *ORE) { const SimplifyQuery Q = SQ.CxtI ? 
SQ : SQ.getWithInstruction(I); Value *Result; switch (I->getOpcode()) { default: Result = ConstantFoldInstruction(I, Q.DL, Q.TLI); break; case Instruction::FAdd: Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Add: Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::FSub: Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Sub: Result = SimplifySubInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::FMul: Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Mul: Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::SDiv: Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::UDiv: Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FDiv: Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::SRem: Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::URem: Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FRem: Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Shl: Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::LShr: Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1), cast(I)->isExact(), Q); break; case Instruction::AShr: Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1), cast(I)->isExact(), Q); break; case Instruction::And: Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Or: Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Xor: Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::ICmp: Result = SimplifyICmpInst(cast(I)->getPredicate(), I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FCmp: Result = SimplifyFCmpInst(cast(I)->getPredicate(), I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Select: Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1), I->getOperand(2), Q); break; case Instruction::GetElementPtr: { SmallVector Ops(I->op_begin(), I->op_end()); Result = SimplifyGEPInst(cast(I)->getSourceElementType(), Ops, Q); break; } case Instruction::InsertValue: { InsertValueInst *IV = cast(I); Result = SimplifyInsertValueInst(IV->getAggregateOperand(), IV->getInsertedValueOperand(), IV->getIndices(), Q); break; } case Instruction::InsertElement: { auto *IE = cast(I); Result = SimplifyInsertElementInst(IE->getOperand(0), IE->getOperand(1), IE->getOperand(2), Q); break; } case Instruction::ExtractValue: { auto *EVI = cast(I); Result = SimplifyExtractValueInst(EVI->getAggregateOperand(), EVI->getIndices(), Q); break; } case Instruction::ExtractElement: { auto *EEI = cast(I); Result = SimplifyExtractElementInst(EEI->getVectorOperand(), EEI->getIndexOperand(), Q); break; } case Instruction::ShuffleVector: { auto *SVI = cast(I); Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), SVI->getMask(), SVI->getType(), 
Q); break; } case Instruction::PHI: Result = SimplifyPHINode(cast(I), Q); break; case Instruction::Call: { CallSite CS(cast(I)); Result = SimplifyCall(CS, Q); break; } #define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc: #include "llvm/IR/Instruction.def" #undef HANDLE_CAST_INST Result = SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), Q); break; case Instruction::Alloca: // No simplifications for Alloca and it can't be constant folded. Result = nullptr; break; } // In general, it is possible for computeKnownBits to determine all bits in a // value even when the operands are not all constants. if (!Result && I->getType()->isIntOrIntVectorTy()) { KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); if (Known.isConstant()) Result = ConstantInt::get(I->getType(), Known.getConstant()); } /// If called on unreachable code, the above logic may report that the /// instruction simplified to itself. Make life easier for users by /// detecting that case here, returning a safe value instead. return Result == I ? UndefValue::get(I->getType()) : Result; } /// Implementation of recursive simplification through an instruction's /// uses. /// /// This is the common implementation of the recursive simplification routines. /// If we have a pre-simplified value in 'SimpleV', that is forcibly used to /// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of /// instructions to process and attempt to simplify it using /// InstructionSimplify. /// /// This routine returns 'true' only when *it* simplifies something. The passed /// in simplified value does not count toward this. static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { bool Simplified = false; SmallSetVector Worklist; const DataLayout &DL = I->getModule()->getDataLayout(); // If we have an explicit value to collapse to, do that round of the // simplification loop by hand initially. if (SimpleV) { for (User *U : I->users()) if (U != I) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // Gracefully handle edge cases where the instruction is not wired into any // parent block. if (I->getParent() && !I->isEHPad() && !isa(I) && !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); } // Note that we must test the size on each iteration, the worklist can grow. for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { I = Worklist[Idx]; // See if this instruction simplifies. SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC}); if (!SimpleV) continue; Simplified = true; // Stash away all the uses of the old instruction so we can check them for // recursive simplifications after a RAUW. This is cheaper than checking all // uses of To on the recursive step in most cases. for (User *U : I->users()) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // Gracefully handle edge cases where the instruction is not wired into any // parent block. 
if (I->getParent() && !I->isEHPad() && !isa(I) && !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; } bool llvm::recursivelySimplifyInstruction(Instruction *I, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC); } bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!"); assert(SimpleV && "Must provide a simplified value."); return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC); } namespace llvm { const SimplifyQuery getBestSimplifyQuery(Pass &P, Function &F) { auto *DTWP = P.getAnalysisIfAvailable(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *TLIWP = P.getAnalysisIfAvailable(); auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr; auto *ACWP = P.getAnalysisIfAvailable(); auto *AC = ACWP ? &ACWP->getAssumptionCache(F) : nullptr; return {F.getParent()->getDataLayout(), TLI, DT, AC}; } const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &AR, const DataLayout &DL) { return {DL, &AR.TLI, &AR.DT, &AR.AC}; } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &AM, Function &F) { auto *DT = AM.template getCachedResult(F); auto *TLI = AM.template getCachedResult(F); auto *AC = AM.template getCachedResult(F); return {F.getParent()->getDataLayout(), TLI, DT, AC}; } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, Function &); } Index: vendor/llvm/dist-release_70/lib/Analysis/MemorySSA.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Analysis/MemorySSA.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/Analysis/MemorySSA.cpp (revision 338000) @@ -1,2186 +1,2191 @@ //===- MemorySSA.cpp - Memory SSA Builder ---------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the MemorySSA class. 
// //===----------------------------------------------------------------------===// #include "llvm/Analysis/MemorySSA.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/IteratedDominanceFrontier.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "memoryssa" INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false, true) INITIALIZE_PASS_BEGIN(MemorySSAPrinterLegacyPass, "print-memoryssa", "Memory SSA Printer", false, false) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(MemorySSAPrinterLegacyPass, "print-memoryssa", "Memory SSA Printer", false, false) static cl::opt MaxCheckLimit( "memssa-check-limit", cl::Hidden, cl::init(100), cl::desc("The maximum number of stores/phis MemorySSA" "will consider trying to walk past (default = 100)")); static cl::opt VerifyMemorySSA("verify-memoryssa", cl::init(false), cl::Hidden, cl::desc("Verify MemorySSA in legacy printer pass.")); namespace llvm { /// An assembly annotator class to print Memory SSA information in /// comments. class MemorySSAAnnotatedWriter : public AssemblyAnnotationWriter { friend class MemorySSA; const MemorySSA *MSSA; public: MemorySSAAnnotatedWriter(const MemorySSA *M) : MSSA(M) {} void emitBasicBlockStartAnnot(const BasicBlock *BB, formatted_raw_ostream &OS) override { if (MemoryAccess *MA = MSSA->getMemoryAccess(BB)) OS << "; " << *MA << "\n"; } void emitInstructionAnnot(const Instruction *I, formatted_raw_ostream &OS) override { if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) OS << "; " << *MA << "\n"; } }; } // end namespace llvm namespace { /// Our current alias analysis API differentiates heavily between calls and /// non-calls, and functions called on one usually assert on the other. /// This class encapsulates the distinction to simplify other code that wants /// "Memory affecting instructions and related data" to use as a key. /// For example, this class is used as a densemap key in the use optimizer. 
class MemoryLocOrCall { public: bool IsCall = false; MemoryLocOrCall() = default; MemoryLocOrCall(MemoryUseOrDef *MUD) : MemoryLocOrCall(MUD->getMemoryInst()) {} MemoryLocOrCall(const MemoryUseOrDef *MUD) : MemoryLocOrCall(MUD->getMemoryInst()) {} MemoryLocOrCall(Instruction *Inst) { if (ImmutableCallSite(Inst)) { IsCall = true; CS = ImmutableCallSite(Inst); } else { IsCall = false; // There is no such thing as a memorylocation for a fence inst, and it is // unique in that regard. if (!isa(Inst)) Loc = MemoryLocation::get(Inst); } } explicit MemoryLocOrCall(const MemoryLocation &Loc) : Loc(Loc) {} ImmutableCallSite getCS() const { assert(IsCall); return CS; } MemoryLocation getLoc() const { assert(!IsCall); return Loc; } bool operator==(const MemoryLocOrCall &Other) const { if (IsCall != Other.IsCall) return false; if (!IsCall) return Loc == Other.Loc; if (CS.getCalledValue() != Other.CS.getCalledValue()) return false; return CS.arg_size() == Other.CS.arg_size() && std::equal(CS.arg_begin(), CS.arg_end(), Other.CS.arg_begin()); } private: union { ImmutableCallSite CS; MemoryLocation Loc; }; }; } // end anonymous namespace namespace llvm { template <> struct DenseMapInfo { static inline MemoryLocOrCall getEmptyKey() { return MemoryLocOrCall(DenseMapInfo::getEmptyKey()); } static inline MemoryLocOrCall getTombstoneKey() { return MemoryLocOrCall(DenseMapInfo::getTombstoneKey()); } static unsigned getHashValue(const MemoryLocOrCall &MLOC) { if (!MLOC.IsCall) return hash_combine( MLOC.IsCall, DenseMapInfo::getHashValue(MLOC.getLoc())); hash_code hash = hash_combine(MLOC.IsCall, DenseMapInfo::getHashValue( MLOC.getCS().getCalledValue())); for (const Value *Arg : MLOC.getCS().args()) hash = hash_combine(hash, DenseMapInfo::getHashValue(Arg)); return hash; } static bool isEqual(const MemoryLocOrCall &LHS, const MemoryLocOrCall &RHS) { return LHS == RHS; } }; } // end namespace llvm /// This does one-way checks to see if Use could theoretically be hoisted above /// MayClobber. This will not check the other way around. /// /// This assumes that, for the purposes of MemorySSA, Use comes directly after /// MayClobber, with no potentially clobbering operations in between them. /// (Where potentially clobbering ops are memory barriers, aliased stores, etc.) static bool areLoadsReorderable(const LoadInst *Use, const LoadInst *MayClobber) { bool VolatileUse = Use->isVolatile(); bool VolatileClobber = MayClobber->isVolatile(); // Volatile operations may never be reordered with other volatile operations. if (VolatileUse && VolatileClobber) return false; // Otherwise, volatile doesn't matter here. From the language reference: // 'optimizers may change the order of volatile operations relative to // non-volatile operations.'" // If a load is seq_cst, it cannot be moved above other loads. If its ordering // is weaker, it can be moved above other loads. We just need to be sure that // MayClobber isn't an acquire load, because loads can't be moved above // acquire loads. // // Note that this explicitly *does* allow the free reordering of monotonic (or // weaker) loads of the same address. bool SeqCstUse = Use->getOrdering() == AtomicOrdering::SequentiallyConsistent; bool MayClobberIsAcquire = isAtLeastOrStrongerThan(MayClobber->getOrdering(), AtomicOrdering::Acquire); return !(SeqCstUse || MayClobberIsAcquire); } namespace { struct ClobberAlias { bool IsClobber; Optional AR; }; } // end anonymous namespace // Return a pair of {IsClobber (bool), AR (AliasResult)}. 
// It relies on AR being ignored if IsClobber = false.
static ClobberAlias instructionClobbersQuery(MemoryDef *MD,
                                             const MemoryLocation &UseLoc,
                                             const Instruction *UseInst,
                                             AliasAnalysis &AA) {
  Instruction *DefInst = MD->getMemoryInst();
  assert(DefInst && "Defining instruction not actually an instruction");
  ImmutableCallSite UseCS(UseInst);
  Optional<AliasResult> AR;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    // These intrinsics will show up as affecting memory, but they are just
-   // markers.
+   // markers, mostly.
+   //
+   // FIXME: We probably don't actually want MemorySSA to model these at all
+   // (including creating MemoryAccesses for them): we just end up inventing
+   // clobbers where they don't really exist at all. Please see D43269 for
+   // context.
    switch (II->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
      if (UseCS)
        return {false, NoAlias};
      AR = AA.alias(MemoryLocation(II->getArgOperand(1)), UseLoc);
-     return {AR == MustAlias, AR};
+     return {AR != NoAlias, AR};
    case Intrinsic::lifetime_end:
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::assume:
      return {false, NoAlias};
    default:
      break;
    }
  }

  if (UseCS) {
    ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
    AR = isMustSet(I) ? MustAlias : MayAlias;
    return {isModOrRefSet(I), AR};
  }

  if (auto *DefLoad = dyn_cast<LoadInst>(DefInst))
    if (auto *UseLoad = dyn_cast<LoadInst>(UseInst))
      return {!areLoadsReorderable(UseLoad, DefLoad), MayAlias};

  ModRefInfo I = AA.getModRefInfo(DefInst, UseLoc);
  AR = isMustSet(I) ? MustAlias : MayAlias;
  return {isModSet(I), AR};
}

static ClobberAlias instructionClobbersQuery(MemoryDef *MD,
                                             const MemoryUseOrDef *MU,
                                             const MemoryLocOrCall &UseMLOC,
                                             AliasAnalysis &AA) {
  // FIXME: This is a temporary hack to allow a single instructionClobbersQuery
  // to exist while MemoryLocOrCall is pushed through places.
  if (UseMLOC.IsCall)
    return instructionClobbersQuery(MD, MemoryLocation(), MU->getMemoryInst(),
                                    AA);
  return instructionClobbersQuery(MD, UseMLOC.getLoc(), MU->getMemoryInst(),
                                  AA);
}

// Return true when MD may alias MU, return false otherwise.
bool MemorySSAUtil::defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
                                        AliasAnalysis &AA) {
  return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA).IsClobber;
}

namespace {

struct UpwardsMemoryQuery {
  // True if our original query started off as a call
  bool IsCall = false;
  // The pointer location we started the query with. This will be empty if
  // IsCall is true.
  MemoryLocation StartingLoc;
  // This is the instruction we were querying about.
  const Instruction *Inst = nullptr;
  // The MemoryAccess we actually got called with, used to test local domination
  const MemoryAccess *OriginalAccess = nullptr;
  Optional<AliasResult> AR = MayAlias;

  UpwardsMemoryQuery() = default;

  UpwardsMemoryQuery(const Instruction *Inst, const MemoryAccess *Access)
      : IsCall(ImmutableCallSite(Inst)), Inst(Inst), OriginalAccess(Access) {
    if (!IsCall)
      StartingLoc = MemoryLocation::get(Inst);
  }
};

} // end anonymous namespace

static bool lifetimeEndsAt(MemoryDef *MD, const MemoryLocation &Loc,
                           AliasAnalysis &AA) {
  Instruction *Inst = MD->getMemoryInst();
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::lifetime_end:
      return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), Loc);
    default:
      return false;
    }
  }
  return false;
}

static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysis &AA,
                                                   const Instruction *I) {
  // If the memory can't be changed, then loads of the memory can't be
  // clobbered.
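  //
  // For example (illustrative IR only), a load tagged with !invariant.load
  // metadata, or a load whose pointer AA proves refers to constant memory,
  // is trivially fed by liveOnEntry:
  //
  //   %v = load i32, i32* %p, !invariant.load !0
  //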
  return isa<LoadInst>(I) &&
         (I->getMetadata(LLVMContext::MD_invariant_load) ||
          AA.pointsToConstantMemory(cast<LoadInst>(I)->getPointerOperand()));
}

/// Verifies that `Start` is clobbered by `ClobberAt`, and that nothing
/// in between `Start` and `ClobberAt` can clobber `Start`.
///
/// This is meant to be as simple and self-contained as possible. Because it
/// uses no cache, etc., it can be relatively expensive.
///
/// \param Start     The MemoryAccess that we want to walk from.
/// \param ClobberAt A clobber for Start.
/// \param StartLoc  The MemoryLocation for Start.
/// \param MSSA      The MemorySSA instance that Start and ClobberAt belong to.
/// \param Query     The UpwardsMemoryQuery we used for our search.
/// \param AA        The AliasAnalysis we used for our search.
static void LLVM_ATTRIBUTE_UNUSED
checkClobberSanity(MemoryAccess *Start, MemoryAccess *ClobberAt,
                   const MemoryLocation &StartLoc, const MemorySSA &MSSA,
                   const UpwardsMemoryQuery &Query, AliasAnalysis &AA) {
  assert(MSSA.dominates(ClobberAt, Start) && "Clobber doesn't dominate start?");

  if (MSSA.isLiveOnEntryDef(Start)) {
    assert(MSSA.isLiveOnEntryDef(ClobberAt) &&
           "liveOnEntry must clobber itself");
    return;
  }

  bool FoundClobber = false;
  DenseSet<MemoryAccessPair> VisitedPhis;
  SmallVector<MemoryAccessPair, 8> Worklist;
  Worklist.emplace_back(Start, StartLoc);
  // Walk all paths from Start to ClobberAt, while looking for clobbers. If one
  // is found, complain.
  while (!Worklist.empty()) {
    MemoryAccessPair MAP = Worklist.pop_back_val();
    // All we care about is that nothing from Start to ClobberAt clobbers Start.
    // We learn nothing from revisiting nodes.
    if (!VisitedPhis.insert(MAP).second)
      continue;

    for (MemoryAccess *MA : def_chain(MAP.first)) {
      if (MA == ClobberAt) {
        if (auto *MD = dyn_cast<MemoryDef>(MA)) {
          // instructionClobbersQuery isn't essentially free, so don't use `|=`,
          // since it won't let us short-circuit.
          //
          // Also, note that this can't be hoisted out of the `Worklist` loop,
          // since MD may only act as a clobber for 1 of N MemoryLocations.
          FoundClobber = FoundClobber || MSSA.isLiveOnEntryDef(MD);
          if (!FoundClobber) {
            ClobberAlias CA =
                instructionClobbersQuery(MD, MAP.second, Query.Inst, AA);
            if (CA.IsClobber) {
              FoundClobber = true;
              // Not used: CA.AR;
            }
          }
        }
        break;
      }

      // We should never hit liveOnEntry, unless it's the clobber.
      assert(!MSSA.isLiveOnEntryDef(MA) && "Hit liveOnEntry before clobber?");

      if (auto *MD = dyn_cast<MemoryDef>(MA)) {
        (void)MD;
        assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA)
                    .IsClobber &&
               "Found clobber before reaching ClobberAt!");
        continue;
      }

      assert(isa<MemoryPhi>(MA));
      Worklist.append(upward_defs_begin({MA, MAP.second}), upward_defs_end());
    }
  }

  // If ClobberAt is a MemoryPhi, we can assume something above it acted as a
  // clobber. Otherwise, `ClobberAt` should've acted as a clobber at some point.
  assert((isa<MemoryPhi>(ClobberAt) || FoundClobber) &&
         "ClobberAt never acted as a clobber");
}

namespace {

/// Our algorithm for walking (and trying to optimize) clobbers, all wrapped up
/// in one class.
class ClobberWalker {
  /// Save a few bytes by using unsigned instead of size_t.
  using ListIndex = unsigned;

  /// Represents a span of contiguous MemoryDefs, potentially ending in a
  /// MemoryPhi.
  struct DefPath {
    MemoryLocation Loc;
    // Note that, because we always walk in reverse, Last will always dominate
    // First. Also note that First and Last are inclusive.
MemoryAccess *First; MemoryAccess *Last; Optional Previous; DefPath(const MemoryLocation &Loc, MemoryAccess *First, MemoryAccess *Last, Optional Previous) : Loc(Loc), First(First), Last(Last), Previous(Previous) {} DefPath(const MemoryLocation &Loc, MemoryAccess *Init, Optional Previous) : DefPath(Loc, Init, Init, Previous) {} }; const MemorySSA &MSSA; AliasAnalysis &AA; DominatorTree &DT; UpwardsMemoryQuery *Query; // Phi optimization bookkeeping SmallVector Paths; DenseSet VisitedPhis; /// Find the nearest def or phi that `From` can legally be optimized to. const MemoryAccess *getWalkTarget(const MemoryPhi *From) const { assert(From->getNumOperands() && "Phi with no operands?"); BasicBlock *BB = From->getBlock(); MemoryAccess *Result = MSSA.getLiveOnEntryDef(); DomTreeNode *Node = DT.getNode(BB); while ((Node = Node->getIDom())) { auto *Defs = MSSA.getBlockDefs(Node->getBlock()); if (Defs) return &*Defs->rbegin(); } return Result; } /// Result of calling walkToPhiOrClobber. struct UpwardsWalkResult { /// The "Result" of the walk. Either a clobber, the last thing we walked, or /// both. Include alias info when clobber found. MemoryAccess *Result; bool IsKnownClobber; Optional AR; }; /// Walk to the next Phi or Clobber in the def chain starting at Desc.Last. /// This will update Desc.Last as it walks. It will (optionally) also stop at /// StopAt. /// /// This does not test for whether StopAt is a clobber UpwardsWalkResult walkToPhiOrClobber(DefPath &Desc, const MemoryAccess *StopAt = nullptr) const { assert(!isa(Desc.Last) && "Uses don't exist in my world"); for (MemoryAccess *Current : def_chain(Desc.Last)) { Desc.Last = Current; if (Current == StopAt) return {Current, false, MayAlias}; if (auto *MD = dyn_cast(Current)) { if (MSSA.isLiveOnEntryDef(MD)) return {MD, true, MustAlias}; ClobberAlias CA = instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA); if (CA.IsClobber) return {MD, true, CA.AR}; } } assert(isa(Desc.Last) && "Ended at a non-clobber that's not a phi?"); return {Desc.Last, false, MayAlias}; } void addSearches(MemoryPhi *Phi, SmallVectorImpl &PausedSearches, ListIndex PriorNode) { auto UpwardDefs = make_range(upward_defs_begin({Phi, Paths[PriorNode].Loc}), upward_defs_end()); for (const MemoryAccessPair &P : UpwardDefs) { PausedSearches.push_back(Paths.size()); Paths.emplace_back(P.second, P.first, PriorNode); } } /// Represents a search that terminated after finding a clobber. This clobber /// may or may not be present in the path of defs from LastNode..SearchStart, /// since it may have been retrieved from cache. struct TerminatedPath { MemoryAccess *Clobber; ListIndex LastNode; }; /// Get an access that keeps us from optimizing to the given phi. /// /// PausedSearches is an array of indices into the Paths array. Its incoming /// value is the indices of searches that stopped at the last phi optimization /// target. It's left in an unspecified state. /// /// If this returns None, NewPaused is a vector of searches that terminated /// at StopWhere. Otherwise, NewPaused is left in an unspecified state. Optional getBlockingAccess(const MemoryAccess *StopWhere, SmallVectorImpl &PausedSearches, SmallVectorImpl &NewPaused, SmallVectorImpl &Terminated) { assert(!PausedSearches.empty() && "No searches to continue?"); // BFS vs DFS really doesn't make a difference here, so just do a DFS with // PausedSearches as our stack. 
while (!PausedSearches.empty()) { ListIndex PathIndex = PausedSearches.pop_back_val(); DefPath &Node = Paths[PathIndex]; // If we've already visited this path with this MemoryLocation, we don't // need to do so again. // // NOTE: That we just drop these paths on the ground makes caching // behavior sporadic. e.g. given a diamond: // A // B C // D // // ...If we walk D, B, A, C, we'll only cache the result of phi // optimization for A, B, and D; C will be skipped because it dies here. // This arguably isn't the worst thing ever, since: // - We generally query things in a top-down order, so if we got below D // without needing cache entries for {C, MemLoc}, then chances are // that those cache entries would end up ultimately unused. // - We still cache things for A, so C only needs to walk up a bit. // If this behavior becomes problematic, we can fix without a ton of extra // work. if (!VisitedPhis.insert({Node.Last, Node.Loc}).second) continue; UpwardsWalkResult Res = walkToPhiOrClobber(Node, /*StopAt=*/StopWhere); if (Res.IsKnownClobber) { assert(Res.Result != StopWhere); // If this wasn't a cache hit, we hit a clobber when walking. That's a // failure. TerminatedPath Term{Res.Result, PathIndex}; if (!MSSA.dominates(Res.Result, StopWhere)) return Term; // Otherwise, it's a valid thing to potentially optimize to. Terminated.push_back(Term); continue; } if (Res.Result == StopWhere) { // We've hit our target. Save this path off for if we want to continue // walking. NewPaused.push_back(PathIndex); continue; } assert(!MSSA.isLiveOnEntryDef(Res.Result) && "liveOnEntry is a clobber"); addSearches(cast(Res.Result), PausedSearches, PathIndex); } return None; } template struct generic_def_path_iterator : public iterator_facade_base, std::forward_iterator_tag, T *> { generic_def_path_iterator() = default; generic_def_path_iterator(Walker *W, ListIndex N) : W(W), N(N) {} T &operator*() const { return curNode(); } generic_def_path_iterator &operator++() { N = curNode().Previous; return *this; } bool operator==(const generic_def_path_iterator &O) const { if (N.hasValue() != O.N.hasValue()) return false; return !N.hasValue() || *N == *O.N; } private: T &curNode() const { return W->Paths[*N]; } Walker *W = nullptr; Optional N = None; }; using def_path_iterator = generic_def_path_iterator; using const_def_path_iterator = generic_def_path_iterator; iterator_range def_path(ListIndex From) { return make_range(def_path_iterator(this, From), def_path_iterator()); } iterator_range const_def_path(ListIndex From) const { return make_range(const_def_path_iterator(this, From), const_def_path_iterator()); } struct OptznResult { /// The path that contains our result. TerminatedPath PrimaryClobber; /// The paths that we can legally cache back from, but that aren't /// necessarily the result of the Phi optimization. SmallVector OtherClobbers; }; ListIndex defPathIndex(const DefPath &N) const { // The assert looks nicer if we don't need to do &N const DefPath *NP = &N; assert(!Paths.empty() && NP >= &Paths.front() && NP <= &Paths.back() && "Out of bounds DefPath!"); return NP - &Paths.front(); } /// Try to optimize a phi as best as we can. Returns a SmallVector of Paths /// that act as legal clobbers. Note that this won't return *all* clobbers. /// /// Phi optimization algorithm tl;dr: /// - Find the earliest def/phi, A, we can optimize to /// - Find if all paths from the starting memory access ultimately reach A /// - If not, optimization isn't possible. /// - Otherwise, walk from A to another clobber or phi, A'. 
/// - If A' is a def, we're done. /// - If A' is a phi, try to optimize it. /// /// A path is a series of {MemoryAccess, MemoryLocation} pairs. A path /// terminates when a MemoryAccess that clobbers said MemoryLocation is found. OptznResult tryOptimizePhi(MemoryPhi *Phi, MemoryAccess *Start, const MemoryLocation &Loc) { assert(Paths.empty() && VisitedPhis.empty() && "Reset the optimization state."); Paths.emplace_back(Loc, Start, Phi, None); // Stores how many "valid" optimization nodes we had prior to calling // addSearches/getBlockingAccess. Necessary for caching if we had a blocker. auto PriorPathsSize = Paths.size(); SmallVector PausedSearches; SmallVector NewPaused; SmallVector TerminatedPaths; addSearches(Phi, PausedSearches, 0); // Moves the TerminatedPath with the "most dominated" Clobber to the end of // Paths. auto MoveDominatedPathToEnd = [&](SmallVectorImpl &Paths) { assert(!Paths.empty() && "Need a path to move"); auto Dom = Paths.begin(); for (auto I = std::next(Dom), E = Paths.end(); I != E; ++I) if (!MSSA.dominates(I->Clobber, Dom->Clobber)) Dom = I; auto Last = Paths.end() - 1; if (Last != Dom) std::iter_swap(Last, Dom); }; MemoryPhi *Current = Phi; while (true) { assert(!MSSA.isLiveOnEntryDef(Current) && "liveOnEntry wasn't treated as a clobber?"); const auto *Target = getWalkTarget(Current); // If a TerminatedPath doesn't dominate Target, then it wasn't a legal // optimization for the prior phi. assert(all_of(TerminatedPaths, [&](const TerminatedPath &P) { return MSSA.dominates(P.Clobber, Target); })); // FIXME: This is broken, because the Blocker may be reported to be // liveOnEntry, and we'll happily wait for that to disappear (read: never) // For the moment, this is fine, since we do nothing with blocker info. if (Optional Blocker = getBlockingAccess( Target, PausedSearches, NewPaused, TerminatedPaths)) { // Find the node we started at. We can't search based on N->Last, since // we may have gone around a loop with a different MemoryLocation. auto Iter = find_if(def_path(Blocker->LastNode), [&](const DefPath &N) { return defPathIndex(N) < PriorPathsSize; }); assert(Iter != def_path_iterator()); DefPath &CurNode = *Iter; assert(CurNode.Last == Current); // Two things: // A. We can't reliably cache all of NewPaused back. Consider a case // where we have two paths in NewPaused; one of which can't optimize // above this phi, whereas the other can. If we cache the second path // back, we'll end up with suboptimal cache entries. We can handle // cases like this a bit better when we either try to find all // clobbers that block phi optimization, or when our cache starts // supporting unfinished searches. // B. We can't reliably cache TerminatedPaths back here without doing // extra checks; consider a case like: // T // / \ // D C // \ / // S // Where T is our target, C is a node with a clobber on it, D is a // diamond (with a clobber *only* on the left or right node, N), and // S is our start. Say we walk to D, through the node opposite N // (read: ignoring the clobber), and see a cache entry in the top // node of D. That cache entry gets put into TerminatedPaths. We then // walk up to C (N is later in our worklist), find the clobber, and // quit. If we append TerminatedPaths to OtherClobbers, we'll cache // the bottom part of D to the cached clobber, ignoring the clobber // in N. Again, this problem goes away if we start tracking all // blockers for a given phi optimization. 
TerminatedPath Result{CurNode.Last, defPathIndex(CurNode)}; return {Result, {}}; } // If there's nothing left to search, then all paths led to valid clobbers // that we got from our cache; pick the nearest to the start, and allow // the rest to be cached back. if (NewPaused.empty()) { MoveDominatedPathToEnd(TerminatedPaths); TerminatedPath Result = TerminatedPaths.pop_back_val(); return {Result, std::move(TerminatedPaths)}; } MemoryAccess *DefChainEnd = nullptr; SmallVector Clobbers; for (ListIndex Paused : NewPaused) { UpwardsWalkResult WR = walkToPhiOrClobber(Paths[Paused]); if (WR.IsKnownClobber) Clobbers.push_back({WR.Result, Paused}); else // Micro-opt: If we hit the end of the chain, save it. DefChainEnd = WR.Result; } if (!TerminatedPaths.empty()) { // If we couldn't find the dominating phi/liveOnEntry in the above loop, // do it now. if (!DefChainEnd) for (auto *MA : def_chain(const_cast(Target))) DefChainEnd = MA; // If any of the terminated paths don't dominate the phi we'll try to // optimize, we need to figure out what they are and quit. const BasicBlock *ChainBB = DefChainEnd->getBlock(); for (const TerminatedPath &TP : TerminatedPaths) { // Because we know that DefChainEnd is as "high" as we can go, we // don't need local dominance checks; BB dominance is sufficient. if (DT.dominates(ChainBB, TP.Clobber->getBlock())) Clobbers.push_back(TP); } } // If we have clobbers in the def chain, find the one closest to Current // and quit. if (!Clobbers.empty()) { MoveDominatedPathToEnd(Clobbers); TerminatedPath Result = Clobbers.pop_back_val(); return {Result, std::move(Clobbers)}; } assert(all_of(NewPaused, [&](ListIndex I) { return Paths[I].Last == DefChainEnd; })); // Because liveOnEntry is a clobber, this must be a phi. auto *DefChainPhi = cast(DefChainEnd); PriorPathsSize = Paths.size(); PausedSearches.clear(); for (ListIndex I : NewPaused) addSearches(DefChainPhi, PausedSearches, I); NewPaused.clear(); Current = DefChainPhi; } } void verifyOptResult(const OptznResult &R) const { assert(all_of(R.OtherClobbers, [&](const TerminatedPath &P) { return MSSA.dominates(P.Clobber, R.PrimaryClobber.Clobber); })); } void resetPhiOptznState() { Paths.clear(); VisitedPhis.clear(); } public: ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT) : MSSA(MSSA), AA(AA), DT(DT) {} /// Finds the nearest clobber for the given query, optimizing phis if /// possible. MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q) { Query = &Q; MemoryAccess *Current = Start; // This walker pretends uses don't exist. If we're handed one, silently grab // its def. 
(This has the nice side-effect of ensuring we never cache uses) if (auto *MU = dyn_cast(Start)) Current = MU->getDefiningAccess(); DefPath FirstDesc(Q.StartingLoc, Current, Current, None); // Fast path for the overly-common case (no crazy phi optimization // necessary) UpwardsWalkResult WalkResult = walkToPhiOrClobber(FirstDesc); MemoryAccess *Result; if (WalkResult.IsKnownClobber) { Result = WalkResult.Result; Q.AR = WalkResult.AR; } else { OptznResult OptRes = tryOptimizePhi(cast(FirstDesc.Last), Current, Q.StartingLoc); verifyOptResult(OptRes); resetPhiOptznState(); Result = OptRes.PrimaryClobber.Clobber; } #ifdef EXPENSIVE_CHECKS checkClobberSanity(Current, Result, Q.StartingLoc, MSSA, Q, AA); #endif return Result; } void verify(const MemorySSA *MSSA) { assert(MSSA == &this->MSSA); } }; struct RenamePassData { DomTreeNode *DTN; DomTreeNode::const_iterator ChildIt; MemoryAccess *IncomingVal; RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It, MemoryAccess *M) : DTN(D), ChildIt(It), IncomingVal(M) {} void swap(RenamePassData &RHS) { std::swap(DTN, RHS.DTN); std::swap(ChildIt, RHS.ChildIt); std::swap(IncomingVal, RHS.IncomingVal); } }; } // end anonymous namespace namespace llvm { /// A MemorySSAWalker that does AA walks to disambiguate accesses. It no /// longer does caching on its own, but the name has been retained for the /// moment. class MemorySSA::CachingWalker final : public MemorySSAWalker { ClobberWalker Walker; MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &); public: CachingWalker(MemorySSA *, AliasAnalysis *, DominatorTree *); ~CachingWalker() override = default; using MemorySSAWalker::getClobberingMemoryAccess; MemoryAccess *getClobberingMemoryAccess(MemoryAccess *) override; MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, const MemoryLocation &) override; void invalidateInfo(MemoryAccess *) override; void verify(const MemorySSA *MSSA) override { MemorySSAWalker::verify(MSSA); Walker.verify(MSSA); } }; } // end namespace llvm void MemorySSA::renameSuccessorPhis(BasicBlock *BB, MemoryAccess *IncomingVal, bool RenameAllUses) { // Pass through values to our successors for (const BasicBlock *S : successors(BB)) { auto It = PerBlockAccesses.find(S); // Rename the phi nodes in our successor block if (It == PerBlockAccesses.end() || !isa(It->second->front())) continue; AccessList *Accesses = It->second.get(); auto *Phi = cast(&Accesses->front()); if (RenameAllUses) { int PhiIndex = Phi->getBasicBlockIndex(BB); assert(PhiIndex != -1 && "Incomplete phi during partial rename"); Phi->setIncomingValue(PhiIndex, IncomingVal); } else Phi->addIncoming(IncomingVal, BB); } } /// Rename a single basic block into MemorySSA form. /// Uses the standard SSA renaming algorithm. /// \returns The new incoming value. MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, MemoryAccess *IncomingVal, bool RenameAllUses) { auto It = PerBlockAccesses.find(BB); // Skip most processing if the list is empty. if (It != PerBlockAccesses.end()) { AccessList *Accesses = It->second.get(); for (MemoryAccess &L : *Accesses) { if (MemoryUseOrDef *MUD = dyn_cast(&L)) { if (MUD->getDefiningAccess() == nullptr || RenameAllUses) MUD->setDefiningAccess(IncomingVal); if (isa(&L)) IncomingVal = &L; } else { IncomingVal = &L; } } } return IncomingVal; } /// This is the standard SSA renaming algorithm. /// /// We walk the dominator tree in preorder, renaming accesses, and then filling /// in phi nodes in our successors. 
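///
/// A simplified sketch of the walk (the SkipVisited and RenameAllUses details
/// handled below are omitted):
///
///   rename(BB, IncomingVal):
///     for each MemoryAccess MA in BB, in order:
///       if MA is a use or def: its defining access becomes IncomingVal
///       if MA is a def or phi: IncomingVal = MA
///     for each successor S of BB:
///       add IncomingVal as the incoming value of S's MemoryPhi for this edge
///     for each dominator-tree child C of BB:
///       rename(C, IncomingVal)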
void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal, SmallPtrSetImpl &Visited, bool SkipVisited, bool RenameAllUses) { SmallVector WorkStack; // Skip everything if we already renamed this block and we are skipping. // Note: You can't sink this into the if, because we need it to occur // regardless of whether we skip blocks or not. bool AlreadyVisited = !Visited.insert(Root->getBlock()).second; if (SkipVisited && AlreadyVisited) return; IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses); renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses); WorkStack.push_back({Root, Root->begin(), IncomingVal}); while (!WorkStack.empty()) { DomTreeNode *Node = WorkStack.back().DTN; DomTreeNode::const_iterator ChildIt = WorkStack.back().ChildIt; IncomingVal = WorkStack.back().IncomingVal; if (ChildIt == Node->end()) { WorkStack.pop_back(); } else { DomTreeNode *Child = *ChildIt; ++WorkStack.back().ChildIt; BasicBlock *BB = Child->getBlock(); // Note: You can't sink this into the if, because we need it to occur // regardless of whether we skip blocks or not. AlreadyVisited = !Visited.insert(BB).second; if (SkipVisited && AlreadyVisited) { // We already visited this during our renaming, which can happen when // being asked to rename multiple blocks. Figure out the incoming val, // which is the last def. // Incoming value can only change if there is a block def, and in that // case, it's the last block def in the list. if (auto *BlockDefs = getWritableBlockDefs(BB)) IncomingVal = &*BlockDefs->rbegin(); } else IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses); renameSuccessorPhis(BB, IncomingVal, RenameAllUses); WorkStack.push_back({Child, Child->begin(), IncomingVal}); } } } /// This handles unreachable block accesses by deleting phi nodes in /// unreachable blocks, and marking all other unreachable MemoryAccess's as /// being uses of the live on entry definition. void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) { assert(!DT->isReachableFromEntry(BB) && "Reachable block found while handling unreachable blocks"); // Make sure phi nodes in our reachable successors end up with a // LiveOnEntryDef for our incoming edge, even though our block is forward // unreachable. We could just disconnect these blocks from the CFG fully, // but we do not right now. for (const BasicBlock *S : successors(BB)) { if (!DT->isReachableFromEntry(S)) continue; auto It = PerBlockAccesses.find(S); // Rename the phi nodes in our successor block if (It == PerBlockAccesses.end() || !isa(It->second->front())) continue; AccessList *Accesses = It->second.get(); auto *Phi = cast(&Accesses->front()); Phi->addIncoming(LiveOnEntryDef.get(), BB); } auto It = PerBlockAccesses.find(BB); if (It == PerBlockAccesses.end()) return; auto &Accesses = It->second; for (auto AI = Accesses->begin(), AE = Accesses->end(); AI != AE;) { auto Next = std::next(AI); // If we have a phi, just remove it. We are going to replace all // users with live on entry. 
if (auto *UseOrDef = dyn_cast(AI)) UseOrDef->setDefiningAccess(LiveOnEntryDef.get()); else Accesses->erase(AI); AI = Next; } } MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT) : AA(AA), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr), NextID(0) { buildMemorySSA(); } MemorySSA::~MemorySSA() { // Drop all our references for (const auto &Pair : PerBlockAccesses) for (MemoryAccess &MA : *Pair.second) MA.dropAllReferences(); } MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) { auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr)); if (Res.second) Res.first->second = llvm::make_unique(); return Res.first->second.get(); } MemorySSA::DefsList *MemorySSA::getOrCreateDefsList(const BasicBlock *BB) { auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr)); if (Res.second) Res.first->second = llvm::make_unique(); return Res.first->second.get(); } namespace llvm { /// This class is a batch walker of all MemoryUse's in the program, and points /// their defining access at the thing that actually clobbers them. Because it /// is a batch walker that touches everything, it does not operate like the /// other walkers. This walker is basically performing a top-down SSA renaming /// pass, where the version stack is used as the cache. This enables it to be /// significantly more time and memory efficient than using the regular walker, /// which is walking bottom-up. class MemorySSA::OptimizeUses { public: OptimizeUses(MemorySSA *MSSA, MemorySSAWalker *Walker, AliasAnalysis *AA, DominatorTree *DT) : MSSA(MSSA), Walker(Walker), AA(AA), DT(DT) { Walker = MSSA->getWalker(); } void optimizeUses(); private: /// This represents where a given memorylocation is in the stack. struct MemlocStackInfo { // This essentially is keeping track of versions of the stack. Whenever // the stack changes due to pushes or pops, these versions increase. unsigned long StackEpoch; unsigned long PopEpoch; // This is the lower bound of places on the stack to check. It is equal to // the place the last stack walk ended. // Note: Correctness depends on this being initialized to 0, which densemap // does unsigned long LowerBound; const BasicBlock *LowerBoundBlock; // This is where the last walk for this memory location ended. unsigned long LastKill; bool LastKillValid; Optional AR; }; void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &, SmallVectorImpl &, DenseMap &); MemorySSA *MSSA; MemorySSAWalker *Walker; AliasAnalysis *AA; DominatorTree *DT; }; } // end namespace llvm /// Optimize the uses in a given block This is basically the SSA renaming /// algorithm, with one caveat: We are able to use a single stack for all /// MemoryUses. This is because the set of *possible* reaching MemoryDefs is /// the same for every MemoryUse. The *actual* clobbering MemoryDef is just /// going to be some position in that stack of possible ones. /// /// We track the stack positions that each MemoryLocation needs /// to check, and last ended at. This is because we only want to check the /// things that changed since last time. The same MemoryLocation should /// get clobbered by the same store (getModRefInfo does not use invariantness or /// things like this, and if they start, we can modify MemoryLocOrCall to /// include relevant data) void MemorySSA::OptimizeUses::optimizeUsesInBlock( const BasicBlock *BB, unsigned long &StackEpoch, unsigned long &PopEpoch, SmallVectorImpl &VersionStack, DenseMap &LocStackInfo) { /// If no accesses, nothing to do. 
MemorySSA::AccessList *Accesses = MSSA->getWritableBlockAccesses(BB); if (Accesses == nullptr) return; // Pop everything that doesn't dominate the current block off the stack, // increment the PopEpoch to account for this. while (true) { assert( !VersionStack.empty() && "Version stack should have liveOnEntry sentinel dominating everything"); BasicBlock *BackBlock = VersionStack.back()->getBlock(); if (DT->dominates(BackBlock, BB)) break; while (VersionStack.back()->getBlock() == BackBlock) VersionStack.pop_back(); ++PopEpoch; } for (MemoryAccess &MA : *Accesses) { auto *MU = dyn_cast(&MA); if (!MU) { VersionStack.push_back(&MA); ++StackEpoch; continue; } if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getMemoryInst())) { MU->setDefiningAccess(MSSA->getLiveOnEntryDef(), true, None); continue; } MemoryLocOrCall UseMLOC(MU); auto &LocInfo = LocStackInfo[UseMLOC]; // If the pop epoch changed, it means we've removed stuff from top of // stack due to changing blocks. We may have to reset the lower bound or // last kill info. if (LocInfo.PopEpoch != PopEpoch) { LocInfo.PopEpoch = PopEpoch; LocInfo.StackEpoch = StackEpoch; // If the lower bound was in something that no longer dominates us, we // have to reset it. // We can't simply track stack size, because the stack may have had // pushes/pops in the meantime. // XXX: This is non-optimal, but only is slower cases with heavily // branching dominator trees. To get the optimal number of queries would // be to make lowerbound and lastkill a per-loc stack, and pop it until // the top of that stack dominates us. This does not seem worth it ATM. // A much cheaper optimization would be to always explore the deepest // branch of the dominator tree first. This will guarantee this resets on // the smallest set of blocks. if (LocInfo.LowerBoundBlock && LocInfo.LowerBoundBlock != BB && !DT->dominates(LocInfo.LowerBoundBlock, BB)) { // Reset the lower bound of things to check. // TODO: Some day we should be able to reset to last kill, rather than // 0. LocInfo.LowerBound = 0; LocInfo.LowerBoundBlock = VersionStack[0]->getBlock(); LocInfo.LastKillValid = false; } } else if (LocInfo.StackEpoch != StackEpoch) { // If all that has changed is the StackEpoch, we only have to check the // new things on the stack, because we've checked everything before. In // this case, the lower bound of things to check remains the same. LocInfo.PopEpoch = PopEpoch; LocInfo.StackEpoch = StackEpoch; } if (!LocInfo.LastKillValid) { LocInfo.LastKill = VersionStack.size() - 1; LocInfo.LastKillValid = true; LocInfo.AR = MayAlias; } // At this point, we should have corrected last kill and LowerBound to be // in bounds. assert(LocInfo.LowerBound < VersionStack.size() && "Lower bound out of range"); assert(LocInfo.LastKill < VersionStack.size() && "Last kill info out of range"); // In any case, the new upper bound is the top of the stack. unsigned long UpperBound = VersionStack.size() - 1; if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) { LLVM_DEBUG(dbgs() << "MemorySSA skipping optimization of " << *MU << " (" << *(MU->getMemoryInst()) << ")" << " because there are " << UpperBound - LocInfo.LowerBound << " stores to disambiguate\n"); // Because we did not walk, LastKill is no longer valid, as this may // have been a kill. 
LocInfo.LastKillValid = false; continue; } bool FoundClobberResult = false; while (UpperBound > LocInfo.LowerBound) { if (isa(VersionStack[UpperBound])) { // For phis, use the walker, see where we ended up, go there Instruction *UseInst = MU->getMemoryInst(); MemoryAccess *Result = Walker->getClobberingMemoryAccess(UseInst); // We are guaranteed to find it or something is wrong while (VersionStack[UpperBound] != Result) { assert(UpperBound != 0); --UpperBound; } FoundClobberResult = true; break; } MemoryDef *MD = cast(VersionStack[UpperBound]); // If the lifetime of the pointer ends at this instruction, it's live on // entry. if (!UseMLOC.IsCall && lifetimeEndsAt(MD, UseMLOC.getLoc(), *AA)) { // Reset UpperBound to liveOnEntryDef's place in the stack UpperBound = 0; FoundClobberResult = true; LocInfo.AR = MustAlias; break; } ClobberAlias CA = instructionClobbersQuery(MD, MU, UseMLOC, *AA); if (CA.IsClobber) { FoundClobberResult = true; LocInfo.AR = CA.AR; break; } --UpperBound; } // Note: Phis always have AliasResult AR set to MayAlias ATM. // At the end of this loop, UpperBound is either a clobber, or lower bound // PHI walking may cause it to be < LowerBound, and in fact, < LastKill. if (FoundClobberResult || UpperBound < LocInfo.LastKill) { // We were last killed now by where we got to if (MSSA->isLiveOnEntryDef(VersionStack[UpperBound])) LocInfo.AR = None; MU->setDefiningAccess(VersionStack[UpperBound], true, LocInfo.AR); LocInfo.LastKill = UpperBound; } else { // Otherwise, we checked all the new ones, and now we know we can get to // LastKill. MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true, LocInfo.AR); } LocInfo.LowerBound = VersionStack.size() - 1; LocInfo.LowerBoundBlock = BB; } } /// Optimize uses to point to their actual clobbering definitions. void MemorySSA::OptimizeUses::optimizeUses() { SmallVector VersionStack; DenseMap LocStackInfo; VersionStack.push_back(MSSA->getLiveOnEntryDef()); unsigned long StackEpoch = 1; unsigned long PopEpoch = 1; // We perform a non-recursive top-down dominator tree walk. for (const auto *DomNode : depth_first(DT->getRootNode())) optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack, LocStackInfo); } void MemorySSA::placePHINodes( const SmallPtrSetImpl &DefiningBlocks) { // Determine where our MemoryPhi's should go ForwardIDFCalculator IDFs(*DT); IDFs.setDefiningBlocks(DefiningBlocks); SmallVector IDFBlocks; IDFs.calculate(IDFBlocks); // Now place MemoryPhi nodes. for (auto &BB : IDFBlocks) createMemoryPhi(BB); } void MemorySSA::buildMemorySSA() { // We create an access to represent "live on entry", for things like // arguments or users of globals, where the memory they use is defined before // the beginning of the function. We do not actually insert it into the IR. // We do not define a live on exit for the immediate uses, and thus our // semantics do *not* imply that something with no immediate uses can simply // be removed. BasicBlock &StartingPoint = F.getEntryBlock(); LiveOnEntryDef.reset(new MemoryDef(F.getContext(), nullptr, nullptr, &StartingPoint, NextID++)); // We maintain lists of memory accesses per-block, trading memory for time. We // could just look up the memory access for every possible instruction in the // stream. SmallPtrSet DefiningBlocks; // Go through each block, figure out where defs occur, and chain together all // the accesses. 
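  //
  // A rough illustration (hypothetical IR): in a block such as
  //
  //   store i32 1, i32* %p          ; becomes a MemoryDef
  //   %v = load i32, i32* %p        ; becomes a MemoryUse
  //   call void @llvm.assume(i1 %c) ; gets no access at all (see
  //                                 ; createNewAccess below)
  //
  // both accesses end up on the block's access list, but only the store is
  // also added to the block's defs list.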
for (BasicBlock &B : F) { bool InsertIntoDef = false; AccessList *Accesses = nullptr; DefsList *Defs = nullptr; for (Instruction &I : B) { MemoryUseOrDef *MUD = createNewAccess(&I); if (!MUD) continue; if (!Accesses) Accesses = getOrCreateAccessList(&B); Accesses->push_back(MUD); if (isa(MUD)) { InsertIntoDef = true; if (!Defs) Defs = getOrCreateDefsList(&B); Defs->push_back(*MUD); } } if (InsertIntoDef) DefiningBlocks.insert(&B); } placePHINodes(DefiningBlocks); // Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get // filled in with all blocks. SmallPtrSet Visited; renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited); CachingWalker *Walker = getWalkerImpl(); OptimizeUses(this, Walker, AA, DT).optimizeUses(); // Mark the uses in unreachable blocks as live on entry, so that they go // somewhere. for (auto &BB : F) if (!Visited.count(&BB)) markUnreachableAsLiveOnEntry(&BB); } MemorySSAWalker *MemorySSA::getWalker() { return getWalkerImpl(); } MemorySSA::CachingWalker *MemorySSA::getWalkerImpl() { if (Walker) return Walker.get(); Walker = llvm::make_unique(this, AA, DT); return Walker.get(); } // This is a helper function used by the creation routines. It places NewAccess // into the access and defs lists for a given basic block, at the given // insertion point. void MemorySSA::insertIntoListsForBlock(MemoryAccess *NewAccess, const BasicBlock *BB, InsertionPlace Point) { auto *Accesses = getOrCreateAccessList(BB); if (Point == Beginning) { // If it's a phi node, it goes first, otherwise, it goes after any phi // nodes. if (isa(NewAccess)) { Accesses->push_front(NewAccess); auto *Defs = getOrCreateDefsList(BB); Defs->push_front(*NewAccess); } else { auto AI = find_if_not( *Accesses, [](const MemoryAccess &MA) { return isa(MA); }); Accesses->insert(AI, NewAccess); if (!isa(NewAccess)) { auto *Defs = getOrCreateDefsList(BB); auto DI = find_if_not( *Defs, [](const MemoryAccess &MA) { return isa(MA); }); Defs->insert(DI, *NewAccess); } } } else { Accesses->push_back(NewAccess); if (!isa(NewAccess)) { auto *Defs = getOrCreateDefsList(BB); Defs->push_back(*NewAccess); } } BlockNumberingValid.erase(BB); } void MemorySSA::insertIntoListsBefore(MemoryAccess *What, const BasicBlock *BB, AccessList::iterator InsertPt) { auto *Accesses = getWritableBlockAccesses(BB); bool WasEnd = InsertPt == Accesses->end(); Accesses->insert(AccessList::iterator(InsertPt), What); if (!isa(What)) { auto *Defs = getOrCreateDefsList(BB); // If we got asked to insert at the end, we have an easy job, just shove it // at the end. If we got asked to insert before an existing def, we also get // an iterator. If we got asked to insert before a use, we have to hunt for // the next def. if (WasEnd) { Defs->push_back(*What); } else if (isa(InsertPt)) { Defs->insert(InsertPt->getDefsIterator(), *What); } else { while (InsertPt != Accesses->end() && !isa(InsertPt)) ++InsertPt; // Either we found a def, or we are inserting at the end if (InsertPt == Accesses->end()) Defs->push_back(*What); else Defs->insert(InsertPt->getDefsIterator(), *What); } } BlockNumberingValid.erase(BB); } // Move What before Where in the IR. The end result is that What will belong to // the right lists and have the right Block set, but will not otherwise be // correct. It will not have the right defining access, and if it is a def, // things below it will not properly be updated. 
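//
// A hedged sketch of the expected calling pattern (hypothetical code; in-tree
// callers go through MemorySSAUpdater, which performs the remaining fixups):
//
//   // MA is a MemoryUseOrDef being moved to the front of DestBB.
//   MSSA->moveTo(MA, DestBB, MemorySSA::Beginning);
//   MA->setDefiningAccess(NewDefiningAccess); // caller must repair this edge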
void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB, AccessList::iterator Where) { // Keep it in the lookup tables, remove from the lists removeFromLists(What, false); What->setBlock(BB); insertIntoListsBefore(What, BB, Where); } void MemorySSA::moveTo(MemoryAccess *What, BasicBlock *BB, InsertionPlace Point) { if (isa(What)) { assert(Point == Beginning && "Can only move a Phi at the beginning of the block"); // Update lookup table entry ValueToMemoryAccess.erase(What->getBlock()); bool Inserted = ValueToMemoryAccess.insert({BB, What}).second; (void)Inserted; assert(Inserted && "Cannot move a Phi to a block that already has one"); } removeFromLists(What, false); What->setBlock(BB); insertIntoListsForBlock(What, BB, Point); } MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) { assert(!getMemoryAccess(BB) && "MemoryPhi already exists for this BB"); MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++); // Phi's always are placed at the front of the block. insertIntoListsForBlock(Phi, BB, Beginning); ValueToMemoryAccess[BB] = Phi; return Phi; } MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I, MemoryAccess *Definition) { assert(!isa(I) && "Cannot create a defined access for a PHI"); MemoryUseOrDef *NewAccess = createNewAccess(I); assert( NewAccess != nullptr && "Tried to create a memory access for a non-memory touching instruction"); NewAccess->setDefiningAccess(Definition); return NewAccess; } // Return true if the instruction has ordering constraints. // Note specifically that this only considers stores and loads // because others are still considered ModRef by getModRefInfo. static inline bool isOrdered(const Instruction *I) { if (auto *SI = dyn_cast(I)) { if (!SI->isUnordered()) return true; } else if (auto *LI = dyn_cast(I)) { if (!LI->isUnordered()) return true; } return false; } /// Helper function to create new memory accesses MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) { // The assume intrinsic has a control dependency which we model by claiming // that it writes arbitrarily. Ignore that fake memory dependency here. // FIXME: Replace this special casing with a more accurate modelling of // assume's control dependency. if (IntrinsicInst *II = dyn_cast(I)) if (II->getIntrinsicID() == Intrinsic::assume) return nullptr; // Find out what affect this instruction has on memory. ModRefInfo ModRef = AA->getModRefInfo(I, None); // The isOrdered check is used to ensure that volatiles end up as defs // (atomics end up as ModRef right now anyway). Until we separate the // ordering chain from the memory chain, this enables people to see at least // some relative ordering to volatiles. Note that getClobberingMemoryAccess // will still give an answer that bypasses other volatile loads. TODO: // Separate memory aliasing and ordering into two different chains so that we // can precisely represent both "what memory will this read/write/is clobbered // by" and "what instructions can I move this past". bool Def = isModSet(ModRef) || isOrdered(I); bool Use = isRefSet(ModRef); // It's possible for an instruction to not modify memory at all. During // construction, we ignore them. if (!Def && !Use) return nullptr; MemoryUseOrDef *MUD; if (Def) MUD = new MemoryDef(I->getContext(), nullptr, I, I->getParent(), NextID++); else MUD = new MemoryUse(I->getContext(), nullptr, I, I->getParent()); ValueToMemoryAccess[I] = MUD; return MUD; } /// Returns true if \p Replacer dominates \p Replacee . 
bool MemorySSA::dominatesUse(const MemoryAccess *Replacer, const MemoryAccess *Replacee) const { if (isa(Replacee)) return DT->dominates(Replacer->getBlock(), Replacee->getBlock()); const auto *MP = cast(Replacee); // For a phi node, the use occurs in the predecessor block of the phi node. // Since we may occur multiple times in the phi node, we have to check each // operand to ensure Replacer dominates each operand where Replacee occurs. for (const Use &Arg : MP->operands()) { if (Arg.get() != Replacee && !DT->dominates(Replacer->getBlock(), MP->getIncomingBlock(Arg))) return false; } return true; } /// Properly remove \p MA from all of MemorySSA's lookup tables. void MemorySSA::removeFromLookups(MemoryAccess *MA) { assert(MA->use_empty() && "Trying to remove memory access that still has uses"); BlockNumbering.erase(MA); if (auto *MUD = dyn_cast(MA)) MUD->setDefiningAccess(nullptr); // Invalidate our walker's cache if necessary if (!isa(MA)) Walker->invalidateInfo(MA); Value *MemoryInst; if (const auto *MUD = dyn_cast(MA)) MemoryInst = MUD->getMemoryInst(); else MemoryInst = MA->getBlock(); auto VMA = ValueToMemoryAccess.find(MemoryInst); if (VMA->second == MA) ValueToMemoryAccess.erase(VMA); } /// Properly remove \p MA from all of MemorySSA's lists. /// /// Because of the way the intrusive list and use lists work, it is important to /// do removal in the right order. /// ShouldDelete defaults to true, and will cause the memory access to also be /// deleted, not just removed. void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) { BasicBlock *BB = MA->getBlock(); // The access list owns the reference, so we erase it from the non-owning list // first. if (!isa(MA)) { auto DefsIt = PerBlockDefs.find(BB); std::unique_ptr &Defs = DefsIt->second; Defs->remove(*MA); if (Defs->empty()) PerBlockDefs.erase(DefsIt); } // The erase call here will delete it. If we don't want it deleted, we call // remove instead. auto AccessIt = PerBlockAccesses.find(BB); std::unique_ptr &Accesses = AccessIt->second; if (ShouldDelete) Accesses->erase(MA); else Accesses->remove(MA); if (Accesses->empty()) { PerBlockAccesses.erase(AccessIt); BlockNumberingValid.erase(BB); } } void MemorySSA::print(raw_ostream &OS) const { MemorySSAAnnotatedWriter Writer(this); F.print(OS, &Writer); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void MemorySSA::dump() const { print(dbgs()); } #endif void MemorySSA::verifyMemorySSA() const { verifyDefUses(F); verifyDomination(F); verifyOrdering(F); verifyDominationNumbers(F); Walker->verify(this); } /// Verify that all of the blocks we believe to have valid domination numbers /// actually have valid domination numbers. void MemorySSA::verifyDominationNumbers(const Function &F) const { #ifndef NDEBUG if (BlockNumberingValid.empty()) return; SmallPtrSet ValidBlocks = BlockNumberingValid; for (const BasicBlock &BB : F) { if (!ValidBlocks.count(&BB)) continue; ValidBlocks.erase(&BB); const AccessList *Accesses = getBlockAccesses(&BB); // It's correct to say an empty block has valid numbering. if (!Accesses) continue; // Block numbering starts at 1. 
unsigned long LastNumber = 0; for (const MemoryAccess &MA : *Accesses) { auto ThisNumberIter = BlockNumbering.find(&MA); assert(ThisNumberIter != BlockNumbering.end() && "MemoryAccess has no domination number in a valid block!"); unsigned long ThisNumber = ThisNumberIter->second; assert(ThisNumber > LastNumber && "Domination numbers should be strictly increasing!"); LastNumber = ThisNumber; } } assert(ValidBlocks.empty() && "All valid BasicBlocks should exist in F -- dangling pointers?"); #endif } /// Verify that the order and existence of MemoryAccesses matches the /// order and existence of memory affecting instructions. void MemorySSA::verifyOrdering(Function &F) const { // Walk all the blocks, comparing what the lookups think and what the access // lists think, as well as the order in the blocks vs the order in the access // lists. SmallVector ActualAccesses; SmallVector ActualDefs; for (BasicBlock &B : F) { const AccessList *AL = getBlockAccesses(&B); const auto *DL = getBlockDefs(&B); MemoryAccess *Phi = getMemoryAccess(&B); if (Phi) { ActualAccesses.push_back(Phi); ActualDefs.push_back(Phi); } for (Instruction &I : B) { MemoryAccess *MA = getMemoryAccess(&I); assert((!MA || (AL && (isa(MA) || DL))) && "We have memory affecting instructions " "in this block but they are not in the " "access list or defs list"); if (MA) { ActualAccesses.push_back(MA); if (isa(MA)) ActualDefs.push_back(MA); } } // Either we hit the assert, really have no accesses, or we have both // accesses and an access list. // Same with defs. if (!AL && !DL) continue; assert(AL->size() == ActualAccesses.size() && "We don't have the same number of accesses in the block as on the " "access list"); assert((DL || ActualDefs.size() == 0) && "Either we should have a defs list, or we should have no defs"); assert((!DL || DL->size() == ActualDefs.size()) && "We don't have the same number of defs in the block as on the " "def list"); auto ALI = AL->begin(); auto AAI = ActualAccesses.begin(); while (ALI != AL->end() && AAI != ActualAccesses.end()) { assert(&*ALI == *AAI && "Not the same accesses in the same order"); ++ALI; ++AAI; } ActualAccesses.clear(); if (DL) { auto DLI = DL->begin(); auto ADI = ActualDefs.begin(); while (DLI != DL->end() && ADI != ActualDefs.end()) { assert(&*DLI == *ADI && "Not the same defs in the same order"); ++DLI; ++ADI; } } ActualDefs.clear(); } } /// Verify the domination properties of MemorySSA by checking that each /// definition dominates all of its uses. void MemorySSA::verifyDomination(Function &F) const { #ifndef NDEBUG for (BasicBlock &B : F) { // Phi nodes are attached to basic blocks if (MemoryPhi *MP = getMemoryAccess(&B)) for (const Use &U : MP->uses()) assert(dominates(MP, U) && "Memory PHI does not dominate it's uses"); for (Instruction &I : B) { MemoryAccess *MD = dyn_cast_or_null(getMemoryAccess(&I)); if (!MD) continue; for (const Use &U : MD->uses()) assert(dominates(MD, U) && "Memory Def does not dominate it's uses"); } } #endif } /// Verify the def-use lists in MemorySSA, by verifying that \p Use /// appears in the use list of \p Def. 
void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const { #ifndef NDEBUG // The live on entry use may cause us to get a NULL def here if (!Def) assert(isLiveOnEntryDef(Use) && "Null def but use not point to live on entry def"); else assert(is_contained(Def->users(), Use) && "Did not find use in def's use list"); #endif } /// Verify the immediate use information, by walking all the memory /// accesses and verifying that, for each use, it appears in the /// appropriate def's use list void MemorySSA::verifyDefUses(Function &F) const { for (BasicBlock &B : F) { // Phi nodes are attached to basic blocks if (MemoryPhi *Phi = getMemoryAccess(&B)) { assert(Phi->getNumOperands() == static_cast(std::distance( pred_begin(&B), pred_end(&B))) && "Incomplete MemoryPhi Node"); for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { verifyUseInDefs(Phi->getIncomingValue(I), Phi); assert(find(predecessors(&B), Phi->getIncomingBlock(I)) != pred_end(&B) && "Incoming phi block not a block predecessor"); } } for (Instruction &I : B) { if (MemoryUseOrDef *MA = getMemoryAccess(&I)) { verifyUseInDefs(MA->getDefiningAccess(), MA); } } } } MemoryUseOrDef *MemorySSA::getMemoryAccess(const Instruction *I) const { return cast_or_null(ValueToMemoryAccess.lookup(I)); } MemoryPhi *MemorySSA::getMemoryAccess(const BasicBlock *BB) const { return cast_or_null(ValueToMemoryAccess.lookup(cast(BB))); } /// Perform a local numbering on blocks so that instruction ordering can be /// determined in constant time. /// TODO: We currently just number in order. If we numbered by N, we could /// allow at least N-1 sequences of insertBefore or insertAfter (and at least /// log2(N) sequences of mixed before and after) without needing to invalidate /// the numbering. void MemorySSA::renumberBlock(const BasicBlock *B) const { // The pre-increment ensures the numbers really start at 1. unsigned long CurrentNumber = 0; const AccessList *AL = getBlockAccesses(B); assert(AL != nullptr && "Asking to renumber an empty block"); for (const auto &I : *AL) BlockNumbering[&I] = ++CurrentNumber; BlockNumberingValid.insert(B); } /// Determine, for two memory accesses in the same block, /// whether \p Dominator dominates \p Dominatee. /// \returns True if \p Dominator dominates \p Dominatee. bool MemorySSA::locallyDominates(const MemoryAccess *Dominator, const MemoryAccess *Dominatee) const { const BasicBlock *DominatorBlock = Dominator->getBlock(); assert((DominatorBlock == Dominatee->getBlock()) && "Asking for local domination when accesses are in different blocks!"); // A node dominates itself. if (Dominatee == Dominator) return true; // When Dominatee is defined on function entry, it is not dominated by another // memory access. if (isLiveOnEntryDef(Dominatee)) return false; // When Dominator is defined on function entry, it dominates the other memory // access. 
if (isLiveOnEntryDef(Dominator)) return true; if (!BlockNumberingValid.count(DominatorBlock)) renumberBlock(DominatorBlock); unsigned long DominatorNum = BlockNumbering.lookup(Dominator); // All numbers start with 1 assert(DominatorNum != 0 && "Block was not numbered properly"); unsigned long DominateeNum = BlockNumbering.lookup(Dominatee); assert(DominateeNum != 0 && "Block was not numbered properly"); return DominatorNum < DominateeNum; } bool MemorySSA::dominates(const MemoryAccess *Dominator, const MemoryAccess *Dominatee) const { if (Dominator == Dominatee) return true; if (isLiveOnEntryDef(Dominatee)) return false; if (Dominator->getBlock() != Dominatee->getBlock()) return DT->dominates(Dominator->getBlock(), Dominatee->getBlock()); return locallyDominates(Dominator, Dominatee); } bool MemorySSA::dominates(const MemoryAccess *Dominator, const Use &Dominatee) const { if (MemoryPhi *MP = dyn_cast(Dominatee.getUser())) { BasicBlock *UseBB = MP->getIncomingBlock(Dominatee); // The def must dominate the incoming block of the phi. if (UseBB != Dominator->getBlock()) return DT->dominates(Dominator->getBlock(), UseBB); // If the UseBB and the DefBB are the same, compare locally. return locallyDominates(Dominator, cast(Dominatee)); } // If it's not a PHI node use, the normal dominates can already handle it. return dominates(Dominator, cast(Dominatee.getUser())); } const static char LiveOnEntryStr[] = "liveOnEntry"; void MemoryAccess::print(raw_ostream &OS) const { switch (getValueID()) { case MemoryPhiVal: return static_cast(this)->print(OS); case MemoryDefVal: return static_cast(this)->print(OS); case MemoryUseVal: return static_cast(this)->print(OS); } llvm_unreachable("invalid value id"); } void MemoryDef::print(raw_ostream &OS) const { MemoryAccess *UO = getDefiningAccess(); auto printID = [&OS](MemoryAccess *A) { if (A && A->getID()) OS << A->getID(); else OS << LiveOnEntryStr; }; OS << getID() << " = MemoryDef("; printID(UO); OS << ")"; if (isOptimized()) { OS << "->"; printID(getOptimized()); if (Optional AR = getOptimizedAccessType()) OS << " " << *AR; } } void MemoryPhi::print(raw_ostream &OS) const { bool First = true; OS << getID() << " = MemoryPhi("; for (const auto &Op : operands()) { BasicBlock *BB = getIncomingBlock(Op); MemoryAccess *MA = cast(Op); if (!First) OS << ','; else First = false; OS << '{'; if (BB->hasName()) OS << BB->getName(); else BB->printAsOperand(OS, false); OS << ','; if (unsigned ID = MA->getID()) OS << ID; else OS << LiveOnEntryStr; OS << '}'; } OS << ')'; } void MemoryUse::print(raw_ostream &OS) const { MemoryAccess *UO = getDefiningAccess(); OS << "MemoryUse("; if (UO && UO->getID()) OS << UO->getID(); else OS << LiveOnEntryStr; OS << ')'; if (Optional AR = getOptimizedAccessType()) OS << " " << *AR; } void MemoryAccess::dump() const { // Cannot completely remove virtual function even in release mode. 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) print(dbgs()); dbgs() << "\n"; #endif } char MemorySSAPrinterLegacyPass::ID = 0; MemorySSAPrinterLegacyPass::MemorySSAPrinterLegacyPass() : FunctionPass(ID) { initializeMemorySSAPrinterLegacyPassPass(*PassRegistry::getPassRegistry()); } void MemorySSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); } bool MemorySSAPrinterLegacyPass::runOnFunction(Function &F) { auto &MSSA = getAnalysis().getMSSA(); MSSA.print(dbgs()); if (VerifyMemorySSA) MSSA.verifyMemorySSA(); return false; } AnalysisKey MemorySSAAnalysis::Key; MemorySSAAnalysis::Result MemorySSAAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &AA = AM.getResult(F); return MemorySSAAnalysis::Result(llvm::make_unique(F, &AA, &DT)); } PreservedAnalyses MemorySSAPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { OS << "MemorySSA for function: " << F.getName() << "\n"; AM.getResult(F).getMSSA().print(OS); return PreservedAnalyses::all(); } PreservedAnalyses MemorySSAVerifierPass::run(Function &F, FunctionAnalysisManager &AM) { AM.getResult(F).getMSSA().verifyMemorySSA(); return PreservedAnalyses::all(); } char MemorySSAWrapperPass::ID = 0; MemorySSAWrapperPass::MemorySSAWrapperPass() : FunctionPass(ID) { initializeMemorySSAWrapperPassPass(*PassRegistry::getPassRegistry()); } void MemorySSAWrapperPass::releaseMemory() { MSSA.reset(); } void MemorySSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); } bool MemorySSAWrapperPass::runOnFunction(Function &F) { auto &DT = getAnalysis().getDomTree(); auto &AA = getAnalysis().getAAResults(); MSSA.reset(new MemorySSA(F, &AA, &DT)); return false; } void MemorySSAWrapperPass::verifyAnalysis() const { MSSA->verifyMemorySSA(); } void MemorySSAWrapperPass::print(raw_ostream &OS, const Module *M) const { MSSA->print(OS); } MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {} MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A, DominatorTree *D) : MemorySSAWalker(M), Walker(*M, *A, *D) {} void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) { if (auto *MUD = dyn_cast(MA)) MUD->resetOptimized(); } /// Walk the use-def chains starting at \p MA and find /// the MemoryAccess that actually clobbers Loc. /// /// \returns our clobbering memory access MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess( MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) { return Walker.findClobber(StartingAccess, Q); } MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess( MemoryAccess *StartingAccess, const MemoryLocation &Loc) { if (isa(StartingAccess)) return StartingAccess; auto *StartingUseOrDef = cast(StartingAccess); if (MSSA->isLiveOnEntryDef(StartingUseOrDef)) return StartingUseOrDef; Instruction *I = StartingUseOrDef->getMemoryInst(); // Conservatively, fences are always clobbers, so don't perform the walk if we // hit a fence. if (!ImmutableCallSite(I) && I->isFenceLike()) return StartingUseOrDef; UpwardsMemoryQuery Q; Q.OriginalAccess = StartingUseOrDef; Q.StartingLoc = Loc; Q.Inst = I; Q.IsCall = false; // Unlike the other function, do not walk to the def of a def, because we are // handed something we already believe is the clobbering access. MemoryAccess *DefiningAccess = isa(StartingUseOrDef) ? 
StartingUseOrDef->getDefiningAccess() : StartingUseOrDef; MemoryAccess *Clobber = getClobberingMemoryAccess(DefiningAccess, Q); LLVM_DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is "); LLVM_DEBUG(dbgs() << *StartingUseOrDef << "\n"); LLVM_DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is "); LLVM_DEBUG(dbgs() << *Clobber << "\n"); return Clobber; } MemoryAccess * MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) { auto *StartingAccess = dyn_cast(MA); // If this is a MemoryPhi, we can't do anything. if (!StartingAccess) return MA; // If this is an already optimized use or def, return the optimized result. // Note: Currently, we store the optimized def result in a separate field, // since we can't use the defining access. if (StartingAccess->isOptimized()) return StartingAccess->getOptimized(); const Instruction *I = StartingAccess->getMemoryInst(); UpwardsMemoryQuery Q(I, StartingAccess); // We can't sanely do anything with a fence, since they conservatively clobber // all memory, and have no locations to get pointers from to try to // disambiguate. if (!Q.IsCall && I->isFenceLike()) return StartingAccess; if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) { MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef(); StartingAccess->setOptimized(LiveOnEntry); StartingAccess->setOptimizedAccessType(None); return LiveOnEntry; } // Start with the thing we already think clobbers this location MemoryAccess *DefiningAccess = StartingAccess->getDefiningAccess(); // At this point, DefiningAccess may be the live on entry def. // If it is, we will not get a better result. if (MSSA->isLiveOnEntryDef(DefiningAccess)) { StartingAccess->setOptimized(DefiningAccess); StartingAccess->setOptimizedAccessType(None); return DefiningAccess; } MemoryAccess *Result = getClobberingMemoryAccess(DefiningAccess, Q); LLVM_DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is "); LLVM_DEBUG(dbgs() << *DefiningAccess << "\n"); LLVM_DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is "); LLVM_DEBUG(dbgs() << *Result << "\n"); StartingAccess->setOptimized(Result); if (MSSA->isLiveOnEntryDef(Result)) StartingAccess->setOptimizedAccessType(None); else if (Q.AR == MustAlias) StartingAccess->setOptimizedAccessType(MustAlias); return Result; } MemoryAccess * DoNothingMemorySSAWalker::getClobberingMemoryAccess(MemoryAccess *MA) { if (auto *Use = dyn_cast(MA)) return Use->getDefiningAccess(); return MA; } MemoryAccess *DoNothingMemorySSAWalker::getClobberingMemoryAccess( MemoryAccess *StartingAccess, const MemoryLocation &) { if (auto *Use = dyn_cast(StartingAccess)) return Use->getDefiningAccess(); return StartingAccess; } void MemoryPhi::deleteMe(DerivedUser *Self) { delete static_cast(Self); } void MemoryDef::deleteMe(DerivedUser *Self) { delete static_cast(Self); } void MemoryUse::deleteMe(DerivedUser *Self) { delete static_cast(Self); } Index: vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp (revision 338000) @@ -1,2158 +1,2163 @@ //===-------- LegalizeFloatTypes.cpp - Legalization of float types --------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// // // This file implements float type expansion and softening for LegalizeTypes. // Softening is the act of turning a computation in an illegal floating point // type into a computation in an integer type of the same size; also known as // "soft float". For example, turning f32 arithmetic into operations using i32. // The resulting integer value is the same as what you would get by performing // the floating point operation and bitcasting the result to the integer type. // Expansion is the act of changing a computation in an illegal type to be a // computation in two identical registers of a smaller type. For example, // implementing ppcf128 arithmetic in two f64 registers. // //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" /// GetFPLibCall - Return the right libcall for the given floating point type. static RTLIB::Libcall GetFPLibCall(EVT VT, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { return VT == MVT::f32 ? Call_F32 : VT == MVT::f64 ? Call_F64 : VT == MVT::f80 ? Call_F80 : VT == MVT::f128 ? Call_F128 : VT == MVT::ppcf128 ? Call_PPCF128 : RTLIB::UNKNOWN_LIBCALL; } //===----------------------------------------------------------------------===// // Convert Float Results to Integer for Non-HW-supported Operations. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "SoftenFloatResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to soften the result of this operator!"); case ISD::Register: case ISD::CopyFromReg: case ISD::CopyToReg: assert(isLegalInHWReg(N->getValueType(ResNo)) && "Unsupported SoftenFloatRes opcode!"); // Only when isLegalInHWReg, we can skip check of the operands. 
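The file header above defines softening as carrying an illegal FP value in an integer register of the same width, with the integer bits required to match a bitcast of the hardware FP result. A minimal standalone illustration of that contract for f32/i32, in plain C++; bitsOf and floatOf are illustrative helpers, not LLVM utilities.

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret f32 bits as i32 and back; this is the only bridge softening
// needs between FP values and the integer registers it actually uses.
static uint32_t bitsOf(float F) { uint32_t B; std::memcpy(&B, &F, sizeof(B)); return B; }
static float floatOf(uint32_t B) { float F; std::memcpy(&F, &B, sizeof(F)); return F; }

int main() {
  float A = 1.5f, B = 2.25f;

  // Softened operands: the legalizer carries A and B as i32 bit patterns.
  uint32_t SoftA = bitsOf(A), SoftB = bitsOf(B);

  // A soft-float add conceptually rebuilds the values, adds, and hands back
  // the bit pattern of the sum (real libcalls such as __addsf3 compute the
  // same result purely in integer arithmetic).
  uint32_t SoftSum = bitsOf(floatOf(SoftA) + floatOf(SoftB));

  // The property the header comment states: the softened i32 result equals
  // the hardware f32 result bitcast to i32.
  assert(SoftSum == bitsOf(A + B));
  return 0;
}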
R = SDValue(N, ResNo); break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break; case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break; case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break; case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; case ISD::FEXP2: R = SoftenFloatRes_FEXP2(N); break; case ISD::FFLOOR: R = SoftenFloatRes_FFLOOR(N); break; case ISD::FLOG: R = SoftenFloatRes_FLOG(N); break; case ISD::FLOG2: R = SoftenFloatRes_FLOG2(N); break; case ISD::FLOG10: R = SoftenFloatRes_FLOG10(N); break; case ISD::FMA: R = SoftenFloatRes_FMA(N); break; case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break; case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break; case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break; case ISD::FREM: R = SoftenFloatRes_FREM(N); break; case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break; case ISD::FROUND: R = SoftenFloatRes_FROUND(N); break; case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break; case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break; case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break; case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break; } if (R.getNode() && R.getNode() != N) { SetSoftenedFloat(SDValue(N, ResNo), R); // Return true only if the node is changed, assuming that the operands // are also converted when necessary. return true; } // Otherwise, return false to tell caller to scan operands. return false; } SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); return BitConvertToInteger(N->getOperand(0)); } SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); return BitConvertToInteger(Op); } SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) { // Convert the inputs to integers, and build a new pair out of them. return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), BitConvertToInteger(N->getOperand(0)), BitConvertToInteger(N->getOperand(1))); } SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { // When LegalInHWReg, we can load better from the constant pool. 
if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); ConstantFPSDNode *CN = cast(N); // In ppcf128, the high 64 bits are always first in memory regardless // of Endianness. LLVM's APFloat representation is not Endian sensitive, // and so always converts into a 128-bit APInt in a non-Endian-sensitive // way. However, APInt's are serialized in an Endian-sensitive fashion, // so on big-Endian targets, the two doubles are output in the wrong // order. Fix this by manually flipping the order of the high 64 bits // and the low 64 bits here. if (DAG.getDataLayout().isBigEndian() && CN->getValueType(0).getSimpleVT() == llvm::MVT::ppcf128) { uint64_t words[2] = { CN->getValueAPF().bitcastToAPInt().getRawData()[1], CN->getValueAPF().bitcastToAPInt().getRawData()[0] }; APInt Val(128, words); return DAG.getConstant(Val, SDLoc(CN), TLI.getTypeToTransformTo(*DAG.getContext(), CN->getValueType(0))); } else { return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN), TLI.getTypeToTransformTo(*DAG.getContext(), CN->getValueType(0))); } } SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) { // When LegalInHWReg, keep the extracted value in register. if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NewOp.getValueType().getVectorElementType(), NewOp, N->getOperand(1)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) { // When LegalInHWReg, FABS can be implemented as native bitwise operations. if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned Size = NVT.getSizeInBits(); // Mask = ~(1 << (Size-1)) APInt API = APInt::getAllOnesValue(Size); API.clearBit(Size - 1); SDValue Mask = DAG.getConstant(API, SDLoc(N), NVT); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return DAG.getNode(ISD::AND, SDLoc(N), NVT, Op, Mask); } SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMIN_F32, RTLIB::FMIN_F64, RTLIB::FMIN_F80, RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMAX_F32, RTLIB::FMAX_F64, RTLIB::FMAX_F80, RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80, RTLIB::ADD_F128, RTLIB::ADD_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), 
RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations. if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(0)); SDValue RHS = BitConvertToInteger(N->getOperand(1)); SDLoc dl(N); EVT LVT = LHS.getValueType(); EVT RVT = RHS.getValueType(); unsigned LSize = LVT.getSizeInBits(); unsigned RSize = RVT.getSizeInBits(); // First get the sign bit of second operand. SDValue SignBit = DAG.getNode( ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT), DAG.getConstant(RSize - 1, dl, TLI.getShiftAmountTy(RVT, DAG.getDataLayout()))); SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit); // Shift right or sign-extend it if the two operands have different types. int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits(); if (SizeDiff > 0) { SignBit = DAG.getNode(ISD::SRL, dl, RVT, SignBit, DAG.getConstant(SizeDiff, dl, TLI.getShiftAmountTy(SignBit.getValueType(), DAG.getDataLayout()))); SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit); } else if (SizeDiff < 0) { SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit); SignBit = DAG.getNode(ISD::SHL, dl, LVT, SignBit, DAG.getConstant(-SizeDiff, dl, TLI.getShiftAmountTy(SignBit.getValueType(), DAG.getDataLayout()))); } // Clear the sign bit of the first operand. SDValue Mask = DAG.getNode( ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT), DAG.getConstant(LSize - 1, dl, TLI.getShiftAmountTy(LVT, DAG.getDataLayout()))); Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT)); LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask); // Or the value with the sign bit. 
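SoftenFloatRes_FABS and SoftenFloatRes_FCOPYSIGN above turn into plain integer bit manipulation once the value lives in an integer register: fabs clears the top (sign) bit with an AND mask of ~(1 << (Size-1)), and copysign ORs the sign bit of one operand onto the masked magnitude of the other. A standalone sketch of both tricks for 32-bit floats, in plain C++; it only covers the same-width case, whereas the shift/extend code above also handles operands of different sizes.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) { uint32_t B; std::memcpy(&B, &F, 4); return B; }
static float floatOf(uint32_t B) { float F; std::memcpy(&F, &B, 4); return F; }

// fabs: AND with all-ones minus the sign bit, i.e. ~(1 << 31) for binary32.
static float softFabs(float X) {
  uint32_t Mask = ~(UINT32_C(1) << 31);
  return floatOf(bitsOf(X) & Mask);
}

// copysign: keep the magnitude bits of Mag, splice in the sign bit of Sign.
static float softCopysign(float Mag, float Sign) {
  uint32_t SignBit = bitsOf(Sign) & (UINT32_C(1) << 31);
  uint32_t MagBits = bitsOf(Mag) & ~(UINT32_C(1) << 31);
  return floatOf(MagBits | SignBit);
}

int main() {
  assert(softFabs(-3.5f) == std::fabs(-3.5f));
  assert(softCopysign(2.0f, -1.0f) == std::copysign(2.0f, -1.0f));
  assert(softCopysign(-2.0f, 1.0f) == 2.0f);
  return 0;
}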
return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit); } SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)), GetSoftenedFloat(N->getOperand(2)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode 
*N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128, RTLIB::NEARBYINT_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { // When LegalInHWReg, FNEG can be implemented as native bitwise operations. if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); // Expand Y = FNEG(X) -> Y = SUB -0.0, X SDValue Ops[2] = { DAG.getConstantFP(-0.0, dl, N->getValueType(0)), GetSoftenedFloat(N->getOperand(0)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), NVT, Ops, false, dl).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = N->getOperand(0); // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's // entirely possible for both f16 and f32 to be legal, so use the fully // hard-float FP_EXTEND rather than FP16_TO_FP. if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) AddToWorklist(Op.getNode()); } if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { Op = GetPromotedFloat(Op); // If the promotion did the FP_EXTEND to the destination type for us, // there's nothing left to do here. if (Op.getValueType() == N->getValueType(0)) { return BitConvertToInteger(Op); } } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special // nodes? SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32); SDValue Op = N->getOperand(0); SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op, false, SDLoc(N)).first; if (N->getValueType(0) == MVT::f32) return Res32; EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); return TLI.makeLibCall(DAG, LC, NVT, Res32, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = N->getOperand(0); if (N->getValueType(0) == MVT::f16) { // Semi-soften first, to FP_TO_FP16, so that targets which support f16 as a // storage-only type get a chance to select things. 
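SoftenFloatRes_FNEG above expands Y = FNEG(X) into a subtraction from -0.0 rather than from +0.0: under the default rounding mode, 0.0 - 0.0 is +0.0 (wrong sign for negation), while -0.0 - 0.0 is -0.0. A small standalone check of that sign behaviour, in plain C++.

#include <cassert>
#include <cmath>

int main() {
  float X = 0.0f;                 // the awkward case: positive zero

  float Wrong = 0.0f - X;         // +0.0, sign not flipped
  float Right = -0.0f - X;        // -0.0, the sign negation actually wants

  assert(!std::signbit(Wrong));
  assert(std::signbit(Right));

  // For ordinary values both forms agree; the zero case is what forces the
  // -0.0 constant used in the SUB libcall expansion above.
  float Y = 1.25f;
  assert((-0.0f - Y) == -Y && (0.0f - Y) == -Y);
  return 0;
}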
return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, Op); } RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { assert(N->getOperand(1).getValueType() == MVT::i32 && "Unsupported power type!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::REM_F32, RTLIB::REM_F64, RTLIB::REM_F80, RTLIB::REM_F128, RTLIB::REM_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::ROUND_F32, RTLIB::ROUND_F64, RTLIB::ROUND_F80, RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { EVT NVT = 
TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); if (N->getValueType(0) == MVT::f16) return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128), NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo)); LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); auto MMOFlags = L->getMemOperand()->getFlags() & ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); SDValue NewL; if (L->getExtensionType() == ISD::NON_EXTLOAD) { NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), NVT, L->getAlignment(), MMOFlags, L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. if (N != NewL.getValue(1).getNode()) ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); return NewL; } // Do a non-extending load followed by FP_EXTEND. NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, L->getMemoryVT(), dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), L->getMemoryVT(), L->getAlignment(), MMOFlags, L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL); if (LegalInHWReg) return ExtendNode; return BitConvertToInteger(ExtendNode); } SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) { if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(1)); SDValue RHS = GetSoftenedFloat(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) { if (isLegalInHWReg(N->getValueType(ResNo))) return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), N->getOperand(0), N->getOperand(1), LHS, RHS, N->getOperand(4)); } SDValue DAGTypeLegalizer::SoftenFloatRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); } SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { SDValue Chain = N->getOperand(0); // Get the chain. SDValue Ptr = N->getOperand(1); // Get the pointer. EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); SDValue NewVAARG; NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), N->getConstantOperandVal(3)); // Legalized the chain result - switch anything that used the old chain to // use the new one. 
if (N != NewVAARG.getValue(1).getNode()) ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); return NewVAARG; } SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { bool Signed = N->getOpcode() == ISD::SINT_TO_FP; EVT SVT = N->getOperand(0).getValueType(); EVT RVT = N->getValueType(0); EVT NVT = EVT(); SDLoc dl(N); // If the input is not legal, eg: i1 -> fp, then it needs to be promoted to // a larger type, eg: i8 -> fp. Even if it is legal, no libcall may exactly // match. Look for an appropriate libcall. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE; t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++t) { NVT = (MVT::SimpleValueType)t; // The source needs to big enough to hold the operand. if (NVT.bitsGE(SVT)) LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT):RTLIB::getUINTTOFP (NVT, RVT); } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); // Sign/zero extend the argument if the libcall takes a larger type. SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0)); return TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), Op, Signed, dl).first; } //===----------------------------------------------------------------------===// // Convert Float Operand to Integer for Non-HW-supported Operations. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); switch (N->getOpcode()) { default: if (CanSkipSoftenFloatOperand(N, OpNo)) return false; #ifndef NDEBUG dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to soften this operator's operand!"); case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; case ISD::CopyToReg: Res = SoftenFloatOp_COPY_TO_REG(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; case ISD::FABS: Res = SoftenFloatOp_FABS(N); break; case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break; case ISD::FNEG: Res = SoftenFloatOp_FNEG(N); break; case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break; case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break; case ISD::SELECT: Res = SoftenFloatOp_SELECT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); // Do not try to analyze or soften this node again if the value is // or can be held in a register. In that case, Res.getNode() should // be equal to N. if (Res.getNode() == N && isLegalInHWReg(N->getOperand(OpNo).getValueType())) return false; // Otherwise, we need to reanalyze and lower the new Res nodes. break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this to re-analyze. 
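SoftenFloatRes_XINT_TO_FP above does not demand an exact-width conversion routine: it scans the integer types in increasing size, takes the first one that is at least as wide as the source and has a libcall, and sign- or zero-extends the operand to that width. A standalone sketch of that selection pattern, in plain C++; the table of supported widths is invented for illustration, where real targets consult RTLIB::getSINTTOFP / getUINTTOFP.

#include <cassert>
#include <vector>

// Widths (in bits) for which a hypothetical runtime provides int->fp helpers.
static const std::vector<unsigned> SupportedWidths = {32, 64, 128};

// Return the first supported width that can hold a SrcBits-wide operand,
// or 0 if none exists (the legalizer asserts in that case).
static unsigned pickConversionWidth(unsigned SrcBits) {
  for (unsigned W : SupportedWidths)
    if (W >= SrcBits)
      return W;
  return 0;
}

int main() {
  assert(pickConversionWidth(1) == 32);    // i1 -> promote to i32, then convert
  assert(pickConversionWidth(8) == 32);    // i8 -> i32
  assert(pickConversionWidth(48) == 64);   // odd sizes round up to the next entry
  assert(pickConversionWidth(128) == 128);
  assert(pickConversionWidth(256) == 0);   // nothing wide enough
  return 0;
}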
if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) return false; // When the operand type can be kept in registers there is nothing to do for // the following opcodes. switch (N->getOperand(OpNo).getOpcode()) { case ISD::BITCAST: case ISD::ConstantFP: case ISD::CopyFromReg: case ISD::CopyToReg: case ISD::FABS: case ISD::FCOPYSIGN: case ISD::FNEG: case ISD::Register: case ISD::SELECT: case ISD::SELECT_CC: return true; } switch (N->getOpcode()) { case ISD::ConstantFP: // Leaf node. case ISD::CopyFromReg: // Operand is a register that we know to be left // unchanged by SoftenFloatResult(). case ISD::Register: // Leaf node. return true; } return false; } SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), GetSoftenedFloat(N->getOperand(0))); } SDValue DAGTypeLegalizer::SoftenFloatOp_COPY_TO_REG(SDNode *N) { SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); SDValue Op2 = GetSoftenedFloat(N->getOperand(2)); if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2)) return SDValue(); if (N->getNumOperands() == 3) return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), 0); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2, N->getOperand(3)), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) { // If we get here, the result must be legal but the source illegal. EVT SVT = N->getOperand(0).getValueType(); EVT RVT = N->getValueType(0); SDValue Op = GetSoftenedFloat(N->getOperand(0)); if (SVT == MVT::f16) return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), RVT, Op); RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall"); return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { // We actually deal with the partially-softened FP_TO_FP16 node too, which // returns an i16 so doesn't meet the constraints necessary for FP_ROUND. assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16); EVT SVT = N->getOperand(0).getValueType(); EVT RVT = N->getValueType(0); EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3); ISD::CondCode CCCode = cast(N->getOperand(1))->get(); EVT VT = NewLHS.getValueType(); NewLHS = GetSoftenedFloat(NewLHS); NewRHS = GetSoftenedFloat(NewRHS); TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N)); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. 
return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FABS(SDNode *N) { SDValue Op = GetSoftenedFloat(N->getOperand(0)); if (Op == N->getOperand(0)) return SDValue(); return SDValue(DAG.UpdateNodeOperands(N, Op), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) { SDValue Op0 = GetSoftenedFloat(N->getOperand(0)); SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); if (Op0 == N->getOperand(0) && Op1 == N->getOperand(1)) return SDValue(); return SDValue(DAG.UpdateNodeOperands(N, Op0, Op1), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FNEG(SDNode *N) { SDValue Op = GetSoftenedFloat(N->getOperand(0)); if (Op == N->getOperand(0)) return SDValue(); return SDValue(DAG.UpdateNodeOperands(N, Op), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { bool Signed = N->getOpcode() == ISD::FP_TO_SINT; EVT SVT = N->getOperand(0).getValueType(); EVT RVT = N->getValueType(0); EVT NVT = EVT(); SDLoc dl(N); // If the result is not legal, eg: fp -> i1, then it needs to be promoted to // a larger type, eg: fp -> i32. Even if it is legal, no libcall may exactly // match, eg. we don't have fp -> i8 conversions. // Look for an appropriate libcall. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE; IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++IntVT) { NVT = (MVT::SimpleValueType)IntVT; // The type needs to big enough to hold the result. if (NVT.bitsGE(RVT)) LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT):RTLIB::getFPTOUINT(SVT, NVT); } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, false, dl).first; // Truncate the result if the libcall returns a larger type. return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res); } SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT(SDNode *N) { SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); SDValue Op2 = GetSoftenedFloat(N->getOperand(2)); if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2)) return SDValue(); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(4))->get(); EVT VT = NewLHS.getValueType(); NewLHS = GetSoftenedFloat(NewLHS); NewRHS = GetSoftenedFloat(NewRHS); TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N)); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(2))->get(); EVT VT = NewLHS.getValueType(); NewLHS = GetSoftenedFloat(NewLHS); NewRHS = GetSoftenedFloat(NewRHS); TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N)); // If softenSetCCOperands returned a scalar, use it. 
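The BR_CC, SELECT_CC and SETCC handlers above share one pattern: softenSetCCOperands may replace the floating-point comparison with a single integer value (typically the result of a comparison libcall), and the node is then rewritten to test that value against zero with SETNE. A standalone sketch of the shape of that lowering, in plain C++; cmpLessF32 is an invented stand-in, not an LLVM API, loosely modelled on libgcc routines such as __ltsf2.

#include <cassert>
#include <cstdint>

// Invented soft-float "less than" helper: returns a value that is nonzero
// exactly when A < B.  (The only property the legalizer relies on is an
// integer result it can compare against zero.)
static int32_t cmpLessF32(float A, float B) { return A < B ? 1 : 0; }

// What the softened setcc/br_cc boils down to: call the helper, then do an
// integer SETNE against zero.
static bool softenedLess(float A, float B) {
  int32_t R = cmpLessF32(A, B);   // NewLHS after softenSetCCOperands
  return R != 0;                  // NewRHS = 0, CCCode = SETNE
}

int main() {
  assert(softenedLess(1.0f, 2.0f));
  assert(!softenedLess(2.0f, 1.0f));
  assert(!softenedLess(1.0f, 1.0f));
  return 0;
}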
if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; } // Otherwise, update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) { assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); assert(OpNo == 1 && "Can only soften the stored value!"); StoreSDNode *ST = cast(N); SDValue Val = ST->getValue(); SDLoc dl(N); if (ST->isTruncatingStore()) // Do an FP_ROUND followed by a non-truncating store. Val = BitConvertToInteger(DAG.getNode(ISD::FP_ROUND, dl, ST->getMemoryVT(), Val, DAG.getIntPtrConstant(0, dl))); else Val = GetSoftenedFloat(Val); return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(), ST->getMemOperand()); } //===----------------------------------------------------------------------===// // Float Result Expansion //===----------------------------------------------------------------------===// /// ExpandFloatResult - This method is called when the specified result of the /// specified node is found to need expansion. At this point, the node may also /// have invalid operands or may have other results that need promotion, we just /// know that (at least) one result needs expansion. void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; Lo = Hi = SDValue(); // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandFloatResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand the result of this operator!"); case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break; case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break; case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break; case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break; case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break; case ISD::FMAXNUM: ExpandFloatRes_FMAXNUM(N, Lo, Hi); break; case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break; case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break; case ISD::FCOPYSIGN: ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break; case ISD::FCOS: ExpandFloatRes_FCOS(N, Lo, Hi); break; case ISD::FDIV: ExpandFloatRes_FDIV(N, Lo, Hi); break; case ISD::FEXP: ExpandFloatRes_FEXP(N, Lo, Hi); break; case ISD::FEXP2: ExpandFloatRes_FEXP2(N, Lo, Hi); break; case ISD::FFLOOR: ExpandFloatRes_FFLOOR(N, Lo, Hi); break; case ISD::FLOG: ExpandFloatRes_FLOG(N, Lo, Hi); break; case ISD::FLOG2: ExpandFloatRes_FLOG2(N, Lo, Hi); break; case ISD::FLOG10: ExpandFloatRes_FLOG10(N, Lo, Hi); break; case ISD::FMA: ExpandFloatRes_FMA(N, Lo, Hi); break; case ISD::FMUL: ExpandFloatRes_FMUL(N, Lo, Hi); break; case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break; case ISD::FNEG: ExpandFloatRes_FNEG(N, Lo, Hi); break; case ISD::FP_EXTEND: 
ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break; case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break; case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break; case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break; case ISD::FROUND: ExpandFloatRes_FROUND(N, Lo, Hi); break; case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break; case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break; case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break; case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break; case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break; case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. if (Lo.getNode()) SetExpandedFloat(SDValue(N, ResNo), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); assert(NVT.getSizeInBits() == 64 && "Do not know how to expand this float constant!"); APInt C = cast(N)->getValueAPF().bitcastToAPInt(); SDLoc dl(N); Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), APInt(64, C.getRawData()[1])), dl, NVT); Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), APInt(64, C.getRawData()[0])), dl, NVT); } void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0) == MVT::ppcf128 && "Logic only correct for ppcf128!"); SDLoc dl(N); SDValue Tmp; GetExpandedFloat(N->getOperand(0), Lo, Tmp); Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp); // Lo = Hi==fabs(Hi) ? Lo : -Lo; Lo = DAG.getSelectCC(dl, Tmp, Hi, Lo, DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo), ISD::SETEQ); } void DAGTypeLegalizer::ExpandFloatRes_FMINNUM(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::FMIN_F32, RTLIB::FMIN_F64, RTLIB::FMIN_F80, RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::FMAX_F32, RTLIB::FMAX_F64, RTLIB::FMAX_F80, RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80, RTLIB::ADD_F128, RTLIB::ADD_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::COPYSIGN_F32, RTLIB::COPYSIGN_F64, RTLIB::COPYSIGN_F80, RTLIB::COPYSIGN_F128, RTLIB::COPYSIGN_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { 
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128, RTLIB::NEARBYINT_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedFloat(N->getOperand(0), Lo, Hi); Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo); Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi); } void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); Hi = DAG.getNode(ISD::FP_EXTEND, dl, NVT, N->getOperand(0)); Lo = 
DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), APInt(NVT.getSizeInBits(), 0)), dl, NVT); } void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FREM(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::REM_F32, RTLIB::REM_F64, RTLIB::REM_F80, RTLIB::REM_F128, RTLIB::REM_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::ROUND_F32, RTLIB::ROUND_F64, RTLIB::ROUND_F80, RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128), N, false); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo, SDValue &Hi) { if (ISD::isNormalLoad(N)) { ExpandRes_NormalLoad(N, Lo, Hi); return; } assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0)); assert(NVT.isByteSized() && "Expanded type not byte sized!"); assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?"); Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr, LD->getMemoryVT(), LD->getMemOperand()); // Remember the chain. Chain = Hi.getValue(1); // The low part is zero. 
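ExpandFloatRes_FP_EXTEND above (and the extending-load case that continues just below, whose low part is likewise zero) produce the pair (Hi, Lo) = (value, +0.0): a ppcf128 value is expanded as two f64s whose sum is the number, with Hi the nearest double and Lo the residual. A standalone sketch of that double-double convention, in plain C++; DoubleDouble is an illustrative struct, not an LLVM type.

#include <cassert>

// Double-double convention used by the ppcf128 expansion: the logical value
// is Hi + Lo, where Hi carries the closest double and Lo what Hi could not.
struct DoubleDouble {
  double Hi;
  double Lo;
  double value() const { return Hi + Lo; }  // lossy here; exact in ppcf128 math
};

// Extending f64 -> ppcf128: the double already holds the whole value, so the
// low part is simply +0.0, which is what the getConstantFP(...) above builds.
static DoubleDouble extendFromDouble(double X) { return {X, 0.0}; }

int main() {
  DoubleDouble V = extendFromDouble(3.25);
  assert(V.Hi == 3.25 && V.Lo == 0.0);
  assert(V.value() == 3.25);
  return 0;
}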
Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), APInt(NVT.getSizeInBits(), 0)), dl, NVT); // Modified the chain - switch anything that used the old chain to use the // new one. ReplaceValueWith(SDValue(LD, 1), Chain); } void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0) == MVT::ppcf128 && "Unsupported XINT_TO_FP!"); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); bool isSigned = N->getOpcode() == ISD::SINT_TO_FP; SDLoc dl(N); // First do an SINT_TO_FP, whether the original was signed or unsigned. // When promoting partial word types to i32 we must honor the signedness, // though. if (SrcVT.bitsLE(MVT::i32)) { // The integer can be represented exactly in an f64. Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, MVT::i32, Src); Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), APInt(NVT.getSizeInBits(), 0)), dl, NVT); Hi = DAG.getNode(ISD::SINT_TO_FP, dl, NVT, Src); } else { RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (SrcVT.bitsLE(MVT::i64)) { Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, MVT::i64, Src); LC = RTLIB::SINTTOFP_I64_PPCF128; } else if (SrcVT.bitsLE(MVT::i128)) { Src = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i128, Src); LC = RTLIB::SINTTOFP_I128_PPCF128; } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); Hi = TLI.makeLibCall(DAG, LC, VT, Src, true, dl).first; GetPairElements(Hi, Lo, Hi); } if (isSigned) return; // Unsigned - fix up the SINT_TO_FP value just calculated. Hi = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi); SrcVT = Src.getValueType(); // x>=0 ? (ppcf128)(iN)x : (ppcf128)(iN)x + 2^N; N=32,64,128. static const uint64_t TwoE32[] = { 0x41f0000000000000LL, 0 }; static const uint64_t TwoE64[] = { 0x43f0000000000000LL, 0 }; static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 }; ArrayRef Parts; switch (SrcVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unsupported UINT_TO_FP!"); case MVT::i32: Parts = TwoE32; break; case MVT::i64: Parts = TwoE64; break; case MVT::i128: Parts = TwoE128; break; } // TODO: Are there fast-math-flags to propagate to this FADD? Lo = DAG.getNode(ISD::FADD, dl, VT, Hi, DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble(), APInt(128, Parts)), dl, MVT::ppcf128)); Lo = DAG.getSelectCC(dl, Src, DAG.getConstant(0, dl, SrcVT), Lo, Hi, ISD::SETLT); GetPairElements(Lo, Lo, Hi); } //===----------------------------------------------------------------------===// // Float Operand Expansion //===----------------------------------------------------------------------===// /// ExpandFloatOperand - This method is called when the specified operand of the /// specified node is found to need expansion. At this point, all of the result /// types of the node are known to be legal, but other operands of the node may /// need promotion or expansion as well as the specified one. bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); // See if the target wants to custom expand this node. 
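The unsigned path of ExpandFloatRes_XINT_TO_FP above reuses the signed conversion and then repairs it: if the source, read as a signed iN, was negative, the signed result is exactly 2^N too small, so the code conditionally adds 2^N back (the TwoE32/TwoE64/TwoE128 arrays are just those powers of two as double bit patterns; 0x41f0000000000000 is 2^32). A standalone check of the same trick for a 32-bit source, in plain C++, using double as the destination type for simplicity.

#include <cassert>
#include <cstdint>
#include <cstring>

// Unsigned -> FP via the signed converter: if the value looked negative as a
// signed i32 (two's complement assumed, as on all LLVM targets), the signed
// result is off by exactly 2^32, so add it back.
static double uintToFPViaSigned(uint32_t X) {
  double Signed = static_cast<double>(static_cast<int32_t>(X)); // SINT_TO_FP
  uint64_t Bits = 0x41f0000000000000ULL;                        // 2^32 as a double
  double TwoE32;
  std::memcpy(&TwoE32, &Bits, sizeof(TwoE32));
  return static_cast<int32_t>(X) >= 0 ? Signed : Signed + TwoE32;
}

int main() {
  assert(uintToFPViaSigned(0u) == 0.0);
  assert(uintToFPViaSigned(7u) == 7.0);
  // 0xFFFFFFFF reads as -1 when signed; -1 + 2^32 == 4294967295.
  assert(uintToFPViaSigned(0xFFFFFFFFu) == 4294967295.0);
  return 0;
}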
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand this operator's operand!"); case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break; case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break; case ISD::FCOPYSIGN: Res = ExpandFloatOp_FCOPYSIGN(N); break; case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break; case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break; case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break; case ISD::SELECT_CC: Res = ExpandFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandFloatOp_SETCC(N); break; case ISD::STORE: Res = ExpandFloatOp_STORE(cast(N), OpNo); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// FloatExpandSetCCOperands - Expand the operands of a comparison. This code /// is shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &dl) { SDValue LHSLo, LHSHi, RHSLo, RHSHi; GetExpandedFloat(NewLHS, LHSLo, LHSHi); GetExpandedFloat(NewRHS, RHSLo, RHSHi); assert(NewLHS.getValueType() == MVT::ppcf128 && "Unsupported setcc type!"); // FIXME: This generated code sucks. We want to generate // FCMPU crN, hi1, hi2 // BNE crN, L: // FCMPU crN, lo1, lo2 // The following can be improved, but not that much. SDValue Tmp1, Tmp2, Tmp3; Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETOEQ); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo, RHSLo, CCCode); Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2); Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETUNE); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, CCCode); Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2); NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3); NewRHS = SDValue(); // LHS is the result, not a compare. } SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) { SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3); ISD::CondCode CCCode = cast(N->getOperand(1))->get(); FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. 
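The Tmp1/Tmp2/Tmp3 sequence in FloatExpandSetCCOperands evaluates (Hi1 == Hi2 && Lo1 CC Lo2) || (Hi1 != Hi2 && Hi1 CC Hi2). A minimal standalone sketch of that predicate for CC = "<", using plain doubles as stand-ins for the expanded ppcf128 halves (illustrative only):

#include <cassert>

struct DD { double Hi, Lo; };                    // value == Hi + Lo

static bool ddLess(DD A, DD B) {                 // CC = SETLT on the pair
  return (A.Hi == B.Hi && A.Lo < B.Lo) ||        // high parts equal: Lo decides
         (A.Hi != B.Hi && A.Hi < B.Hi);          // otherwise Hi decides
}

int main() {
  assert( ddLess(DD{1.0, -1e-20}, DD{1.0, 0.0}));   // same Hi, smaller Lo
  assert(!ddLess(DD{2.0,  0.0},   DD{1.0, 1e-20})); // Hi already decides
  return 0;
}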
return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) { assert(N->getOperand(1).getValueType() == MVT::ppcf128 && "Logic only correct for ppcf128!"); SDValue Lo, Hi; GetExpandedFloat(N->getOperand(1), Lo, Hi); // The ppcf128 value is providing only the sign; take it from the // higher-order double (which must have the larger magnitude). return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), N->getValueType(0), N->getOperand(0), Hi); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) { assert(N->getOperand(0).getValueType() == MVT::ppcf128 && "Logic only correct for ppcf128!"); SDValue Lo, Hi; GetExpandedFloat(N->getOperand(0), Lo, Hi); // Round it the rest of the way (e.g. to f32) if needed. return DAG.getNode(ISD::FP_ROUND, SDLoc(N), N->getValueType(0), Hi, N->getOperand(1)); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) { EVT RVT = N->getValueType(0); SDLoc dl(N); RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { EVT RVT = N->getValueType(0); SDLoc dl(N); RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0), false, dl).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(4))->get(); FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(2))->get(); FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, use it. if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; } // Otherwise, update N to have the operands specified. 
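As the comments in ExpandFloatOp_FCOPYSIGN and ExpandFloatOp_FP_ROUND above note, the high double of a ppcf128 pair carries the sign and dominates the magnitude, so both operations can be computed from Hi alone. A minimal standalone sketch (plain doubles, illustrative only):

#include <cassert>
#include <cmath>

struct DD { double Hi, Lo; };                    // value == Hi + Lo

int main() {
  DD X = {-3.0, 1e-20};                          // a negative ppcf128 stand-in
  double Sign   = std::copysign(1.0, X.Hi);      // FCOPYSIGN reads Hi only
  float  Narrow = (float)X.Hi;                   // FP_ROUND rounds Hi only
  assert(Sign == -1.0 && Narrow == -3.0f);
  return 0;
}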
return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) { if (ISD::isNormalStore(N)) return ExpandOp_NormalStore(N, OpNo); assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); assert(OpNo == 1 && "Can only expand the stored value so far"); StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ST->getValue().getValueType()); assert(NVT.isByteSized() && "Expanded type not byte sized!"); assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?"); (void)NVT; SDValue Lo, Hi; GetExpandedOp(ST->getValue(), Lo, Hi); return DAG.getTruncStore(Chain, SDLoc(N), Hi, Ptr, ST->getMemoryVT(), ST->getMemOperand()); } //===----------------------------------------------------------------------===// // Float Operand Promotion //===----------------------------------------------------------------------===// // static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) { if (OpVT == MVT::f16) { return ISD::FP16_TO_FP; } else if (RetVT == MVT::f16) { return ISD::FP_TO_FP16; } report_fatal_error("Attempt at an invalid promotion-related conversion"); } bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { SDValue R = SDValue(); // Nodes that use a promotion-requiring floating point operand, but doesn't // produce a promotion-requiring floating point result, need to be legalized // to use the promoted float operand. Nodes that produce at least one // promotion-requiring floating point result have their operands legalized as // a part of PromoteFloatResult. switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to promote this operator's operand!"); case ISD::BITCAST: R = PromoteFloatOp_BITCAST(N, OpNo); break; case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: R = PromoteFloatOp_FP_TO_XINT(N, OpNo); break; case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break; case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break; case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break; } if (R.getNode()) ReplaceValueWith(SDValue(N, 0), R); return false; } SDValue DAGTypeLegalizer::PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo) { SDValue Op = N->getOperand(0); EVT OpVT = Op->getValueType(0); - EVT IVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); - assert (IVT == N->getValueType(0) && "Bitcast to type of different size"); - SDValue Promoted = GetPromotedFloat(N->getOperand(0)); EVT PromotedVT = Promoted->getValueType(0); // Convert the promoted float value to the desired IVT. - return DAG.getNode(GetPromotionOpcode(PromotedVT, OpVT), SDLoc(N), IVT, - Promoted); + EVT IVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); + SDValue Convert = DAG.getNode(GetPromotionOpcode(PromotedVT, OpVT), SDLoc(N), + IVT, Promoted); + // The final result type might not be an scalar so we need a bitcast. The + // bitcast will be further legalized if needed. + return DAG.getBitcast(N->getValueType(0), Convert); } // Promote Operand 1 of FCOPYSIGN. Operand 0 ought to be handled by // PromoteFloatRes_FCOPYSIGN. 
SDValue DAGTypeLegalizer::PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo) { assert (OpNo == 1 && "Only Operand 1 must need promotion here"); SDValue Op1 = GetPromotedFloat(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), N->getOperand(0), Op1); } // Convert the promoted float value to the desired integer type SDValue DAGTypeLegalizer::PromoteFloatOp_FP_TO_XINT(SDNode *N, unsigned OpNo) { SDValue Op = GetPromotedFloat(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) { SDValue Op = GetPromotedFloat(N->getOperand(0)); EVT VT = N->getValueType(0); // Desired VT is same as promoted type. Use promoted float directly. if (VT == Op->getValueType(0)) return Op; // Else, extend the promoted float value to the desired VT. return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op); } // Promote the float operands used for comparison. The true- and false- // operands have the same type as the result and are promoted, if needed, by // PromoteFloatRes_SELECT_CC SDValue DAGTypeLegalizer::PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo) { SDValue LHS = GetPromotedFloat(N->getOperand(0)); SDValue RHS = GetPromotedFloat(N->getOperand(1)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), LHS, RHS, N->getOperand(2), N->getOperand(3), N->getOperand(4)); } // Construct a SETCC that compares the promoted values and sets the conditional // code. SDValue DAGTypeLegalizer::PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = GetPromotedFloat(N->getOperand(1)); ISD::CondCode CCCode = cast(N->getOperand(2))->get(); return DAG.getSetCC(SDLoc(N), NVT, Op0, Op1, CCCode); } // Lower the promoted Float down to the integer value of same size and construct // a STORE of the integer value. 
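The store handler described in the comment above writes the value's bit pattern as a same-sized integer rather than a floating-point value: the promoted f32 is converted back to f16 bits (FP_TO_FP16) and an i16 is stored. A minimal standalone sketch of the bits-not-value idea, shown at f32/i32 width to keep it short (helper name is made up):

#include <cassert>
#include <cstdint>
#include <cstring>

static void storeAsInt(uint32_t *Slot, float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));          // reinterpret, don't convert
  *Slot = Bits;                                  // store the raw bit pattern
}

int main() {
  uint32_t Slot = 0;
  storeAsInt(&Slot, 1.0f);
  assert(Slot == 0x3f800000u);                   // IEEE-754 bits of 1.0f
  return 0;
}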
SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) { StoreSDNode *ST = cast(N); SDValue Val = ST->getValue(); SDLoc DL(N); SDValue Promoted = GetPromotedFloat(Val); EVT VT = ST->getOperand(1).getValueType(); EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); SDValue NewVal; NewVal = DAG.getNode(GetPromotionOpcode(Promoted.getValueType(), VT), DL, IVT, Promoted); return DAG.getStore(ST->getChain(), DL, NewVal, ST->getBasePtr(), ST->getMemOperand()); } //===----------------------------------------------------------------------===// // Float Result Promotion //===----------------------------------------------------------------------===// void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { SDValue R = SDValue(); switch (N->getOpcode()) { // These opcodes cannot appear if promotion of FP16 is done in the backend // instead of Clang case ISD::FP16_TO_FP: case ISD::FP_TO_FP16: default: llvm_unreachable("Do not know how to promote this operator's result!"); case ISD::BITCAST: R = PromoteFloatRes_BITCAST(N); break; case ISD::ConstantFP: R = PromoteFloatRes_ConstantFP(N); break; case ISD::EXTRACT_VECTOR_ELT: R = PromoteFloatRes_EXTRACT_VECTOR_ELT(N); break; case ISD::FCOPYSIGN: R = PromoteFloatRes_FCOPYSIGN(N); break; // Unary FP Operations case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: case ISD::FEXP: case ISD::FEXP2: case ISD::FFLOOR: case ISD::FLOG: case ISD::FLOG2: case ISD::FLOG10: case ISD::FNEARBYINT: case ISD::FNEG: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break; // Binary FP Operations case ISD::FADD: case ISD::FDIV: case ISD::FMAXNAN: case ISD::FMINNAN: case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMUL: case ISD::FPOW: case ISD::FREM: case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break; case ISD::FMA: // FMA is same as FMAD case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break; case ISD::FPOWI: R = PromoteFloatRes_FPOWI(N); break; case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break; case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break; } if (R.getNode()) SetPromotedFloat(SDValue(N, ResNo), R); } // Bitcast from i16 to f16: convert the i16 to a f32 value instead. // At this point, it is not possible to determine if the bitcast value is // eventually stored to memory or promoted to f32 or promoted to a floating // point at a higher precision. Some of these cases are handled by FP_EXTEND, // STORE promotion handlers. SDValue DAGTypeLegalizer::PromoteFloatRes_BITCAST(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, - N->getOperand(0)); + // Input type isn't guaranteed to be a scalar int so bitcast if not. The + // bitcast will be legalized further if necessary. 
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), + N->getOperand(0).getValueType().getSizeInBits()); + SDValue Cast = DAG.getBitcast(IVT, N->getOperand(0)); + return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, Cast); } SDValue DAGTypeLegalizer::PromoteFloatRes_ConstantFP(SDNode *N) { ConstantFPSDNode *CFPNode = cast(N); EVT VT = N->getValueType(0); SDLoc DL(N); // Get the (bit-cast) APInt of the APFloat and build an integer constant EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); SDValue C = DAG.getConstant(CFPNode->getValueAPF().bitcastToAPInt(), DL, IVT); // Convert the Constant to the desired FP type // FIXME We might be able to do the conversion during compilation and get rid // of it from the object code EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, C); } // If the Index operand is a constant, try to redirect the extract operation to // the correct legalized vector. If not, bit-convert the input vector to // equivalent integer vector. Extract the element as an (bit-cast) integer // value and convert it to the promoted type. SDValue DAGTypeLegalizer::PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc DL(N); // If the index is constant, try to extract the value from the legalized // vector type. if (isa(N->getOperand(1))) { SDValue Vec = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VecVT = Vec->getValueType(0); EVT EltVT = VecVT.getVectorElementType(); uint64_t IdxVal = cast(Idx)->getZExtValue(); switch (getTypeAction(VecVT)) { default: break; case TargetLowering::TypeScalarizeVector: { SDValue Res = GetScalarizedVector(N->getOperand(0)); ReplaceValueWith(SDValue(N, 0), Res); return SDValue(); } case TargetLowering::TypeWidenVector: { Vec = GetWidenedVector(Vec); SDValue Res = DAG.getNode(N->getOpcode(), DL, EltVT, Vec, Idx); ReplaceValueWith(SDValue(N, 0), Res); return SDValue(); } case TargetLowering::TypeSplitVector: { SDValue Lo, Hi; GetSplitVector(Vec, Lo, Hi); uint64_t LoElts = Lo.getValueType().getVectorNumElements(); SDValue Res; if (IdxVal < LoElts) Res = DAG.getNode(N->getOpcode(), DL, EltVT, Lo, Idx); else Res = DAG.getNode(N->getOpcode(), DL, EltVT, Hi, DAG.getConstant(IdxVal - LoElts, DL, Idx.getValueType())); ReplaceValueWith(SDValue(N, 0), Res); return SDValue(); } } } // Bit-convert the input vector to the equivalent integer vector SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0)); EVT IVT = NewOp.getValueType().getVectorElementType(); // Extract the element as an (bit-cast) integer value SDValue NewVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IVT, NewOp, N->getOperand(1)); // Convert the element to the desired FP type EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, NewVal); } // FCOPYSIGN(X, Y) returns the value of X with the sign of Y. If the result // needs promotion, so does the argument X. Note that Y, if needed, will be // handled during operand promotion. SDValue DAGTypeLegalizer::PromoteFloatRes_FCOPYSIGN(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = N->getOperand(1); return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1); } // Unary operation where the result and the operand have PromoteFloat type // action. Construct a new SDNode with the promoted float value of the old // operand. 
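The promotion handlers in this section repeatedly rebuild an f16 from its i16 bit pattern and widen it to f32 via FP16_TO_FP; that node is normally lowered to a native instruction or a runtime helper. The following standalone sketch only illustrates the conversion it denotes (portable, handles normals, subnormals, zero, Inf and NaN; not LLVM's lowering):

#include <cassert>
#include <cstdint>
#include <cstring>

static float halfBitsToFloat(uint16_t H) {
  uint32_t Sign = (uint32_t)(H & 0x8000u) << 16;
  uint32_t Exp  = (H >> 10) & 0x1f;
  uint32_t Mant = H & 0x3ff;
  uint32_t Bits;
  if (Exp == 0x1f) {                             // Inf / NaN
    Bits = Sign | 0x7f800000u | (Mant << 13);
  } else if (Exp != 0) {                         // normal: rebias the exponent
    Bits = Sign | ((Exp + 112) << 23) | (Mant << 13);
  } else if (Mant == 0) {                        // +/- zero
    Bits = Sign;
  } else {                                       // subnormal: renormalize
    int Shift = 0;
    while (!(Mant & 0x400)) { Mant <<= 1; ++Shift; }
    Mant &= 0x3ff;                               // drop the implicit leading 1
    Bits = Sign | ((uint32_t)(113 - Shift) << 23) | (Mant << 13);
  }
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  assert(halfBitsToFloat(0x3c00) == 1.0f);       // 1.0
  assert(halfBitsToFloat(0xc000) == -2.0f);      // -2.0
  assert(halfBitsToFloat(0x0001) == 0x1p-24f);   // smallest f16 subnormal
  return 0;
}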
SDValue DAGTypeLegalizer::PromoteFloatRes_UnaryOp(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op = GetPromotedFloat(N->getOperand(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op); } // Binary operations where the result and both operands have PromoteFloat type // action. Construct a new SDNode with the promoted float values of the old // operands. SDValue DAGTypeLegalizer::PromoteFloatRes_BinOp(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = GetPromotedFloat(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, N->getFlags()); } SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = GetPromotedFloat(N->getOperand(1)); SDValue Op2 = GetPromotedFloat(N->getOperand(2)); return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, Op2); } // Promote the Float (first) operand and retain the Integer (second) operand SDValue DAGTypeLegalizer::PromoteFloatRes_FPOWI(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = N->getOperand(1); return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1); } // Explicit operation to reduce precision. Reduce the value to half precision // and promote it back to the legal type. SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) { SDLoc DL(N); SDValue Op = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = Op->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); // Round promoted float to desired precision SDValue Round = DAG.getNode(GetPromotionOpcode(OpVT, VT), DL, IVT, Op); // Promote it back to the legal output type return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round); } SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) { LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); // Load the value as an integer value with the same number of bits. EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); SDValue newL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), IVT, SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), IVT, L->getAlignment(), L->getMemOperand()->getFlags(), L->getAAInfo()); // Legalize the chain result by replacing uses of the old value chain with the // new one ReplaceValueWith(SDValue(N, 1), newL.getValue(1)); // Convert the integer value to the desired FP type EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, newL); } // Construct a new SELECT node with the promoted true- and false- values. SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) { SDValue TrueVal = GetPromotedFloat(N->getOperand(1)); SDValue FalseVal = GetPromotedFloat(N->getOperand(2)); return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0), N->getOperand(0), TrueVal, FalseVal); } // Construct a new SELECT_CC node with the promoted true- and false- values. // The operands used for comparison are promoted by PromoteFloatOp_SELECT_CC. 
SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) { SDValue TrueVal = GetPromotedFloat(N->getOperand(2)); SDValue FalseVal = GetPromotedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1), TrueVal, FalseVal, N->getOperand(4)); } // Construct a SDNode that transforms the SINT or UINT operand to the promoted // float type. SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue NV = DAG.getNode(N->getOpcode(), DL, NVT, N->getOperand(0)); // Round the value to the desired precision (that of the source type). return DAG.getNode( ISD::FP_EXTEND, DL, NVT, DAG.getNode(ISD::FP_ROUND, DL, VT, NV, DAG.getIntPtrConstant(0, DL))); } SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); } Index: vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (revision 338000) @@ -1,3598 +1,3598 @@ //===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements integer type expansion and promotion for LegalizeTypes. // Promotion is the act of changing a computation in an illegal type into a // computation in a larger type. For example, implementing i8 arithmetic in an // i32 register (often needed on powerpc). // Expansion is the act of changing a computation in an illegal type into a // computation in two identical registers of a smaller type. For example, // implementing i64 arithmetic in two i32 registers (often needed on 32-bit // targets). // //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" //===----------------------------------------------------------------------===// // Integer Result Promotion //===----------------------------------------------------------------------===// /// PromoteIntegerResult - This method is called when a result of a node is /// found to be in need of promotion to a larger type. At this point, the node /// may also have invalid operands or may have other results that need /// expansion, we just know that (at least) one result needs promotion. void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); // See if the target wants to custom expand this node. 
if (CustomLowerNode(N, N->getValueType(ResNo), true)) { LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n"); return; } switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "PromoteIntegerResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to promote this operator!"); case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break; case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break; case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break; case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break; case ISD::BITREVERSE: Res = PromoteIntRes_BITREVERSE(N); break; case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; case ISD::LOAD: Res = PromoteIntRes_LOAD(cast(N)); break; case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast(N)); break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); break; case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break; case ISD::SMIN: case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UMIN: case ISD::UMAX: Res = PromoteIntRes_ZExtIntBinOp(N); break; case ISD::SHL: Res = PromoteIntRes_SHL(N); break; case ISD::SIGN_EXTEND_INREG: Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break; case ISD::SRA: Res = PromoteIntRes_SRA(N); break; case ISD::SRL: Res = PromoteIntRes_SRL(N); break; case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break; case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break; case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break; case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break; case ISD::VECTOR_SHUFFLE: Res = PromoteIntRes_VECTOR_SHUFFLE(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break; case ISD::BUILD_VECTOR: Res = PromoteIntRes_BUILD_VECTOR(N); break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntRes_CONCAT_VECTORS(N); break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break; case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16(N); break; case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::ADD: case ISD::SUB: case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::SDIV: case ISD::SREM: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UDIV: case ISD::UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break; case ISD::SADDO: case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break; case ISD::UADDO: case ISD::USUBO: Res = PromoteIntRes_UADDSUBO(N, ResNo); break; case ISD::SMULO: case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break; case ISD::ADDCARRY: case ISD::SUBCARRY: Res = PromoteIntRes_ADDSUBCARRY(N, 
ResNo); break; case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast(N)); break; case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_CLR: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_SWAP: Res = PromoteIntRes_Atomic1(cast(N)); break; case ISD::ATOMIC_CMP_SWAP: case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); break; } // If the result is null then the sub-method took care of registering it. if (Res.getNode()) SetPromotedInteger(SDValue(N, ResNo), Res); } SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); return GetPromotedInteger(Op); } SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) { // Sign-extend the new bits, and continue the assertion. SDValue Op = SExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::AssertSext, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) { // Zero the new bits, and continue the assertion. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::AssertZext, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), ResVT, N->getChain(), N->getBasePtr(), N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), Op2, N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo) { if (ResNo == 1) { assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); EVT SVT = getSetCCResultType(N->getOperand(2).getValueType()); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); // Only use the result of getSetCCResultType if it is legal, // otherwise just use the promoted result type (NVT). if (!TLI.isTypeLegal(SVT)) SVT = NVT; SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3), N->getMemOperand()); ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); ReplaceValueWith(SDValue(N, 2), Res.getValue(2)); return Res.getValue(1); } SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Op3 = GetPromotedInteger(N->getOperand(3)); SDVTList VTs = DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), N->getBasePtr(), Op2, Op3, N->getMemOperand()); // Update the use to N with the newly created Res. 
for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i) ReplaceValueWith(SDValue(N, i), Res.getValue(i)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); SDLoc dl(N); switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: break; case TargetLowering::TypePromoteInteger: if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector()) // The input promotes to the same size. Convert the promoted value. return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp)); break; case TargetLowering::TypeSoftenFloat: // Promote the integer operand by hand. return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp)); case TargetLowering::TypePromoteFloat: { // Convert the promoted float by hand. - SDValue PromotedOp = GetPromotedFloat(InOp); - return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp); + if (!NOutVT.isVector()) + return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, GetPromotedFloat(InOp)); break; } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: break; case TargetLowering::TypeScalarizeVector: // Convert the element to an integer and promote it by hand. if (!NOutVT.isVector()) return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, BitConvertToInteger(GetScalarizedVector(InOp))); break; case TargetLowering::TypeSplitVector: { // For example, i32 = BITCAST v2i16 on alpha. Convert the split // pieces of the input into integers and reassemble in the final type. SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); Lo = BitConvertToInteger(Lo); Hi = BitConvertToInteger(Hi); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); InOp = DAG.getNode(ISD::ANY_EXTEND, dl, EVT::getIntegerVT(*DAG.getContext(), NOutVT.getSizeInBits()), JoinIntegers(Lo, Hi)); return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp); } case TargetLowering::TypeWidenVector: // The input is widened to the same size. Convert to the widened value. // Make sure that the outgoing value is not a vector, because this would // make us bitcast between two vectors which are legalized in different ways. 
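In the TypeSplitVector path of PromoteIntRes_BITCAST above, the two halves are bit-converted to integers, swapped on big-endian targets, and joined into one wider integer. A minimal standalone sketch of that join for two 16-bit halves (illustrative only, not the DAG helper itself):

#include <cassert>
#include <cstdint>

static uint32_t joinHalves(uint16_t Lo, uint16_t Hi, bool BigEndian) {
  if (BigEndian) {                               // split halves arrive swapped
    uint16_t Tmp = Lo; Lo = Hi; Hi = Tmp;        //   on big-endian targets
  }
  return (uint32_t)Hi << 16 | Lo;                // Hi lands in the upper bits
}

int main() {
  assert(joinHalves(0x3344, 0x1122, /*BigEndian=*/false) == 0x11223344u);
  assert(joinHalves(0x1122, 0x3344, /*BigEndian=*/true)  == 0x11223344u);
  return 0;
}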
if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector()) return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp)); } return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, CreateStackStoreLoad(InOp, OutVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); return DAG.getNode( ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), DAG.getConstant(DiffBits, dl, TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); } SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); return DAG.getNode( ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), DAG.getConstant(DiffBits, dl, TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { // The pair element type may be legal, or may not promote to the same type as // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases. return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), JoinIntegers(N->getOperand(0), N->getOperand(1))); } SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { EVT VT = N->getValueType(0); // FIXME there is no actual debug info here SDLoc dl(N); // Zero extend things like i1, sign extend everything else. It shouldn't // matter in theory which one we pick, but this tends to give better code? unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue Result = DAG.getNode(Opc, dl, TLI.getTypeToTransformTo(*DAG.getContext(), VT), SDValue(N, 0)); assert(isa(Result) && "Didn't constant fold ext?"); return Result; } SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); SDLoc dl(N); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); Op = DAG.getNode(N->getOpcode(), dl, NVT, Op); // Subtract off the extra leading bits in the bigger type. return DAG.getNode( ISD::SUB, dl, NVT, Op, DAG.getConstant(NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); if (N->getOpcode() == ISD::CTTZ) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off // the top of the original type. 
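Promoting these bit operations to a wider type needs the small corrections applied in this hunk: BSWAP shifts the result right by DiffBits, CTLZ subtracts DiffBits, and CTTZ pre-sets the bit just past the original width so that a zero input yields the original bit width (the TopBit OR that follows). A standalone sketch for i16 promoted to i32 (portable helpers, illustrative only):

#include <cassert>
#include <cstdint>

static unsigned ctz32(uint32_t X) {              // portable trailing-zero count
  unsigned N = 0;
  while (!(X & 1)) { X >>= 1; ++N; }             // caller guarantees X != 0
  return N;
}

static unsigned clz32(uint32_t X) {              // portable leading-zero count
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1) ++N;
  return N;
}

static uint32_t bswap32(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0xff00) | ((X & 0xff00) << 8) | (X << 24);
}

int main() {
  uint16_t V = 0x1bcd;                           // the original i16 value
  assert((bswap32(V) >> 16) == 0xcd1b);          // BSWAP: shift by DiffBits
  assert(clz32(V) - 16 == 3);                    // CTLZ: subtract DiffBits
  assert(ctz32(0x10000u | 0x00c8u) == 3);        // CTTZ: TopBit leaves nonzero
  assert(ctz32(0x10000u | 0x0000u) == 16);       //   inputs alone, zero -> 16
  return 0;
}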
auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(), OVT.getScalarSizeInBits()); Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT)); } return DAG.getNode(N->getOpcode(), dl, NVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0), N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NewOpc = N->getOpcode(); SDLoc dl(N); // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT // and SINT conversions are Custom, there is no way to tell which is // preferable. We choose SINT because that's the right thing on PPC.) if (N->getOpcode() == ISD::FP_TO_UINT && !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) && TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) NewOpc = ISD::FP_TO_SINT; SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0)); // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the // original operation was undefined anyway, so the assert is still correct. // // NOTE: fp-to-uint to fp-to-sint promotion guarantees zero extend. For example: // before legalization: fp-to-uint16, 65534. -> 0xfffe // after legalization: fp-to-sint32, 65534. -> 0x0000fffe return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, NVT, Res, DAG.getValueType(N->getValueType(0).getScalarType())); } SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); if (getTypeAction(N->getOperand(0).getValueType()) == TargetLowering::TypePromoteInteger) { SDValue Res = GetPromotedInteger(N->getOperand(0)); assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!"); // If the result and operand types are the same after promotion, simplify // to an in-register extension. if (NVT == Res.getValueType()) { // The high bits are not guaranteed to be anything. Insert an extend. if (N->getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, DAG.getValueType(N->getOperand(0).getValueType())); if (N->getOpcode() == ISD::ZERO_EXTEND) return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType().getScalarType()); assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!"); return Res; } } // Otherwise, just extend the original operand all the way to the larger type. return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(N) ? 
ISD::EXTLOAD : N->getExtensionType(); SDLoc dl(N); SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); SDLoc dl(N); SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), N->getMask(), ExtSrc0, N->getMemoryVT(), N->getMemOperand(), ISD::SEXTLOAD); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getValue()); assert(NVT == ExtSrc0.getValueType() && "Gather result type and the passThru agrument type should be the same"); SDLoc dl(N); SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(), N->getIndex(), N->getScale() }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), N->getMemoryVT(), dl, Ops, N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); EVT ValueVTs[] = { N->getValueType(0), NVT }; SDValue Ops[3] = { N->getOperand(0), N->getOperand(1) }; unsigned NumOps = N->getNumOperands(); assert(NumOps <= 3 && "Too many operands"); if (NumOps == 3) Ops[2] = N->getOperand(2); SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), DAG.getVTList(ValueVTs), makeArrayRef(Ops, NumOps)); // Modified the sum result - switch anything that used the old sum to use // the new one. ReplaceValueWith(SDValue(N, 0), Res); return SDValue(Res.getNode(), 1); } SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // The operation overflowed iff the result in the larger type is not the // sign extension of its truncation to the original type. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); EVT OVT = N->getOperand(0).getValueType(); EVT NVT = LHS.getValueType(); SDLoc dl(N); // Do the arithmetic in the larger type. unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB; SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS); // Calculate the overflow flag: sign extend the arithmetic result from // the original type. SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, DAG.getValueType(OVT)); // Overflowed if and only if this is not equal to Res. Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE); // Use the calculated overflow everywhere. 
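The overflow test above does the arithmetic in the wider type and then checks whether SIGN_EXTEND_INREG of the result reproduces the wide value. A minimal standalone sketch for an i8 SADDO carried out in i32 (the UADDO case further down uses zero extension the same way; helper name is made up):

#include <cassert>
#include <cstdint>

static bool saddo8(int8_t A, int8_t B, int32_t &Low) {
  int32_t Wide = (int32_t)A + (int32_t)B;        // sign-extend, add in i32
  Low = ((Wide & 0xff) ^ 0x80) - 0x80;           // SIGN_EXTEND_INREG to 8 bits
  return Low != Wide;                            // overflowed iff they differ
}

int main() {
  int32_t R;
  assert(!saddo8(100, 27, R) && R == 127);       // 127 fits in i8
  assert( saddo8(100, 28, R) && R == -128);      // 128 wraps: overflow
  return 0;
}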
ReplaceValueWith(SDValue(N, 1), Ofl); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) { SDValue Mask = N->getOperand(0); SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); return DAG.getNode(ISD::VSELECT, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(2)); SDValue RHS = GetPromotedInteger(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), N->getOperand(0), N->getOperand(1), LHS, RHS, N->getOperand(4)); } SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { EVT InVT = N->getOperand(0).getValueType(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT SVT = getSetCCResultType(InVT); // If we got back a type that needs to be promoted, this likely means the // the input type also needs to be promoted. So get the promoted type for // the input and try the query again. if (getTypeAction(SVT) == TargetLowering::TypePromoteInteger) { if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger) { InVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); SVT = getSetCCResultType(InVT); } else { // Input type isn't promoted, just use the default promoted type. SVT = NVT; } } SDLoc dl(N); assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() && "Vector compare must return a vector result!"); // Get the SETCC result using the canonical SETCC type. SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0), N->getOperand(1), N->getOperand(2)); // Convert to the expected type. return DAG.getSExtOrTrunc(SetCC, dl, NVT); } SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); return DAG.getNode(ISD::SHL, SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { // The input may have strange things in the top bits of the registers, but // these operations don't care. They may have weird bits going out, but // that too is okay if they are integer operations. SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = GetPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { // Sign extend the input. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { // Zero extend the input. 
SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { // The input value must be properly sign extended. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); return DAG.getNode(ISD::SRA, SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { // The input value must be properly zero extended. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); return DAG.getNode(ISD::SRL, SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res; SDValue InOp = N->getOperand(0); SDLoc dl(N); switch (getTypeAction(InOp.getValueType())) { default: llvm_unreachable("Unknown type action!"); case TargetLowering::TypeLegal: case TargetLowering::TypeExpandInteger: Res = InOp; break; case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(InOp); break; case TargetLowering::TypeSplitVector: { EVT InVT = InOp.getValueType(); assert(InVT.isVector() && "Cannot split scalar types"); unsigned NumElts = InVT.getVectorNumElements(); assert(NumElts == NVT.getVectorNumElements() && "Dst and Src must have the same number of elements"); assert(isPowerOf2_32(NumElts) && "Promoted vector type must be a power of two"); SDValue EOp1, EOp2; GetSplitVector(InOp, EOp1, EOp2); EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(), NumElts/2); EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1); EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2); } case TargetLowering::TypeWidenVector: { SDValue WideInOp = GetWidenedVector(InOp); // Truncate widened InOp. unsigned NumElem = WideInOp.getValueType().getVectorNumElements(); EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), N->getValueType(0).getScalarType(), NumElem); SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp); // Zero extend so that the elements are of same type as those of NVT EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(), NumElem); SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc); // Extract the low NVT subvector. MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx); } } // Truncate to NVT instead of VT return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); } SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // The operation overflowed iff the result in the larger type is not the // zero extension of its truncation to the original type. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); EVT OVT = N->getOperand(0).getValueType(); EVT NVT = LHS.getValueType(); SDLoc dl(N); // Do the arithmetic in the larger type. unsigned Opcode = N->getOpcode() == ISD::UADDO ? 
ISD::ADD : ISD::SUB; SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS); // Calculate the overflow flag: zero extend the arithmetic result from // the original type. SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT); // Overflowed if and only if this is not equal to Res. Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // We need to sign-extend the operands so the carry value computed by the // wide operation will be equivalent to the carry value computed by the // narrow operation. // An ADDCARRY can generate carry only if any of the operands has its // most significant bit set. Sign extension propagates the most significant // bit into the higher bits which means the extra bit that the narrow // addition would need (i.e. the carry) will be propagated through the higher // bits of the wide addition. // A SUBCARRY can generate borrow only if LHS < RHS and this property will be // preserved by sign extension. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); EVT ValueVTs[] = {LHS.getValueType(), N->getValueType(1)}; // Do the arithmetic in the wide type. SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), DAG.getVTList(ValueVTs), LHS, RHS, N->getOperand(2)); // Update the users of the original carry/borrow value. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return SDValue(Res.getNode(), 0); } SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { // Promote the overflow bit trivially. if (ResNo == 1) return PromoteIntRes_Overflow(N); SDValue LHS = N->getOperand(0), RHS = N->getOperand(1); SDLoc DL(N); EVT SmallVT = LHS.getValueType(); // To determine if the result overflowed in a larger type, we extend the // input to the larger type, do the multiply (checking if it overflows), // then also check the high bits of the result to see if overflow happened // there. if (N->getOpcode() == ISD::SMULO) { LHS = SExtPromotedInteger(LHS); RHS = SExtPromotedInteger(RHS); } else { LHS = ZExtPromotedInteger(LHS); RHS = ZExtPromotedInteger(RHS); } SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1)); SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS); // Overflow occurred if it occurred in the larger type, or if the high part // of the result does not zero/sign-extend the low part. Check this second // possibility first. SDValue Overflow; if (N->getOpcode() == ISD::UMULO) { // Unsigned overflow occurred if the high part is non-zero. SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul, DAG.getIntPtrConstant(SmallVT.getSizeInBits(), DL)); Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi, DAG.getConstant(0, DL, Hi.getValueType()), ISD::SETNE); } else { // Signed overflow occurred if the high part does not sign extend the low. SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(), Mul, DAG.getValueType(SmallVT)); Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE); } // The only other way for overflow to occur is if the multiplication in the // larger type itself overflowed. Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow, SDValue(Mul.getNode(), 1)); // Use the calculated overflow everywhere. 
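A minimal standalone sketch of the promoted UMULO logic above for i16 done in i32: zero-extend, multiply in the wide type, and report overflow when the high half of the product is non-zero. The extra "wide multiply itself overflowed" term cannot fire when the width is doubled, so it is omitted here (helper name is made up):

#include <cassert>
#include <cstdint>

static bool umulo16(uint16_t A, uint16_t B, uint16_t &Out) {
  uint32_t Wide = (uint32_t)A * (uint32_t)B;     // zero-extend, multiply wide
  Out = (uint16_t)Wide;                          // low half is the result
  return (Wide >> 16) != 0;                      // overflow iff high half set
}

int main() {
  uint16_t R;
  assert(!umulo16(255, 257, R) && R == 65535);   // 0xffff still fits
  assert( umulo16(256, 256, R) && R == 0);       // 0x10000 does not
  return 0;
}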
ReplaceValueWith(SDValue(N, 1), Overflow); return Mul; } SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); } SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { SDValue Chain = N->getOperand(0); // Get the chain. SDValue Ptr = N->getOperand(1); // Get the pointer. EVT VT = N->getValueType(0); SDLoc dl(N); MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT); // The argument is passed as NumRegs registers of type RegVT. SmallVector Parts(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2), N->getConstantOperandVal(3)); Chain = Parts[i].getValue(1); } // Handle endianness of the load. if (DAG.getDataLayout().isBigEndian()) std::reverse(Parts.begin(), Parts.end()); // Assemble the parts in the promoted type. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]); for (unsigned i = 1; i < NumRegs; ++i) { SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]); // Shift it to the right position and "or" it in. Part = DAG.getNode(ISD::SHL, dl, NVT, Part, DAG.getConstant(i * RegVT.getSizeInBits(), dl, TLI.getPointerTy(DAG.getDataLayout()))); Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part); } // Modified the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Chain); return Res; } //===----------------------------------------------------------------------===// // Integer Operand Promotion //===----------------------------------------------------------------------===// /// PromoteIntegerOperand - This method is called when the specified operand of /// the specified node is found to need promotion. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need promotion or expansion as well as the specified one. 
bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) { LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n"); return false; } switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to promote this operator's operand!"); case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break; case ISD::ATOMIC_STORE: Res = PromoteIntOp_ATOMIC_STORE(cast(N)); break; case ISD::BITCAST: Res = PromoteIntOp_BITCAST(N); break; case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break; case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break; case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break; case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break; case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast(N), OpNo); break; case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast(N), OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast(N), OpNo); break; case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast(N), OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntOp_EXTRACT_SUBVECTOR(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; case ISD::ADDCARRY: case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// PromoteSetCCOperands - Promote the operands of a comparison. This code is /// shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, ISD::CondCode CCCode) { // We have to insert explicit sign or zero extends. Note that we could // insert sign extends for ALL conditions, but zero extend is cheaper on // many machines (an AND instead of two shifts), so prefer it. 
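For SETEQ/SETNE the code below keeps the promoted operands as-is when both are already sign extensions of the original width (their effective bits fit), and otherwise zero-extends both before comparing. A minimal standalone sketch for i8 operands held in i32 registers (illustrative only):

#include <cassert>
#include <cstdint>

static bool isSExt8(int32_t V) {                 // effective bits fit in 8?
  return V == (((V & 0xff) ^ 0x80) - 0x80);      // equals its own sign-ext
}

static bool eq8(int32_t PromL, int32_t PromR) {  // promoted i8 equality
  if (isSExt8(PromL) && isSExt8(PromR))
    return PromL == PromR;                       // compare the wide values
  return (PromL & 0xff) == (PromR & 0xff);       // else zero-extend (mask)
}

int main() {
  assert( eq8(-1, -1));                          // both proper sign extensions
  assert( eq8(-1, 0x000000ff));                  // zero-extended copy of -1:
                                                 //   must mask before comparing
  assert(!eq8(1, 2));
  return 0;
}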
  switch (CCCode) {
  default: llvm_unreachable("Unknown integer comparison!");
  case ISD::SETEQ:
  case ISD::SETNE: {
    SDValue OpL = GetPromotedInteger(NewLHS);
    SDValue OpR = GetPromotedInteger(NewRHS);

    // We would prefer to promote the comparison operand with sign extension.
    // If the width of OpL/OpR excluding the duplicated sign bits is no greater
    // than the width of NewLHS/NewRHS, we can avoid inserting a real truncate
    // instruction, which is redundant eventually.
    unsigned OpLEffectiveBits =
        OpL.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(OpL) + 1;
    unsigned OpREffectiveBits =
        OpR.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(OpR) + 1;
    if (OpLEffectiveBits <= NewLHS.getScalarValueSizeInBits() &&
        OpREffectiveBits <= NewRHS.getScalarValueSizeInBits()) {
      NewLHS = OpL;
      NewRHS = OpR;
    } else {
      NewLHS = ZExtPromotedInteger(NewLHS);
      NewRHS = ZExtPromotedInteger(NewRHS);
    }
    break;
  }
  case ISD::SETUGE:
  case ISD::SETUGT:
  case ISD::SETULE:
  case ISD::SETULT:
    // ALL of these operations will work if we either sign or zero extend
    // the operands (including the unsigned comparisons!). Zero extend is
    // usually a simpler/cheaper operation, so prefer it.
    NewLHS = ZExtPromotedInteger(NewLHS);
    NewRHS = ZExtPromotedInteger(NewRHS);
    break;
  case ISD::SETGE:
  case ISD::SETGT:
  case ISD::SETLT:
  case ISD::SETLE:
    NewLHS = SExtPromotedInteger(NewLHS);
    NewRHS = SExtPromotedInteger(NewRHS);
    break;
  }
}

SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
  SDValue Op = GetPromotedInteger(N->getOperand(0));
  return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
}

SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
  SDValue Op2 = GetPromotedInteger(N->getOperand(2));
  return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
                       N->getChain(), N->getBasePtr(), Op2, N->getMemOperand());
}

SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
  // This should only occur in unusual situations like bitcasting to an
  // x86_fp80, so just turn it into a store+load
  return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
}

SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
  assert(OpNo == 2 && "Don't know how to promote this operand!");

  SDValue LHS = N->getOperand(2);
  SDValue RHS = N->getOperand(3);
  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(1))->get());

  // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always
  // legal types.
  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
                                        LHS, RHS, N->getOperand(4)), 0);
}

SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
  assert(OpNo == 1 && "only know how to promote condition");

  // Promote all the way up to the canonical SetCC type.
  SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other);

  // The chain (Op#0) and basic block destination (Op#2) are always legal types.
  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
                                        N->getOperand(2)), 0);
}

SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
  // Since the result type is legal, the operands must promote to it.
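  // Illustration: pairing two i16 halves into a legal i32 result computes
  // zext(Lo) | (Hi << 16); the code below builds exactly that, taking the
  // shift amount from the original operand width (OVT).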
EVT OVT = N->getOperand(0).getValueType(); SDValue Lo = ZExtPromotedInteger(N->getOperand(0)); SDValue Hi = GetPromotedInteger(N->getOperand(1)); assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?"); SDLoc dl(N); Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi, DAG.getConstant(OVT.getSizeInBits(), dl, TLI.getPointerTy(DAG.getDataLayout()))); return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { // The vector type is legal but the element type is not. This implies // that the vector is a power-of-two in length and that the element // type does not have a strange size (eg: it is not i1). EVT VecVT = N->getValueType(0); unsigned NumElts = VecVT.getVectorNumElements(); assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) && "Legal vector of one illegal element?"); // Promote the inserted value. The type does not need to match the // vector element type. Check that any extra bits introduced will be // truncated away. assert(N->getOperand(0).getValueSizeInBits() >= N->getValueType(0).getScalarSizeInBits() && "Type of inserted value narrower than vector element type!"); SmallVector NewOps; for (unsigned i = 0; i < NumElts; ++i) NewOps.push_back(GetPromotedInteger(N->getOperand(i))); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo) { if (OpNo == 1) { // Promote the inserted value. This is valid because the type does not // have to match the vector element type. // Check that any extra bits introduced will be truncated away. assert(N->getOperand(1).getValueSizeInBits() >= N->getValueType(0).getScalarSizeInBits() && "Type of inserted value narrower than vector element type!"); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), GetPromotedInteger(N->getOperand(1)), N->getOperand(2)), 0); } assert(OpNo == 2 && "Different operand and result vector types?"); // Promote the index. SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N), TLI.getVectorIdxTy(DAG.getDataLayout())); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Idx), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote // the operand in place. return SDValue(DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Only know how to promote the condition!"); SDValue Cond = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); if (N->getOpcode() == ISD::VSELECT) if (SDValue Res = WidenVSELECTAndMask(N)) return Res; // Promote all the way up to the canonical SetCC type. EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy; Cond = PromoteTargetBoolean(Cond, OpVT); return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(4))->get()); // The CC (#4) and the possible return values (#2 and #3) have legal types. 
  return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2),
                                        N->getOperand(3), N->getOperand(4)), 0);
}

SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) {
  assert(OpNo == 0 && "Don't know how to promote this operand!");

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get());

  // The CC (#2) is always legal.
  return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0);
}

SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
                                ZExtPromotedInteger(N->getOperand(1))), 0);
}

SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
  SDValue Op = GetPromotedInteger(N->getOperand(0));
  SDLoc dl(N);
  Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
                     DAG.getValueType(N->getOperand(0).getValueType()));
}

SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
  return SDValue(DAG.UpdateNodeOperands(N,
                                SExtPromotedInteger(N->getOperand(0))), 0);
}

SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
  SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
  SDLoc dl(N);

  SDValue Val = GetPromotedInteger(N->getValue());  // Get promoted value.

  // Truncate the value and store the result.
  return DAG.getTruncStore(Ch, dl, Val, Ptr,
                           N->getMemoryVT(), N->getMemOperand());
}

SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
                                              unsigned OpNo) {
  SDValue DataOp = N->getValue();
  EVT DataVT = DataOp.getValueType();
  SDValue Mask = N->getMask();
  SDLoc dl(N);
  bool TruncateStore = false;

  if (OpNo == 2) {
    // The mask comes before the data operand. If the data operand is legal,
    // we just promote the mask.
    // When the data operand has an illegal type, we should legalize the data
    // operand first. The mask will be promoted/split/widened according to
    // the data operand type.
    if (TLI.isTypeLegal(DataVT)) {
      Mask = PromoteTargetBoolean(Mask, DataVT);
      // Update in place.
SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[2] = Mask; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) return PromoteIntOp_MSTORE(N, 3); if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector) return WidenVecOp_MSTORE(N, 3); assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector); return SplitVecOp_MSTORE(N, 3); } else { // Data operand assert(OpNo == 3 && "Unexpected operand for promotion"); DataOp = GetPromotedInteger(DataOp); TruncateStore = true; } return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), TruncateStore, N->isCompressingStore()); } SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) { assert(OpNo == 2 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[OpNo] = Mask; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo) { SmallVector NewOps(N->op_begin(), N->op_end()); if (OpNo == 2) { // The Mask EVT DataVT = N->getValueType(0); NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); } else if (OpNo == 4) { // Need to sign extend the index since the bits will likely be used. NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); } else NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); SDValue Res = SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); // updated in place. if (Res.getNode() == N) return Res; ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return SDValue(); } SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo) { SmallVector NewOps(N->op_begin(), N->op_end()); if (OpNo == 2) { // The Mask EVT DataVT = N->getValue().getValueType(); NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); } else if (OpNo == 4) { // Need to sign extend the index since the bits will likely be used. 
NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); } else NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, ZExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { SDLoc dl(N); SDValue Op = GetPromotedInteger(N->getOperand(0)); Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op); return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType().getScalarType()); } SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) { assert(OpNo == 2 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Carry = N->getOperand(2); SDLoc DL(N); auto VT = getSetCCResultType(LHS.getValueType()); TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(VT); switch (BoolType) { case TargetLoweringBase::UndefinedBooleanContent: Carry = DAG.getAnyExtOrTrunc(Carry, DL, VT); break; case TargetLoweringBase::ZeroOrOneBooleanContent: Carry = DAG.getZExtOrTrunc(Carry, DL, VT); break; case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: Carry = DAG.getSExtOrTrunc(Carry, DL, VT); break; } return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0); } //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// /// ExpandIntegerResult - This method is called when the specified result of the /// specified node is found to need expansion. At this point, the node may also /// have invalid operands or may have other results that need promotion, we just /// know that (at least) one result needs expansion. void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; Lo = Hi = SDValue(); // See if the target wants to custom expand this node. 
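  // Expansion means the N-bit result is represented as a Lo/Hi pair of
  // half-width values, e.g. an i64 result becomes two i32 values on a 32-bit
  // target; a target may instead choose to custom lower the node here.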
if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandIntegerResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand the result of this operator!"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break; case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break; case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break; case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break; case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break; case ISD::FLT_ROUNDS_: ExpandIntRes_FLT_ROUNDS(N, Lo, Hi); break; case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break; case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break; case ISD::LOAD: ExpandIntRes_LOAD(cast(N), Lo, Hi); break; case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break; case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break; case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break; case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break; case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break; case ISD::TRUNCATE: ExpandIntRes_TRUNCATE(N, Lo, Hi); break; case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break; case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break; case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break; case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break; case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_CLR: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_SWAP: case ISD::ATOMIC_CMP_SWAP: { std::pair Tmp = ExpandAtomic(N); SplitInteger(Tmp.first, Lo, Hi); ReplaceValueWith(SDValue(N, 1), Tmp.second); break; } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { AtomicSDNode *AN = cast(N); SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other); SDValue Tmp = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs, N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), AN->getMemOperand()); // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine // success simply by comparing the loaded value against the ingoing // comparison. 
    SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp,
                                   N->getOperand(2), ISD::SETEQ);

    SplitInteger(Tmp, Lo, Hi);
    ReplaceValueWith(SDValue(N, 1), Success);
    ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1));
    break;
  }
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;

  case ISD::UMAX:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::SMIN: ExpandIntRes_MINMAX(N, Lo, Hi); break;

  case ISD::ADD:
  case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;

  case ISD::ADDC:
  case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break;

  case ISD::ADDE:
  case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;

  case ISD::ADDCARRY:
  case ISD::SUBCARRY: ExpandIntRes_ADDSUBCARRY(N, Lo, Hi); break;

  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;

  case ISD::SADDO:
  case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break;
  case ISD::UADDO:
  case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
  case ISD::UMULO:
  case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
  }

  // If Lo/Hi is null, the sub-method took care of registering results etc.
  if (Lo.getNode())
    SetExpandedInteger(SDValue(N, ResNo), Lo, Hi);
}

/// Lower an atomic node to the appropriate builtin call.
std::pair<SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
  unsigned Opc = Node->getOpcode();
  MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
  RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!");

  return ExpandChainLibCall(LC, Node, false);
}

/// N is a shift by a value that needs to be expanded,
/// and the shift amount is a constant 'Amt'. Expand the operation.
void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
                                             SDValue &Lo, SDValue &Hi) {
  SDLoc DL(N);
  // Expand the incoming operand to be shifted, so that we have its parts
  SDValue InL, InH;
  GetExpandedInteger(N->getOperand(0), InL, InH);

  // Though Amt shouldn't usually be 0, it's possible. E.g. when legalization
  // split a vector shift, like this: SHL <0, 2>.
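  // Worked example (illustrative): expanding an i64 SHL by the constant 40
  // into i32 parts yields Lo = 0 and Hi = InL << 8, because the shift amount
  // exceeds the 32-bit part width by 8.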
if (!Amt) { Lo = InL; Hi = InH; return; } EVT NVT = InL.getValueType(); unsigned VTBits = N->getValueType(0).getSizeInBits(); unsigned NVTBits = NVT.getSizeInBits(); EVT ShTy = N->getOperand(1).getValueType(); if (N->getOpcode() == ISD::SHL) { if (Amt.ugt(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getConstant(0, DL, NVT); Hi = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt - NVTBits, DL, ShTy)); } else if (Amt == NVTBits) { Lo = DAG.getConstant(0, DL, NVT); Hi = InL; } else { Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy)); Hi = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy)), DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(-Amt + NVTBits, DL, ShTy))); } return; } if (N->getOpcode() == ISD::SRL) { if (Amt.ugt(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy)); Hi = DAG.getConstant(0, DL, NVT); } else if (Amt == NVTBits) { Lo = InH; Hi = DAG.getConstant(0, DL, NVT); } else { Lo = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy)), DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(-Amt + NVTBits, DL, ShTy))); Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy)); } return; } assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); if (Amt.ugt(VTBits)) { Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy)); Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else if (Amt == NVTBits) { Lo = InH; Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else { Lo = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy)), DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(-Amt + NVTBits, DL, ShTy))); Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy)); } } /// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify /// this shift based on knowledge of the high bit of the shift amount. If we /// can tell this, we know that it is >= 32 or < 32, without knowing the actual /// shift amount. bool DAGTypeLegalizer:: ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Amt = N->getOperand(1); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ShTy = Amt.getValueType(); unsigned ShBits = ShTy.getScalarSizeInBits(); unsigned NVTBits = NVT.getScalarSizeInBits(); assert(isPowerOf2_32(NVTBits) && "Expanded integer type size not a power of two!"); SDLoc dl(N); APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); KnownBits Known; DAG.computeKnownBits(N->getOperand(1), Known); // If we don't know anything about the high bits, exit. if (((Known.Zero|Known.One) & HighBitMask) == 0) return false; // Get the incoming operand to be shifted. SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // If we know that any of the high bits of the shift amount are one, then we // can do this as a couple of simple shifts. if (Known.One.intersects(HighBitMask)) { // Mask out the high bit, which we know is set. 
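    // E.g. for a 64-bit shift performed in 32-bit parts, a known-set bit 5 of
    // the amount means the amount is >= 32, so the result comes entirely from
    // the other part shifted by (amount & 31).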
Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt, DAG.getConstant(~HighBitMask, dl, ShTy)); switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: Lo = DAG.getConstant(0, dl, NVT); // Low part is zero. Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part. return true; case ISD::SRL: Hi = DAG.getConstant(0, dl, NVT); // Hi part is zero. Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part. return true; case ISD::SRA: Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign extend high part. DAG.getConstant(NVTBits - 1, dl, ShTy)); Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part. return true; } } // If we know that all of the high bits of the shift amount are zero, then we // can do this as a couple of simple shifts. if (HighBitMask.isSubsetOf(Known.Zero)) { // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined // shift if x is zero. We can use XOR here because x is known to be smaller // than 32. SDValue Amt2 = DAG.getNode(ISD::XOR, dl, ShTy, Amt, DAG.getConstant(NVTBits - 1, dl, ShTy)); unsigned Op1, Op2; switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: Op1 = ISD::SHL; Op2 = ISD::SRL; break; case ISD::SRL: case ISD::SRA: Op1 = ISD::SRL; Op2 = ISD::SHL; break; } // When shifting right the arithmetic for Lo and Hi is swapped. if (N->getOpcode() != ISD::SHL) std::swap(InL, InH); // Use a little trick to get the bits that move from Lo to Hi. First // shift by one bit. SDValue Sh1 = DAG.getNode(Op2, dl, NVT, InL, DAG.getConstant(1, dl, ShTy)); // Then compute the remaining shift with amount-1. SDValue Sh2 = DAG.getNode(Op2, dl, NVT, Sh1, Amt2); Lo = DAG.getNode(N->getOpcode(), dl, NVT, InL, Amt); Hi = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(Op1, dl, NVT, InH, Amt),Sh2); if (N->getOpcode() != ISD::SHL) std::swap(Hi, Lo); return true; } return false; } /// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift /// of any size. bool DAGTypeLegalizer:: ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Amt = N->getOperand(1); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ShTy = Amt.getValueType(); unsigned NVTBits = NVT.getSizeInBits(); assert(isPowerOf2_32(NVTBits) && "Expanded integer type size not a power of two!"); SDLoc dl(N); // Get the incoming operand to be shifted. SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); SDValue NVBitsNode = DAG.getConstant(NVTBits, dl, ShTy); SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode); SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt); SDValue isShort = DAG.getSetCC(dl, getSetCCResultType(ShTy), Amt, NVBitsNode, ISD::SETULT); SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(ShTy), Amt, DAG.getConstant(0, dl, ShTy), ISD::SETEQ); SDValue LoS, HiS, LoL, HiL; switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: // Short: ShAmt < NVTBits LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); HiS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SHL, dl, NVT, InH, Amt), DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack)); // Long: ShAmt >= NVTBits LoL = DAG.getConstant(0, dl, NVT); // Lo part is zero. HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part. 
Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL); Hi = DAG.getSelect(dl, NVT, isZero, InH, DAG.getSelect(dl, NVT, isShort, HiS, HiL)); return true; case ISD::SRL: // Short: ShAmt < NVTBits HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); LoS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SRL, dl, NVT, InL, Amt), // FIXME: If Amt is zero, the following shift generates an undefined result // on some architectures. DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack)); // Long: ShAmt >= NVTBits HiL = DAG.getConstant(0, dl, NVT); // Hi part is zero. LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part. Lo = DAG.getSelect(dl, NVT, isZero, InL, DAG.getSelect(dl, NVT, isShort, LoS, LoL)); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; case ISD::SRA: // Short: ShAmt < NVTBits HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); LoS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SRL, dl, NVT, InL, Amt), DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack)); // Long: ShAmt >= NVTBits HiL = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign of Hi part. DAG.getConstant(NVTBits - 1, dl, ShTy)); LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part. Lo = DAG.getSelect(dl, NVT, isZero, InL, DAG.getSelect(dl, NVT, isShort, LoS, LoL)); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; } } static std::pair getExpandedMinMaxOps(int Op) { switch (Op) { default: llvm_unreachable("invalid min/max opcode"); case ISD::SMAX: return std::make_pair(ISD::SETGT, ISD::UMAX); case ISD::UMAX: return std::make_pair(ISD::SETUGT, ISD::UMAX); case ISD::SMIN: return std::make_pair(ISD::SETLT, ISD::UMIN); case ISD::UMIN: return std::make_pair(ISD::SETULT, ISD::UMIN); } } void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); ISD::NodeType LoOpc; ISD::CondCode CondC; std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode()); // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); // Value types EVT NVT = LHSL.getValueType(); EVT CCT = getSetCCResultType(NVT); // Hi part is always the same op Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH}); // We need to know whether to select Lo part that corresponds to 'winning' // Hi part or if Hi parts are equal. SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC); SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ); // Lo part corresponding to the 'winning' Hi part SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL); // Recursed Lo part if Hi parts are equal, this uses unsigned version SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL}); Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp); } void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); EVT NVT = LHSL.getValueType(); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; bool HasOpCarry = TLI.isOperationLegalOrCustom( N->getOpcode() == ISD::ADD ? 
ISD::ADDCARRY : ISD::SUBCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (HasOpCarry) { SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT)); if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps); } return; } // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support // them. TODO: Teach operation legalization how to expand unsupported // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate // a carry of type MVT::Glue, but there doesn't seem to be any way to // generate a value of this type in the expanded code sequence. bool hasCarry = TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::ADDC : ISD::SUBC, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (hasCarry) { SDVTList VTList = DAG.getVTList(NVT, MVT::Glue); if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } return; } bool hasOVF = TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); if (hasOVF) { EVT OvfVT = getSetCCResultType(NVT); SDVTList VTList = DAG.getVTList(NVT, OvfVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps); Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); } else { RevOpc = ISD::ADD; Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2)); } SDValue OVF = Lo.getValue(1); switch (BoolType) { case TargetLoweringBase::UndefinedBooleanContent: OVF = DAG.getNode(ISD::AND, dl, OvfVT, DAG.getConstant(1, dl, OvfVT), OVF); LLVM_FALLTHROUGH; case TargetLoweringBase::ZeroOrOneBooleanContent: OVF = DAG.getZExtOrTrunc(OVF, dl, NVT); Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF); break; case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: OVF = DAG.getSExtOrTrunc(OVF, dl, NVT); Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF); } return; } if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps); Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); return; } SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); SDValue Cmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[1], ISD::SETULT); SDValue Carry2 = DAG.getSelect(dl, NVT, Cmp2, DAG.getConstant(1, dl, NVT), Carry1); Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2); } else { Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); SDValue Borrow; if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); else Borrow = DAG.getSelect(dl, 
NVT, Cmp, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; if (N->getOpcode() == ISD::ADDC) { Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue); SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; SDValue HiOps[3] = { LHSH, RHSH }; Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDLoc dl(N); SDValue Ovf; bool HasOpCarry = TLI.isOperationLegalOrCustom( N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), LHS.getValueType())); if (HasOpCarry) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; GetExpandedInteger(LHS, LHSL, LHSH); GetExpandedInteger(RHS, RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; unsigned Opc = N->getOpcode() == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY; Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(Opc, dl, VTList, HiOps); Ovf = Hi.getValue(1); } else { // Expand the result by simply replacing it with the equivalent // non-overflow-checking operation. auto Opc = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB; SDValue Sum = DAG.getNode(Opc, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); // Calculate the overflow: addition overflows iff a + b < a, and subtraction // overflows iff a - b > a. auto Cond = N->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT; Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond); } // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Ovf); } void DAGTypeLegalizer::ExpandIntRes_ADDSUBCARRY(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. 
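  // ADDCARRY/SUBCARRY take (LHS, RHS, carry-in) and produce (result,
  // carry-out), so the expansion chains the low halves' carry-out into the
  // high-half operation, like a ripple carry across the two parts.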
SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; SDValue HiOps[3] = { LHSH, RHSH, SDValue() }; Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is any extension of the input (which degenerates to a copy). Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op); Hi = DAG.getUNDEF(NVT); // The high part is undefined. } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); } } void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); EVT EVT = cast(N->getOperand(1))->getVT(); unsigned NVTBits = NVT.getSizeInBits(); unsigned EVTBits = EVT.getSizeInBits(); if (NVTBits < EVTBits) { Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), EVTBits - NVTBits))); } else { Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part replicates the sign bit of Lo, make it explicit. Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(NVTBits - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } } void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); EVT EVT = cast(N->getOperand(1))->getVT(); unsigned NVTBits = NVT.getSizeInBits(); unsigned EVTBits = EVT.getSizeInBits(); if (NVTBits < EVTBits) { Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), EVTBits - NVTBits))); } else { Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part must be zero, make it explicit. Hi = DAG.getConstant(0, dl, NVT); } } void DAGTypeLegalizer::ExpandIntRes_BITREVERSE(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. Lo = DAG.getNode(ISD::BITREVERSE, dl, Lo.getValueType(), Lo); Hi = DAG.getNode(ISD::BITREVERSE, dl, Hi.getValueType(), Hi); } void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. 
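  // Byte-swapping a double-width value swaps the two halves as well as the
  // bytes within them: bswap(Hi:Lo) == bswap(Lo):bswap(Hi), which is why the
  // parts were fetched swapped above.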
Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo); Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi); } void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NBitWidth = NVT.getSizeInBits(); auto Constant = cast(N); const APInt &Cst = Constant->getAPIntValue(); bool IsTarget = Constant->isTargetOpcode(); bool IsOpaque = Constant->isOpaque(); SDLoc dl(N); Lo = DAG.getConstant(Cst.trunc(NBitWidth), dl, NVT, IsTarget, IsOpaque); Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), dl, NVT, IsTarget, IsOpaque); } void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); SDValue HiNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, DAG.getConstant(0, dl, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo); SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ, DAG.getNode(ISD::ADD, dl, NVT, LoLZ, DAG.getConstant(NVT.getSizeInBits(), dl, NVT))); Hi = DAG.getConstant(0, dl, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo), DAG.getNode(ISD::CTPOP, dl, NVT, Hi)); Hi = DAG.getConstant(0, dl, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, DAG.getConstant(0, dl, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo); SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ, DAG.getNode(ISD::ADD, dl, NVT, HiLZ, DAG.getConstant(NVT.getSizeInBits(), dl, NVT))); Hi = DAG.getConstant(0, dl, NVT); } void DAGTypeLegalizer::ExpandIntRes_FLT_ROUNDS(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NBitWidth = NVT.getSizeInBits(); EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); Lo = DAG.getNode(ISD::FLT_ROUNDS_, dl, NVT); // The high part is the sign of Lo, as -1 is a valid value for FLT_ROUNDS Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(NBitWidth - 1, dl, ShiftAmtTy)); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) Op = GetPromotedFloat(Op); RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) Op = GetPromotedFloat(Op); RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != 
RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi) { if (ISD::isNormalLoad(N)) { ExpandRes_NormalLoad(N, Lo, Hi); return; } assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); ISD::LoadExtType ExtType = N->getExtensionType(); unsigned Alignment = N->getAlignment(); MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); assert(NVT.isByteSized() && "Expanded type not byte sized!"); if (N->getMemoryVT().bitsLE(NVT)) { EVT MemVT = N->getMemoryVT(); Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT, Alignment, MMOFlags, AAInfo); // Remember the chain. Ch = Lo.getValue(1); if (ExtType == ISD::SEXTLOAD) { // The high part is obtained by SRA'ing all but one of the bits of the // lo part. unsigned LoSize = Lo.getValueSizeInBits(); Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else if (ExtType == ISD::ZEXTLOAD) { // The high part is just a zero. Hi = DAG.getConstant(0, dl, NVT); } else { assert(ExtType == ISD::EXTLOAD && "Unknown extload!"); // The high part is undefined. Hi = DAG.getUNDEF(NVT); } } else if (DAG.getDataLayout().isLittleEndian()) { // Little-endian - low bits are at low addresses. Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), Alignment, MMOFlags, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits); // Increment the pointer to the other half. unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); } else { // Big-endian - high bits are at low addresses. Favor aligned loads at // the cost of some bit-fiddling. EVT MemVT = N->getMemoryVT(); unsigned EBytes = MemVT.getStoreSize(); unsigned IncrementSize = NVT.getSizeInBits()/8; unsigned ExcessBits = (EBytes - IncrementSize)*8; // Load both the high bits and maybe some of the low bits. Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), Alignment, MMOFlags, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); // Load the rest of the low bits. Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. 
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); if (ExcessBits < NVT.getSizeInBits()) { // Transfer low bits from the bottom of Hi to the top of Lo. Lo = DAG.getNode( ISD::OR, dl, NVT, Lo, DAG.getNode(ISD::SHL, dl, NVT, Hi, DAG.getConstant(ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout())))); // Move high bits to the right position in Hi. Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT, Hi, DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout()))); } } // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Ch); } void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); SDValue LL, LH, RL, RH; GetExpandedInteger(N->getOperand(0), LL, LH); GetExpandedInteger(N->getOperand(1), RL, RH); Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL); Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH); } void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); SDValue LL, LH, RL, RH; GetExpandedInteger(N->getOperand(0), LL, LH); GetExpandedInteger(N->getOperand(1), RL, RH); if (TLI.expandMUL(N, Lo, Hi, NVT, DAG, TargetLowering::MulExpansionKind::OnlyLegalOrCustom, LL, LH, RL, RH)) return; // If nothing else, we can make a libcall. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::MUL_I16; else if (VT == MVT::i32) LC = RTLIB::MUL_I32; else if (VT == MVT::i64) LC = RTLIB::MUL_I64; else if (VT == MVT::i128) LC = RTLIB::MUL_I128; if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) { // We'll expand the multiplication by brute force because we have no other // options. This is a trivially-generalized version of the code from // Hacker's Delight (itself derived from Knuth's Algorithm M from section // 4.3.1). unsigned Bits = NVT.getSizeInBits(); unsigned HalfBits = Bits >> 1; SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, NVT); SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask); SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask); SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL); SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask); EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) { // The type from TLI is too small to fit the shift amount we want. // Override it with i32. The shift will have to be legalized. 
ShiftAmtTy = MVT::i32; } SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy); SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift); SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift); SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift); SDValue U = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH); SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask); SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift); SDValue V = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL); SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift); SDValue W = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH), DAG.getNode(ISD::ADD, dl, NVT, UH, VH)); Lo = DAG.getNode(ISD::ADD, dl, NVT, TL, DAG.getNode(ISD::SHL, dl, NVT, V, Shift)); Hi = DAG.getNode(ISD::ADD, dl, NVT, W, DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::MUL, dl, NVT, RH, LL), DAG.getNode(ISD::MUL, dl, NVT, RL, LH))); return; } SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other); SDValue R = DAG.getNode(N->getOpcode(), DL, VTs, N->getOperand(0)); Lo = R.getValue(0); Hi = R.getValue(1); ReplaceValueWith(SDValue(N, 1), R.getValue(2)); } void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDLoc dl(Node); // Expand the result by simply replacing it with the equivalent // non-overflow-checking operation. SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); // Compute the overflow. // // LHSSign -> LHS >= 0 // RHSSign -> RHS >= 0 // SumSign -> Sum >= 0 // // Add: // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) // Sub: // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) // EVT OType = Node->getValueType(1); SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType()); SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, Node->getOpcode() == ISD::SADDO ? ISD::SETEQ : ISD::SETNE); SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE); SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); // Use the calculated overflow everywhere. 
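  // E.g. for i8, 100 + 100 wraps to -56: the operands' signs match but differ
  // from the sum's sign, so the predicate above correctly reports overflow.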
ReplaceValueWith(SDValue(Node, 1), Cmp); } void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(0), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::SDIV_I16; else if (VT == MVT::i32) LC = RTLIB::SDIV_I32; else if (VT == MVT::i64) LC = RTLIB::SDIV_I64; else if (VT == MVT::i128) LC = RTLIB::SDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); // If we can emit an efficient shift operation, do so now. Check to see if // the RHS is a constant. if (ConstantSDNode *CN = dyn_cast(N->getOperand(1))) return ExpandShiftByConstant(N, CN->getAPIntValue(), Lo, Hi); // If we can determine that the high bit of the shift is zero or one, even if // the low bits are variable, emit this shift in an optimized form. if (ExpandShiftWithKnownAmountBit(N, Lo, Hi)) return; // If this target supports shift_PARTS, use it. First, map to the _PARTS opc. unsigned PartsOpc; if (N->getOpcode() == ISD::SHL) { PartsOpc = ISD::SHL_PARTS; } else if (N->getOpcode() == ISD::SRL) { PartsOpc = ISD::SRL_PARTS; } else { assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); PartsOpc = ISD::SRA_PARTS; } // Next check to see if the target supports this SHL_PARTS operation or if it // will custom expand it. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT); if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) || Action == TargetLowering::Custom) { // Expand the subcomponents. SDValue LHSL, LHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); EVT VT = LHSL.getValueType(); // If the shift amount operand is coming from a vector legalization it may // have an illegal type. Fix that first by casting the operand, otherwise // the new SHL_PARTS operation would need further legalization. SDValue ShiftOp = N->getOperand(1); EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); assert(ShiftTy.getScalarSizeInBits() >= Log2_32_Ceil(VT.getScalarSizeInBits()) && "ShiftAmountTy is too small to cover the range of this type!"); if (ShiftOp.getValueType() != ShiftTy) ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy); SDValue Ops[] = { LHSL, LHSH, ShiftOp }; Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops); Hi = Lo.getValue(1); return; } // Otherwise, emit a libcall. 
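  // On most targets these RTLIB entries resolve to compiler-rt/libgcc helper
  // routines (e.g. an __ashldi3-style function for a 64-bit left shift); the
  // exact symbol names depend on the target's runtime library configuration.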
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; bool isSigned; if (N->getOpcode() == ISD::SHL) { isSigned = false; /*sign irrelevant*/ if (VT == MVT::i16) LC = RTLIB::SHL_I16; else if (VT == MVT::i32) LC = RTLIB::SHL_I32; else if (VT == MVT::i64) LC = RTLIB::SHL_I64; else if (VT == MVT::i128) LC = RTLIB::SHL_I128; } else if (N->getOpcode() == ISD::SRL) { isSigned = false; if (VT == MVT::i16) LC = RTLIB::SRL_I16; else if (VT == MVT::i32) LC = RTLIB::SRL_I32; else if (VT == MVT::i64) LC = RTLIB::SRL_I64; else if (VT == MVT::i128) LC = RTLIB::SRL_I128; } else { assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); isSigned = true; if (VT == MVT::i16) LC = RTLIB::SRA_I16; else if (VT == MVT::i32) LC = RTLIB::SRA_I32; else if (VT == MVT::i64) LC = RTLIB::SRA_I64; else if (VT == MVT::i128) LC = RTLIB::SRA_I128; } if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi); return; } if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi)) llvm_unreachable("Unsupported shift!"); } void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is sign extension of the input (degenerates to a copy). Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0)); // The high part is obtained by SRA'ing all but one of the bits of low part. unsigned LoSize = NVT.getSizeInBits(); Hi = DAG.getNode( ISD::SRA, dl, NVT, Lo, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); unsigned ExcessBits = Op.getValueSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); } } void DAGTypeLegalizer:: ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT EVT = cast(N->getOperand(1))->getVT(); if (EVT.bitsLE(Lo.getValueType())) { // sext_inreg the low part if needed. Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo, N->getOperand(1)); // The high part gets the sign extension from the lo-part. This handles // things like sextinreg V:i64 from i8. Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo, DAG.getConstant(Hi.getValueSizeInBits() - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else { // For example, extension of an i48 to an i64. Leave the low part alone, // sext_inreg the high part. 
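    // E.g. sext_inreg from i48 with the i64 expanded into two i32 parts: the
    // low 32 bits are already correct, and ExcessBits = 48 - 32 = 16 is the
    // number of significant bits in the high part to sign-extend from.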
unsigned ExcessBits = EVT.getSizeInBits() - Lo.getValueSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); } } void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(1), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::SREM_I16; else if (VT == MVT::i32) LC = RTLIB::SREM_I32; else if (VT == MVT::i64) LC = RTLIB::SREM_I64; else if (VT == MVT::i128) LC = RTLIB::SREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0)); Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(), N->getOperand(0), DAG.getConstant(NVT.getSizeInBits(), dl, TLI.getPointerTy(DAG.getDataLayout()))); Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); // A divide for UMULO should be faster than a function call. if (N->getOpcode() == ISD::UMULO) { SDValue LHS = N->getOperand(0), RHS = N->getOperand(1); SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS); SplitInteger(MUL, Lo, Hi); // A divide for UMULO will be faster than a function call. Select to // make sure we aren't using 0. SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT), RHS, DAG.getConstant(0, dl, VT), ISD::SETEQ); SDValue NotZero = DAG.getSelect(dl, VT, isZero, DAG.getConstant(1, dl, VT), RHS); SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero); SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS, ISD::SETNE); Overflow = DAG.getSelect(dl, N->getValueType(1), isZero, DAG.getConstant(0, dl, N->getValueType(1)), Overflow); ReplaceValueWith(SDValue(N, 1), Overflow); return; } Type *RetTy = VT.getTypeForEVT(*DAG.getContext()); EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext()); // Replace this with a libcall that will check overflow. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i32) LC = RTLIB::MULO_I32; else if (VT == MVT::i64) LC = RTLIB::MULO_I64; else if (VT == MVT::i128) LC = RTLIB::MULO_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XMULO!"); SDValue Temp = DAG.CreateStackTemporary(PtrVT); // Temporary for the overflow value, default it to zero. SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, DAG.getConstant(0, dl, PtrVT), Temp, MachinePointerInfo()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (const SDValue &Op : N->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; Entry.IsSExt = true; Entry.IsZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. 
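  // The overflow libcall is expected to write a nonzero value through this
  // pointer when the multiplication overflows; that is why the stack
  // temporary was pre-initialized to zero above and is compared against zero
  // after the call.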
Entry.Node = Temp; Entry.Ty = PtrTy->getPointerTo(); Entry.IsSExt = true; Entry.IsZExt = false; Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) .setSExtResult(); std::pair CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, MachinePointerInfo()); SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2, DAG.getConstant(0, dl, PtrVT), ISD::SETNE); // Use the overflow from the libcall everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); } void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(0), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UDIV_I16; else if (VT == MVT::i32) LC = RTLIB::UDIV_I32; else if (VT == MVT::i64) LC = RTLIB::UDIV_I64; else if (VT == MVT::i128) LC = RTLIB::UDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(1), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UREM_I16; else if (VT == MVT::i32) LC = RTLIB::UREM_I32; else if (VT == MVT::i64) LC = RTLIB::UREM_I64; else if (VT == MVT::i128) LC = RTLIB::UREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is zero extension of the input (degenerates to a copy). Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0)); Hi = DAG.getConstant(0, dl, NVT); // The high part is just a zero. } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. 
SplitInteger(Res, Lo, Hi); unsigned ExcessBits = Op.getValueSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getZeroExtendInReg(Hi, dl, EVT::getIntegerVT(*DAG.getContext(), ExcessBits)); } } void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = cast(N)->getMemoryVT(); SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue Swap = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, cast(N)->getMemoryVT(), VTs, N->getOperand(0), N->getOperand(1), Zero, Zero, cast(N)->getMemOperand()); ReplaceValueWith(SDValue(N, 0), Swap.getValue(0)); ReplaceValueWith(SDValue(N, 1), Swap.getValue(2)); } //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// /// ExpandIntegerOperand - This method is called when the specified operand of /// the specified node is found to need expansion. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need promotion or expansion as well as the specified one. bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand this operator's operand!"); case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break; case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break; case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break; case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; case ISD::UINT_TO_FP: Res = ExpandIntOp_UINT_TO_FP(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = ExpandIntOp_Shift(N); break; case ISD::RETURNADDR: case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break; case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// IntegerExpandSetCCOperands - Expand the operands of a comparison. This code /// is shared among BR_CC, SELECT_CC, and SETCC handlers. 
void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
                                                  SDValue &NewRHS,
                                                  ISD::CondCode &CCCode,
                                                  const SDLoc &dl) {
  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
  GetExpandedInteger(NewLHS, LHSLo, LHSHi);
  GetExpandedInteger(NewRHS, RHSLo, RHSHi);

  if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
    if (RHSLo == RHSHi) {
      if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
        if (RHSCST->isAllOnesValue()) {
          // Equality comparison to -1.
          NewLHS = DAG.getNode(ISD::AND, dl,
                               LHSLo.getValueType(), LHSLo, LHSHi);
          NewRHS = RHSLo;
          return;
        }
      }
    }

    NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo);
    NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi);
    NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS);
    NewRHS = DAG.getConstant(0, dl, NewLHS.getValueType());
    return;
  }

  // If this is a comparison of the sign bit, just look at the top part.
  // X > -1, x < 0
  if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
    if ((CCCode == ISD::SETLT && CST->isNullValue()) ||    // X < 0
        (CCCode == ISD::SETGT && CST->isAllOnesValue())) { // X > -1
      NewLHS = LHSHi;
      NewRHS = RHSHi;
      return;
    }

  // FIXME: This generated code sucks.
  ISD::CondCode LowCC;
  switch (CCCode) {
  default: llvm_unreachable("Unknown integer setcc!");
  case ISD::SETLT:
  case ISD::SETULT: LowCC = ISD::SETULT; break;
  case ISD::SETGT:
  case ISD::SETUGT: LowCC = ISD::SETUGT; break;
  case ISD::SETLE:
  case ISD::SETULE: LowCC = ISD::SETULE; break;
  case ISD::SETGE:
  case ISD::SETUGE: LowCC = ISD::SETUGE; break;
  }

  // LoCmp = lo(op1) < lo(op2) // Always unsigned comparison
  // HiCmp = hi(op1) < hi(op2) // Signedness depends on operands
  // dest  = hi(op1) == hi(op2) ? LoCmp : HiCmp;

  // NOTE: on targets without efficient SELECT of bools, we can always use
  // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
  TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true,
                                                 nullptr);
  SDValue LoCmp, HiCmp;
  if (TLI.isTypeLegal(LHSLo.getValueType()) &&
      TLI.isTypeLegal(RHSLo.getValueType()))
    LoCmp = TLI.SimplifySetCC(getSetCCResultType(LHSLo.getValueType()), LHSLo,
                              RHSLo, LowCC, false, DagCombineInfo, dl);
  if (!LoCmp.getNode())
    LoCmp = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo,
                         RHSLo, LowCC);
  if (TLI.isTypeLegal(LHSHi.getValueType()) &&
      TLI.isTypeLegal(RHSHi.getValueType()))
    HiCmp = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi,
                              RHSHi, CCCode, false, DagCombineInfo, dl);
  if (!HiCmp.getNode())
    HiCmp =
        DAG.getNode(ISD::SETCC, dl, getSetCCResultType(LHSHi.getValueType()),
                    LHSHi, RHSHi, DAG.getCondCode(CCCode));

  ConstantSDNode *LoCmpC = dyn_cast<ConstantSDNode>(LoCmp.getNode());
  ConstantSDNode *HiCmpC = dyn_cast<ConstantSDNode>(HiCmp.getNode());

  bool EqAllowed = (CCCode == ISD::SETLE || CCCode == ISD::SETGE ||
                    CCCode == ISD::SETUGE || CCCode == ISD::SETULE);

  if ((EqAllowed && (HiCmpC && HiCmpC->isNullValue())) ||
      (!EqAllowed && ((HiCmpC && (HiCmpC->getAPIntValue() == 1)) ||
                      (LoCmpC && LoCmpC->isNullValue())))) {
    // For LE / GE, if high part is known false, ignore the low part.
    // For LT / GT: if low part is known false, return the high part.
    //              if high part is known true, ignore the low part.
    NewLHS = HiCmp;
    NewRHS = SDValue();
    return;
  }

  if (LHSHi == RHSHi) {
    // Comparing the low bits is enough.
    NewLHS = LoCmp;
    NewRHS = SDValue();
    return;
  }

  // Lower with SETCCCARRY if the target supports it.
EVT HiVT = LHSHi.getValueType(); EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT); bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT); // FIXME: Make all targets support this, then remove the other lowering. if (HasSETCCCARRY) { // SETCCCARRY can detect < and >= directly. For > and <=, flip // operands and condition code. bool FlipOperands = false; switch (CCCode) { case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; case ISD::SETUGT: CCCode = ISD::SETULT; FlipOperands = true; break; case ISD::SETLE: CCCode = ISD::SETGE; FlipOperands = true; break; case ISD::SETULE: CCCode = ISD::SETUGE; FlipOperands = true; break; default: break; } if (FlipOperands) { std::swap(LHSLo, RHSLo); std::swap(LHSHi, RHSHi); } // Perform a wide subtraction, feeding the carry from the low part into // SETCCCARRY. The SETCCCARRY operation is essentially looking at the high // part of the result of LHS - RHS. It is negative iff LHS < RHS. It is // zero or positive iff LHS >= RHS. EVT LoVT = LHSLo.getValueType(); SDVTList VTList = DAG.getVTList(LoVT, getSetCCResultType(LoVT)); SDValue LowCmp = DAG.getNode(ISD::USUBO, dl, VTList, LHSLo, RHSLo); SDValue Res = DAG.getNode(ISD::SETCCCARRY, dl, getSetCCResultType(HiVT), LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); NewLHS = Res; NewRHS = SDValue(); return; } NewLHS = TLI.SimplifySetCC(getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ, false, DagCombineInfo, dl); if (!NewLHS.getNode()) NewLHS = DAG.getSetCC(dl, getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ); NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), NewLHS, LoCmp, HiCmp); NewRHS = SDValue(); } SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) { SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3); ISD::CondCode CCCode = cast(N->getOperand(1))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(4))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(2))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, use it. if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; } // Otherwise, update N to have the operands specified. 
return SDValue( DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Carry = N->getOperand(2); SDValue Cond = N->getOperand(3); SDLoc dl = SDLoc(N); SDValue LHSLo, LHSHi, RHSLo, RHSHi; GetExpandedInteger(LHS, LHSLo, LHSHi); GetExpandedInteger(RHS, RHSLo, RHSHi); // Expand to a SUBE for the low part and a smaller SETCCCARRY for the high. SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType()); SDValue LowCmp = DAG.getNode(ISD::SUBCARRY, dl, VTList, LHSLo, RHSLo, Carry); return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi, LowCmp.getValue(1), Cond); } SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the // upper half of the shift amount is zero. Just use the lower half. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(1), Lo, Hi); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { // The argument of RETURNADDR / FRAMEADDR builtin is 32 bit contant. This // surely makes pretty nice problems on 8/16 bit targets. Just truncate this // constant to valid type. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(0), Lo, Hi); return SDValue(DAG.UpdateNodeOperands(N, Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT DstVT = N->getValueType(0); RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { if (ISD::isNormalStore(N)) return ExpandOp_NormalStore(N, OpNo); assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); assert(OpNo == 1 && "Can only expand the stored value so far"); EVT VT = N->getOperand(1).getValueType(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); unsigned Alignment = N->getAlignment(); MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); SDValue Lo, Hi; assert(NVT.isByteSized() && "Expanded type not byte sized!"); if (N->getMemoryVT().bitsLE(NVT)) { GetExpandedInteger(N->getValue(), Lo, Hi); return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), N->getMemoryVT(), Alignment, MMOFlags, AAInfo); } if (DAG.getDataLayout().isLittleEndian()) { // Little-endian - low bits are at low addresses. GetExpandedInteger(N->getValue(), Lo, Hi); Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits); // Increment the pointer to the other half. unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize); Hi = DAG.getTruncStore( Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } // Big-endian - high bits are at low addresses. Favor aligned stores at // the cost of some bit-fiddling. 
GetExpandedInteger(N->getValue(), Lo, Hi); EVT ExtVT = N->getMemoryVT(); unsigned EBytes = ExtVT.getStoreSize(); unsigned IncrementSize = NVT.getSizeInBits()/8; unsigned ExcessBits = (EBytes - IncrementSize)*8; EVT HiVT = EVT::getIntegerVT(*DAG.getContext(), ExtVT.getSizeInBits() - ExcessBits); if (ExcessBits < NVT.getSizeInBits()) { // Transfer high bits from the top of Lo to the bottom of Hi. Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi, DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout()))); Hi = DAG.getNode( ISD::OR, dl, NVT, Hi, DAG.getNode(ISD::SRL, dl, NVT, Lo, DAG.getConstant(ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout())))); } // Store both the high bits and maybe some of the low bits. Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, Alignment, MMOFlags, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize); // Store the lowest ExcessBits bits in the second half. Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) { SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // Just truncate the low part of the source. return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), InL); } SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); SDLoc dl(N); // The following optimization is valid only if every value in SrcVT (when // treated as signed) is representable in DstVT. Check that the mantissa // size of DstVT is >= than the number of bits in SrcVT -1. const fltSemantics &sem = DAG.EVTToAPFloatSemantics(DstVT); if (APFloat::semanticsPrecision(sem) >= SrcVT.getSizeInBits()-1 && TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){ // Do a signed conversion then adjust the result. SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op); SignedConv = TLI.LowerOperation(SignedConv, DAG); // The result of the signed conversion needs adjusting if the 'sign bit' of // the incoming integer was set. To handle this, we dynamically test to see // if it is set, and, if so, add a fudge factor. const uint64_t F32TwoE32 = 0x4F800000ULL; const uint64_t F32TwoE64 = 0x5F800000ULL; const uint64_t F32TwoE128 = 0x7F800000ULL; APInt FF(32, 0); if (SrcVT == MVT::i32) FF = APInt(32, F32TwoE32); else if (SrcVT == MVT::i64) FF = APInt(32, F32TwoE64); else if (SrcVT == MVT::i128) FF = APInt(32, F32TwoE128); else llvm_unreachable("Unsupported UINT_TO_FP!"); // Check whether the sign bit is set. SDValue Lo, Hi; GetExpandedInteger(Op, Lo, Hi); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Hi.getValueType()), Hi, DAG.getConstant(0, dl, Hi.getValueType()), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF.zext(64)), TLI.getPointerTy(DAG.getDataLayout())); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 
    SDValue Zero = DAG.getIntPtrConstant(0, dl);
    SDValue Four = DAG.getIntPtrConstant(4, dl);
    if (DAG.getDataLayout().isBigEndian())
      std::swap(Zero, Four);
    SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet,
                                   Zero, Four);
    unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
    FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(),
                           FudgePtr, Offset);
    Alignment = std::min(Alignment, 4u);

    // Load the value out, extending it from f32 to the destination float type.
    // FIXME: Avoid the extend by constructing the right constant pool?
    SDValue Fudge = DAG.getExtLoad(
        ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
        Alignment);
    return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
  }

  // Otherwise, use a libcall.
  RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Don't know how to expand this UINT_TO_FP!");
  return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first;
}

SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
  SDLoc dl(N);
  SDValue Swap =
      DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast<AtomicSDNode>(N)->getMemoryVT(),
                    N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    cast<AtomicSDNode>(N)->getMemOperand());
  return Swap.getValue(1);
}

SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
  SDValue InOp0 = N->getOperand(0);
  EVT InVT = InOp0.getValueType();

  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
  unsigned OutNumElems = OutVT.getVectorNumElements();
  EVT NOutVTElem = NOutVT.getVectorElementType();

  SDLoc dl(N);
  SDValue BaseIdx = N->getOperand(1);

  SmallVector<SDValue, 8> Ops;
  Ops.reserve(OutNumElems);
  for (unsigned i = 0; i != OutNumElems; ++i) {
    // Extract the element from the original vector.
    SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(), BaseIdx,
                                DAG.getConstant(i, dl, BaseIdx.getValueType()));
    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              InVT.getVectorElementType(),
                              N->getOperand(0), Index);

    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext);
    // Insert the converted element to the new vector.
    Ops.push_back(Op);
  }

  return DAG.getBuildVector(NOutVT, dl, Ops);
}

SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
  ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  ArrayRef<int> NewMask = SV->getMask().slice(0, VT.getVectorNumElements());

  SDValue V0 = GetPromotedInteger(N->getOperand(0));
  SDValue V1 = GetPromotedInteger(N->getOperand(1));
  EVT OutVT = V0.getValueType();

  return DAG.getVectorShuffle(OutVT, dl, V0, V1, NewMask);
}

SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
  unsigned NumElems = N->getNumOperands();
  EVT NOutVTElem = NOutVT.getVectorElementType();

  SDLoc dl(N);

  SmallVector<SDValue, 8> Ops;
  Ops.reserve(NumElems);
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Op;
    // BUILD_VECTOR integer operand types are allowed to be larger than the
    // result's element type. This may still be true after the promotion. For
    // example, we might be promoting ( = BV <i32>, <i32>, ...) to
    // (v?i16 = BV <i32>, <i32>, ...), and we can't any_extend <i32> to <i16>.
if (N->getOperand(i).getValueType().bitsLT(NOutVTElem)) Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i)); else Op = N->getOperand(i); Ops.push_back(Op); } return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { SDLoc dl(N); assert(!N->getOperand(0).getValueType().isVector() && "Input must be a scalar"); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT NOutVTElem = NOutVT.getVectorElementType(); SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT OutElemTy = NOutVT.getVectorElementType(); unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements(); unsigned NumOutElem = NOutVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); assert(NumElem * NumOperands == NumOutElem && "Unexpected number of elements"); // Take the elements from the first vector. SmallVector Ops(NumOutElem); for (unsigned i = 0; i < NumOperands; ++i) { SDValue Op = N->getOperand(i); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger) Op = GetPromotedInteger(Op); EVT SclrTy = Op.getValueType().getVectorElementType(); assert(NumElem == Op.getValueType().getVectorNumElements() && "Unexpected number of elements"); for (unsigned j = 0; j < NumElem; ++j) { SDValue Ext = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Op, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); Ops[i * NumElem + j] = DAG.getAnyExtOrTrunc(Ext, dl, OutElemTy); } } return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); assert(NVT.isVector() && "This type must be promoted to a vector type"); SDLoc dl(N); // For operands whose TypeAction is to promote, extend the promoted node // appropriately (ZERO_EXTEND or SIGN_EXTEND) from the original pre-promotion // type, and then construct a new *_EXTEND_VECTOR_INREG node to the promote-to // type.. if (getTypeAction(N->getOperand(0).getValueType()) == TargetLowering::TypePromoteInteger) { SDValue Promoted; switch(N->getOpcode()) { case ISD::SIGN_EXTEND_VECTOR_INREG: Promoted = SExtPromotedInteger(N->getOperand(0)); break; case ISD::ZERO_EXTEND_VECTOR_INREG: Promoted = ZExtPromotedInteger(N->getOperand(0)); break; case ISD::ANY_EXTEND_VECTOR_INREG: Promoted = GetPromotedInteger(N->getOperand(0)); break; default: llvm_unreachable("Node has unexpected Opcode"); } return DAG.getNode(N->getOpcode(), dl, NVT, Promoted); } // Directly extend to the appropriate transform-to type. 
return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(1)); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT, V0, ConvElem, N->getOperand(2)); } SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl, TLI.getVectorIdxTy(DAG.getDataLayout())); SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, V0->getValueType(0).getScalarType(), V0, V1); // EXTRACT_VECTOR_ELT can return types which are wider than the incoming // element types. If this is the case then we need to expand the outgoing // value and not truncate it. return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0)); } SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); MVT InVT = V0.getValueType().getSimpleVT(); MVT OutVT = MVT::getVectorVT(InVT.getVectorElementType(), N->getValueType(0).getVectorNumElements()); SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, V0, N->getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext); } SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); unsigned NumElems = N->getNumOperands(); EVT RetSclrTy = N->getValueType(0).getVectorElementType(); SmallVector NewOps; NewOps.reserve(NumElems); // For each incoming vector for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) { SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx)); EVT SclrTy = Incoming->getValueType(0).getVectorElementType(); unsigned NumElem = Incoming->getValueType(0).getVectorNumElements(); for (unsigned i=0; igetValueType(0), dl, NewOps); } Index: vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 338000) @@ -1,8747 +1,8747 @@ //===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This implements the SelectionDAG class. 
// //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include using namespace llvm; /// makeVTList - Return an instance of the SDVTList struct initialized with the /// specified members. static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) { SDVTList Res = {VTs, NumVTs}; return Res; } // Default null implementations of the callbacks. void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {} void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} #define DEBUG_TYPE "selectiondag" static cl::opt EnableMemCpyDAGOpt("enable-memcpy-dag-opt", cl::Hidden, cl::init(true), cl::desc("Gang up loads and stores generated by inlining of memcpy")); static cl::opt MaxLdStGlue("ldstmemcpy-glue-max", cl::desc("Number limit for gluing ld/st of memcpy."), cl::Hidden, cl::init(0)); static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) { LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G);); } //===----------------------------------------------------------------------===// // ConstantFPSDNode Class //===----------------------------------------------------------------------===// /// isExactlyValue - We don't rely on operator== working on double values, as /// it returns true for things that are clearly not equal, like -0.0 and 0.0. /// As such, this method can be used to do an exact bit-for-bit comparison of /// two floating point values. 
bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const { return getValueAPF().bitwiseIsEqual(V); } bool ConstantFPSDNode::isValueValidForType(EVT VT, const APFloat& Val) { assert(VT.isFloatingPoint() && "Can only convert between FP types"); // convert modifies in place, so make a copy. APFloat Val2 = APFloat(Val); bool losesInfo; (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } //===----------------------------------------------------------------------===// // ISD Namespace //===----------------------------------------------------------------------===// bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) { auto *BV = dyn_cast(N); if (!BV) return false; APInt SplatUndef; unsigned SplatBitSize; bool HasUndefs; unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs, EltSize) && EltSize == SplatBitSize; } // FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be // specializations of the more general isConstantSplatVector()? bool ISD::isBuildVectorAllOnes(const SDNode *N) { // Look through a bit convert. while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned i = 0, e = N->getNumOperands(); // Skip over all of the undef values. while (i != e && N->getOperand(i).isUndef()) ++i; // Do not accept an all-undef vector. if (i == e) return false; // Do not accept build_vectors that aren't all constants or which have non-~0 // elements. We have to be a bit careful here, as the type of the constant // may not be the same as the type of the vector elements due to type // legalization (the elements are promoted to a legal type for the target and // a vector of a type may be legal when the base element type is not). // We only want to check enough bits to cover the vector elements, because // we care if the resultant vector is all ones, not whether the individual // constants are. SDValue NotZero = N->getOperand(i); unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast(NotZero)) { if (CN->getAPIntValue().countTrailingOnes() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast(NotZero)) { if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize) return false; } else return false; // Okay, we have at least one ~0 value, check to see if the rest match or are // undefs. Even with the above element type twiddling, this should be OK, as // the same type legalization should have applied to all the elements. for (++i; i != e; ++i) if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef()) return false; return true; } bool ISD::isBuildVectorAllZeros(const SDNode *N) { // Look through a bit convert. while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; bool IsAllUndef = true; for (const SDValue &Op : N->op_values()) { if (Op.isUndef()) continue; IsAllUndef = false; // Do not accept build_vectors that aren't all constants or which have non-0 // elements. We have to be a bit careful here, as the type of the constant // may not be the same as the type of the vector elements due to type // legalization (the elements are promoted to a legal type for the target // and a vector of a type may be legal when the base element type is not). 
    // We only want to check enough bits to cover the vector elements, because
    // we care if the resultant vector is all zeros, not whether the individual
    // constants are.
    unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
      if (CN->getAPIntValue().countTrailingZeros() < EltSize)
        return false;
    } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
      if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
        return false;
    } else
      return false;
  }

  // Do not accept an all-undef vector.
  if (IsAllUndef)
    return false;
  return true;
}

bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (const SDValue &Op : N->op_values()) {
    if (Op.isUndef())
      continue;
    if (!isa<ConstantSDNode>(Op))
      return false;
  }
  return true;
}

bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (const SDValue &Op : N->op_values()) {
    if (Op.isUndef())
      continue;
    if (!isa<ConstantFPSDNode>(Op))
      return false;
  }
  return true;
}

bool ISD::allOperandsUndef(const SDNode *N) {
  // Return false if the node has no operands.
  // This is "logically inconsistent" with the definition of "all" but
  // is probably the desired behavior.
  if (N->getNumOperands() == 0)
    return false;

  for (const SDValue &Op : N->op_values())
    if (!Op.isUndef())
      return false;

  return true;
}

bool ISD::matchUnaryPredicate(SDValue Op,
                              std::function<bool(ConstantSDNode *)> Match) {
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
    return Match(Cst);

  if (ISD::BUILD_VECTOR != Op.getOpcode())
    return false;

  EVT SVT = Op.getValueType().getScalarType();
  for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
    auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
    if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
      return false;
  }
  return true;
}

bool ISD::matchBinaryPredicate(
    SDValue LHS, SDValue RHS,
    std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
  if (LHS.getValueType() != RHS.getValueType())
    return false;

  if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
    if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
      return Match(LHSCst, RHSCst);

  if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
      ISD::BUILD_VECTOR != RHS.getOpcode())
    return false;

  EVT SVT = LHS.getValueType().getScalarType();
  for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
    auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
    auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
    if (!LHSCst || !RHSCst)
      return false;
    if (LHSCst->getValueType(0) != SVT ||
        LHSCst->getValueType(0) != RHSCst->getValueType(0))
      return false;
    if (!Match(LHSCst, RHSCst))
      return false;
  }
  return true;
}

ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
  switch (ExtType) {
  case ISD::EXTLOAD:
    return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
  case ISD::SEXTLOAD:
    return ISD::SIGN_EXTEND;
  case ISD::ZEXTLOAD:
    return ISD::ZERO_EXTEND;
  default:
    break;
  }

  llvm_unreachable("Invalid LoadExtType");
}

ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
  // To perform this operation, we just need to swap the L and G bits of the
  // operation.
  unsigned OldL = (Operation >> 2) & 1;
  unsigned OldG = (Operation >> 1) & 1;
  return ISD::CondCode((Operation & ~6) |  // Keep the N, U, E bits
                       (OldL << 1) |       // New G bit
                       (OldG << 2));       // New L bit.
}

ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
  unsigned Operation = Op;
  if (isInteger)
    Operation ^= 7;   // Flip L, G, E bits, but not U.
  else
    Operation ^= 15;  // Flip all of the condition bits.

  if (Operation > ISD::SETTRUE2)
    Operation &= ~8;  // Don't let N and U bits get set.
return ISD::CondCode(Operation); } /// For an integer comparison, return 1 if the comparison is a signed operation /// and 2 if the result is an unsigned comparison. Return zero if the operation /// does not depend on the sign of the input (setne and seteq). static int isSignedOp(ISD::CondCode Opcode) { switch (Opcode) { default: llvm_unreachable("Illegal integer setcc operation!"); case ISD::SETEQ: case ISD::SETNE: return 0; case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE: return 1; case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: return 2; } } ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, bool IsInteger) { if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed integer setcc with an unsigned integer setcc. return ISD::SETCC_INVALID; unsigned Op = Op1 | Op2; // Combine all of the condition bits. // If the N and U bits get set, then the resultant comparison DOES suddenly // care about orderedness, and it is true when ordered. if (Op > ISD::SETTRUE2) Op &= ~16; // Clear the U bit if the N bit is set. // Canonicalize illegal integer setcc's. if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT Op = ISD::SETNE; return ISD::CondCode(Op); } ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, bool IsInteger) { if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed setcc with an unsigned setcc. return ISD::SETCC_INVALID; // Combine all of the condition bits. ISD::CondCode Result = ISD::CondCode(Op1 & Op2); // Canonicalize illegal integer setcc's. if (IsInteger) { switch (Result) { default: break; case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT case ISD::SETOEQ: // SETEQ & SETU[LG]E case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE } } return Result; } //===----------------------------------------------------------------------===// // SDNode Profile Support //===----------------------------------------------------------------------===// /// AddNodeIDOpcode - Add the node opcode to the NodeID data. static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) { ID.AddInteger(OpC); } /// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them /// solely with their pointer. static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) { ID.AddPointer(VTList.VTs); } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef Ops) { for (auto& Op : Ops) { ID.AddPointer(Op.getNode()); ID.AddInteger(Op.getResNo()); } } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef Ops) { for (auto& Op : Ops) { ID.AddPointer(Op.getNode()); ID.AddInteger(Op.getResNo()); } } static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, SDVTList VTList, ArrayRef OpList) { AddNodeIDOpcode(ID, OpC); AddNodeIDValueTypes(ID, VTList); AddNodeIDOperands(ID, OpList); } /// If this is an SDNode with special info, add this info to the NodeID data. 
static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::TargetExternalSymbol:
  case ISD::ExternalSymbol:
  case ISD::MCSymbol:
    llvm_unreachable("Should only be used on nodes with operands");
  default: break;  // Normal nodes don't need extra info.
  case ISD::TargetConstant:
  case ISD::Constant: {
    const ConstantSDNode *C = cast<ConstantSDNode>(N);
    ID.AddPointer(C->getConstantIntValue());
    ID.AddBoolean(C->isOpaque());
    break;
  }
  case ISD::TargetConstantFP:
  case ISD::ConstantFP:
    ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
    break;
  case ISD::TargetGlobalAddress:
  case ISD::GlobalAddress:
  case ISD::TargetGlobalTLSAddress:
  case ISD::GlobalTLSAddress: {
    const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
    ID.AddPointer(GA->getGlobal());
    ID.AddInteger(GA->getOffset());
    ID.AddInteger(GA->getTargetFlags());
    break;
  }
  case ISD::BasicBlock:
    ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
    break;
  case ISD::Register:
    ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
    break;
  case ISD::RegisterMask:
    ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
    break;
  case ISD::SRCVALUE:
    ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
    break;
  case ISD::FrameIndex:
  case ISD::TargetFrameIndex:
    ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
    break;
  case ISD::JumpTable:
  case ISD::TargetJumpTable:
    ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
    ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
    break;
  case ISD::ConstantPool:
  case ISD::TargetConstantPool: {
    const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
    ID.AddInteger(CP->getAlignment());
    ID.AddInteger(CP->getOffset());
    if (CP->isMachineConstantPoolEntry())
      CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
    else
      ID.AddPointer(CP->getConstVal());
    ID.AddInteger(CP->getTargetFlags());
    break;
  }
  case ISD::TargetIndex: {
    const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
    ID.AddInteger(TI->getIndex());
    ID.AddInteger(TI->getOffset());
    ID.AddInteger(TI->getTargetFlags());
    break;
  }
  case ISD::LOAD: {
    const LoadSDNode *LD = cast<LoadSDNode>(N);
    ID.AddInteger(LD->getMemoryVT().getRawBits());
    ID.AddInteger(LD->getRawSubclassData());
    ID.AddInteger(LD->getPointerInfo().getAddrSpace());
    break;
  }
  case ISD::STORE: {
    const StoreSDNode *ST = cast<StoreSDNode>(N);
    ID.AddInteger(ST->getMemoryVT().getRawBits());
    ID.AddInteger(ST->getRawSubclassData());
    ID.AddInteger(ST->getPointerInfo().getAddrSpace());
    break;
  }
  case ISD::MLOAD: {
    const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
    ID.AddInteger(MLD->getMemoryVT().getRawBits());
    ID.AddInteger(MLD->getRawSubclassData());
    ID.AddInteger(MLD->getPointerInfo().getAddrSpace());
    break;
  }
  case ISD::MSTORE: {
    const MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
    ID.AddInteger(MST->getMemoryVT().getRawBits());
    ID.AddInteger(MST->getRawSubclassData());
    ID.AddInteger(MST->getPointerInfo().getAddrSpace());
    break;
  }
  case ISD::MGATHER: {
    const MaskedGatherSDNode *MG = cast<MaskedGatherSDNode>(N);
    ID.AddInteger(MG->getMemoryVT().getRawBits());
    ID.AddInteger(MG->getRawSubclassData());
    ID.AddInteger(MG->getPointerInfo().getAddrSpace());
    break;
  }
  case ISD::MSCATTER: {
    const MaskedScatterSDNode *MS = cast<MaskedScatterSDNode>(N);
    ID.AddInteger(MS->getMemoryVT().getRawBits());
    ID.AddInteger(MS->getRawSubclassData());
    ID.AddInteger(MS->getPointerInfo().getAddrSpace());
    break;
  }
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_CLR:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: {
    const AtomicSDNode *AT = cast<AtomicSDNode>(N);
ID.AddInteger(AT->getMemoryVT().getRawBits()); ID.AddInteger(AT->getRawSubclassData()); ID.AddInteger(AT->getPointerInfo().getAddrSpace()); break; } case ISD::PREFETCH: { const MemSDNode *PF = cast(N); ID.AddInteger(PF->getPointerInfo().getAddrSpace()); break; } case ISD::VECTOR_SHUFFLE: { const ShuffleVectorSDNode *SVN = cast(N); for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements(); i != e; ++i) ID.AddInteger(SVN->getMaskElt(i)); break; } case ISD::TargetBlockAddress: case ISD::BlockAddress: { const BlockAddressSDNode *BA = cast(N); ID.AddPointer(BA->getBlockAddress()); ID.AddInteger(BA->getOffset()); ID.AddInteger(BA->getTargetFlags()); break; } } // end switch (N->getOpcode()) // Target specific memory nodes could also have address spaces to check. if (N->isTargetMemoryOpcode()) ID.AddInteger(cast(N)->getPointerInfo().getAddrSpace()); } /// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID /// data. static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) { AddNodeIDOpcode(ID, N->getOpcode()); // Add the return value info. AddNodeIDValueTypes(ID, N->getVTList()); // Add the operand info. AddNodeIDOperands(ID, N->ops()); // Handle SDNode leafs with special info. AddNodeIDCustom(ID, N); } //===----------------------------------------------------------------------===// // SelectionDAG Class //===----------------------------------------------------------------------===// /// doNotCSE - Return true if CSE should not be performed for this node. static bool doNotCSE(SDNode *N) { if (N->getValueType(0) == MVT::Glue) return true; // Never CSE anything that produces a flag. switch (N->getOpcode()) { default: break; case ISD::HANDLENODE: case ISD::EH_LABEL: return true; // Never CSE these nodes. } // Check that remaining values produced are not flags. for (unsigned i = 1, e = N->getNumValues(); i != e; ++i) if (N->getValueType(i) == MVT::Glue) return true; // Never CSE anything that produces a flag. return false; } /// RemoveDeadNodes - This method deletes all unreachable nodes in the /// SelectionDAG. void SelectionDAG::RemoveDeadNodes() { // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted. HandleSDNode Dummy(getRoot()); SmallVector DeadNodes; // Add all obviously-dead nodes to the DeadNodes worklist. for (SDNode &Node : allnodes()) if (Node.use_empty()) DeadNodes.push_back(&Node); RemoveDeadNodes(DeadNodes); // If the root changed (e.g. it was a dead load, update the root). setRoot(Dummy.getValue()); } /// RemoveDeadNodes - This method deletes the unreachable nodes in the /// given list, and any nodes that become unreachable as a result. void SelectionDAG::RemoveDeadNodes(SmallVectorImpl &DeadNodes) { // Process the worklist, deleting the nodes and adding their uses to the // worklist. while (!DeadNodes.empty()) { SDNode *N = DeadNodes.pop_back_val(); // Skip to next node if we've already managed to delete the node. This could // happen if replacing a node causes a node previously added to the node to // be deleted. if (N->getOpcode() == ISD::DELETED_NODE) continue; for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeDeleted(N, nullptr); // Take the node out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); // Next, brutally remove the operand list. This is safe to do, as there are // no cycles in the graph. 
for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) { SDUse &Use = *I++; SDNode *Operand = Use.getNode(); Use.set(SDValue()); // Now that we removed this operand, see if there are no uses of it left. if (Operand->use_empty()) DeadNodes.push_back(Operand); } DeallocateNode(N); } } void SelectionDAG::RemoveDeadNode(SDNode *N){ SmallVector DeadNodes(1, N); // Create a dummy node that adds a reference to the root node, preventing // it from being deleted. (This matters if the root is an operand of the // dead node.) HandleSDNode Dummy(getRoot()); RemoveDeadNodes(DeadNodes); } void SelectionDAG::DeleteNode(SDNode *N) { // First take this out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); // Finally, remove uses due to operands of this node, remove from the // AllNodes list, and delete the node. DeleteNodeNotInCSEMaps(N); } void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) { assert(N->getIterator() != AllNodes.begin() && "Cannot delete the entry node!"); assert(N->use_empty() && "Cannot delete a node that is not dead!"); // Drop all of the operands and decrement used node's use counts. N->DropOperands(); DeallocateNode(N); } void SDDbgInfo::erase(const SDNode *Node) { DbgValMapType::iterator I = DbgValMap.find(Node); if (I == DbgValMap.end()) return; for (auto &Val: I->second) Val->setIsInvalidated(); DbgValMap.erase(I); } void SelectionDAG::DeallocateNode(SDNode *N) { // If we have operands, deallocate them. removeOperands(N); NodeAllocator.Deallocate(AllNodes.remove(N)); // Set the opcode to DELETED_NODE to help catch bugs when node // memory is reallocated. // FIXME: There are places in SDag that have grown a dependency on the opcode // value in the released node. __asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType)); N->NodeType = ISD::DELETED_NODE; // If any of the SDDbgValue nodes refer to this SDNode, invalidate // them and forget about that node. DbgInfo->erase(N); } #ifndef NDEBUG /// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. static void VerifySDNode(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::BUILD_PAIR: { EVT VT = N->getValueType(0); assert(N->getNumValues() == 1 && "Too many results!"); assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) && "Wrong return type!"); assert(N->getNumOperands() == 2 && "Wrong number of operands!"); assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() && "Mismatched operand types!"); assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() && "Wrong operand type!"); assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() && "Wrong return type size"); break; } case ISD::BUILD_VECTOR: { assert(N->getNumValues() == 1 && "Too many results!"); assert(N->getValueType(0).isVector() && "Wrong return type!"); assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() && "Wrong number of operands!"); EVT EltVT = N->getValueType(0).getVectorElementType(); for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) { assert((I->getValueType() == EltVT || (EltVT.isInteger() && I->getValueType().isInteger() && EltVT.bitsLE(I->getValueType()))) && "Wrong operand type!"); assert(I->getValueType() == N->getOperand(0).getValueType() && "Operands must all have the same type"); } break; } } } #endif // NDEBUG /// Insert a newly allocated node into the DAG. 
///
/// Handles insertion into the all nodes list and CSE map, as well as
/// verification and other common operations when a new node is allocated.
void SelectionDAG::InsertNode(SDNode *N) {
  AllNodes.push_back(N);
#ifndef NDEBUG
  N->PersistentId = NextPersistentId++;
  VerifySDNode(N);
#endif
}

/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
/// correspond to it. This is useful when we're about to delete or repurpose
/// the node. We don't want future request for structurally identical nodes
/// to return N anymore.
bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
  bool Erased = false;
  switch (N->getOpcode()) {
  case ISD::HANDLENODE: return false;  // noop.
  case ISD::CONDCODE:
    assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
           "Cond code doesn't exist!");
    Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != nullptr;
    CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = nullptr;
    break;
  case ISD::ExternalSymbol:
    Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
    break;
  case ISD::TargetExternalSymbol: {
    ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
    Erased = TargetExternalSymbols.erase(
        std::pair<std::string, unsigned char>(ESN->getSymbol(),
                                              ESN->getTargetFlags()));
    break;
  }
  case ISD::MCSymbol: {
    auto *MCSN = cast<MCSymbolSDNode>(N);
    Erased = MCSymbols.erase(MCSN->getMCSymbol());
    break;
  }
  case ISD::VALUETYPE: {
    EVT VT = cast<VTSDNode>(N)->getVT();
    if (VT.isExtended()) {
      Erased = ExtendedValueTypeNodes.erase(VT);
    } else {
      Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
      ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
    }
    break;
  }
  default:
    // Remove it from the CSE Map.
    assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
    assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
    Erased = CSEMap.RemoveNode(N);
    break;
  }
#ifndef NDEBUG
  // Verify that the node was actually in one of the CSE maps, unless it has a
  // flag result (which cannot be CSE'd) or is one of the special cases that are
  // not subject to CSE.
  if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
      !N->isMachineOpcode() && !doNotCSE(N)) {
    N->dump(this);
    dbgs() << "\n";
    llvm_unreachable("Node is not in map!");
  }
#endif
  return Erased;
}

/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
/// maps and modified in place. Add it back to the CSE maps, unless an identical
/// node already exists, in which case transfer all its users to the existing
/// node. This transfer can potentially trigger recursive merging.
void SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
  // For node types that aren't CSE'd, just act as if no identical node
  // already exists.
  if (!doNotCSE(N)) {
    SDNode *Existing = CSEMap.GetOrInsertNode(N);
    if (Existing != N) {
      // If there was already an existing matching node, use ReplaceAllUsesWith
      // to replace the dead one with the existing one. This can cause
      // recursive merging of other unrelated nodes down the line.
      ReplaceAllUsesWith(N, Existing);

      // N is now dead. Inform the listeners and delete it.
      for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
        DUL->NodeDeleted(N, Existing);
      DeleteNodeNotInCSEMaps(N);
      return;
    }
  }

  // If the node doesn't already exist, we updated it. Inform listeners.
  for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
    DUL->NodeUpdated(N);
}

/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
/// were replaced with those specified. If this node is never memoized,
/// return null, otherwise return a pointer to the slot it would take. If a
/// node already exists with these operands, the slot will be non-null.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos) { if (doNotCSE(N)) return nullptr; SDValue Ops[] = { Op }; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) Node->intersectFlagsWith(N->getFlags()); return Node; } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2, void *&InsertPos) { if (doNotCSE(N)) return nullptr; SDValue Ops[] = { Op1, Op2 }; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) Node->intersectFlagsWith(N->getFlags()); return Node; } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef Ops, void *&InsertPos) { if (doNotCSE(N)) return nullptr; FoldingSetNodeID ID; AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) Node->intersectFlagsWith(N->getFlags()); return Node; } unsigned SelectionDAG::getEVTAlignment(EVT VT) const { Type *Ty = VT == MVT::iPTR ? PointerType::get(Type::getInt8Ty(*getContext()), 0) : VT.getTypeForEVT(*getContext()); return getDataLayout().getABITypeAlignment(Ty); } // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) : TM(tm), OptLevel(OL), EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()) { InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); } void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, DivergenceAnalysis * Divergence) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; TLI = getSubtarget().getTargetLowering(); TSI = getSubtarget().getSelectionDAGInfo(); LibInfo = LibraryInfo; Context = &MF->getFunction().getContext(); DA = Divergence; } SelectionDAG::~SelectionDAG() { assert(!UpdateListeners && "Dangling registered DAGUpdateListeners"); allnodes_clear(); OperandRecycler.clear(OperandAllocator); delete DbgInfo; } void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); while (!AllNodes.empty()) DeallocateNode(&AllNodes.front()); #ifndef NDEBUG NextPersistentId = 0; #endif } SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) { SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos); if (N) { switch (N->getOpcode()) { default: break; case ISD::Constant: case ISD::ConstantFP: llvm_unreachable("Querying for Constant and ConstantFP nodes requires " "debug location. 
Use another overload."); } } return N; } SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL, void *&InsertPos) { SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos); if (N) { switch (N->getOpcode()) { case ISD::Constant: case ISD::ConstantFP: // Erase debug location from the node if the node is used at several // different places. Do not propagate one location to all uses as it // will cause a worse single stepping debugging experience. if (N->getDebugLoc() != DL.getDebugLoc()) N->setDebugLoc(DebugLoc()); break; default: // When the node's point of use is located earlier in the instruction // sequence than its prior point of use, update its debug info to the // earlier location. if (DL.getIROrder() && DL.getIROrder() < N->getIROrder()) N->setDebugLoc(DL.getDebugLoc()); break; } } return N; } void SelectionDAG::clear() { allnodes_clear(); OperandRecycler.clear(OperandAllocator); OperandAllocator.Reset(); CSEMap.clear(); ExtendedValueTypeNodes.clear(); ExternalSymbols.clear(); TargetExternalSymbols.clear(); MCSymbols.clear(); std::fill(CondCodeNodes.begin(), CondCodeNodes.end(), static_cast(nullptr)); std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(), static_cast(nullptr)); EntryNode.UseList = nullptr; InsertNode(&EntryNode); Root = getEntryNode(); DbgInfo->clear(); } SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::FP_EXTEND, DL, VT, Op) : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); } SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ANY_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::SIGN_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? 
getNode(ISD::ZERO_EXTEND, DL, VT, Op) : getNode(ISD::TRUNCATE, DL, VT, Op); } SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT) { if (VT.bitsLE(Op.getValueType())) return getNode(ISD::TRUNCATE, SL, VT, Op); TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT); return getNode(TLI->getExtendForContent(BType), SL, VT, Op); } SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) { assert(!VT.isVector() && "getZeroExtendInReg should use the vector element type instead of " "the vector type!"); if (Op.getValueType().getScalarType() == VT) return Op; unsigned BitWidth = Op.getScalarValueSizeInBits(); APInt Imm = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); return getNode(ISD::AND, DL, Op.getValueType(), Op, getConstant(Imm, DL, Op.getValueType())); } SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op); } SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op); } SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT) { assert(VT.isVector() && "This DAG node is restricted to vector types."); assert(VT.getSizeInBits() == Op.getValueSizeInBits() && "The sizes of the input and result must match in order to perform the " "extend in-register."); assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op); } /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). 
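
// Illustrative sketch (editorial, not part of this patch): getZeroExtendInReg
// above implements "zero extend in register" as an AND with a low-bits mask.
// The same computation on a plain 64-bit value (mirroring the constant that
// APInt::getLowBitsSet builds for the AND operand):
#include <cassert>
#include <cstdint>

static uint64_t lowBitsMask(unsigned NumBits) {
  assert(NumBits <= 64 && "mask wider than the value");
  return NumBits == 64 ? ~uint64_t(0) : ((uint64_t(1) << NumBits) - 1);
}

static uint64_t zeroExtendInReg(uint64_t X, unsigned FromBits) {
  // Keep the low FromBits of X and clear everything above them.
  return X & lowBitsMask(FromBits);
}
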
SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue NegOne = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT); return getNode(ISD::XOR, DL, VT, Val, NegOne); } SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) { SDValue TrueValue = getBoolConstant(true, DL, VT, VT); return getNode(ISD::XOR, DL, VT, Val, TrueValue); } SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT, EVT OpVT) { if (!V) return getConstant(0, DL, VT); switch (TLI->getBooleanContents(OpVT)) { case TargetLowering::ZeroOrOneBooleanContent: case TargetLowering::UndefinedBooleanContent: return getConstant(1, DL, VT); case TargetLowering::ZeroOrNegativeOneBooleanContent: return getAllOnesConstant(DL, VT); } llvm_unreachable("Unexpected boolean content enum!"); } SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isT, bool isO) { EVT EltVT = VT.getScalarType(); assert((EltVT.getSizeInBits() >= 64 || (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) && "getConstant with a uint64_t value that doesn't fit in the type!"); return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO); } SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT, bool isT, bool isO) { return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO); } SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT, bool isT, bool isO) { assert(VT.isInteger() && "Cannot create FP integer constant!"); EVT EltVT = VT.getScalarType(); const ConstantInt *Elt = &Val; // In some cases the vector type is legal but the element type is illegal and // needs to be promoted, for example v8i8 on ARM. In this case, promote the // inserted value (the type does not need to match the vector element type). // Any extra bits introduced will be truncated away. if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypePromoteInteger) { EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits()); Elt = ConstantInt::get(*getContext(), NewVal); } // In other cases the element type is illegal and needs to be expanded, for // example v2i64 on MIPS32. In this case, find the nearest legal type, split // the value into n parts and use a vector type with n-times the elements. // Then bitcast to the type requested. // Legalizing constants too early makes the DAGCombiner's job harder so we // only legalize if the DAG tells us we must produce legal types. else if (NewNodesMustHaveLegalTypes && VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypeExpandInteger) { const APInt &NewVal = Elt->getValue(); EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits(); unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits; EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts); // Check the temporary vector is the correct size. If this fails then // getTypeToTransformTo() probably returned a type whose size (in bits) // isn't a power-of-2 factor of the requested type size. assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits()); SmallVector EltParts; for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) { EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits) .zextOrTrunc(ViaEltSizeInBits), DL, ViaEltVT, isT, isO)); } // EltParts is currently in little endian order. 
If we actually want // big-endian order then reverse it now. if (getDataLayout().isBigEndian()) std::reverse(EltParts.begin(), EltParts.end()); // The elements must be reversed when the element order is different // to the endianness of the elements (because the BITCAST is itself a // vector shuffle in this situation). However, we do not need any code to // perform this reversal because getConstant() is producing a vector // splat. // This situation occurs in MIPS MSA. SmallVector Ops; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Ops.insert(Ops.end(), EltParts.begin(), EltParts.end()); SDValue V = getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops)); return V; } assert(Elt->getBitWidth() == EltVT.getSizeInBits() && "APInt size does not match type size!"); unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(Elt); ID.AddBoolean(isO); void *IP = nullptr; SDNode *N = nullptr; if ((N = FindNodeOrInsertPos(ID, DL, IP))) if (!VT.isVector()) return SDValue(N, 0); if (!N) { N = newSDNode(isT, isO, Elt, EltVT); CSEMap.InsertNode(N, IP); InsertNode(N); NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this); } SDValue Result(N, 0); if (VT.isVector()) Result = getSplatBuildVector(VT, DL, Result); return Result; } SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget) { return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget); } SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT, bool isTarget) { return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget); } SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL, EVT VT, bool isTarget) { assert(VT.isFloatingPoint() && "Cannot create integer FP constant!"); EVT EltVT = VT.getScalarType(); // Do the map lookup using the actual bit pattern for the floating point // value, so that we don't have problems with 0.0 comparing equal to -0.0, and // we don't have issues with SNANs. unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(&V); void *IP = nullptr; SDNode *N = nullptr; if ((N = FindNodeOrInsertPos(ID, DL, IP))) if (!VT.isVector()) return SDValue(N, 0); if (!N) { N = newSDNode(isTarget, &V, EltVT); CSEMap.InsertNode(N, IP); InsertNode(N); } SDValue Result(N, 0); if (VT.isVector()) Result = getSplatBuildVector(VT, DL, Result); NewSDValueDbgMsg(Result, "Creating fp constant: ", this); return Result; } SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget) { EVT EltVT = VT.getScalarType(); if (EltVT == MVT::f32) return getConstantFP(APFloat((float)Val), DL, VT, isTarget); else if (EltVT == MVT::f64) return getConstantFP(APFloat(Val), DL, VT, isTarget); else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 || EltVT == MVT::f16) { bool Ignored; APFloat APF = APFloat(Val); APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven, &Ignored); return getConstantFP(APF, DL, VT, isTarget); } else llvm_unreachable("Unsupported type in getConstantFP"); } SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t Offset, bool isTargetGA, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTargetGA) && "Cannot set target flags on target-independent globals"); // Truncate (with sign-extension) the offset value to the pointer size. 
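
// Illustrative sketch (editorial, not part of this patch): the truncate-with-
// sign-extension that getGlobalAddress applies to Offset when the pointer is
// narrower than 64 bits. This mirrors llvm::SignExtend64(Offset, BitWidth);
// signExtendTo is an invented name for the example.
#include <cassert>
#include <cstdint>

static int64_t signExtendTo(uint64_t X, unsigned Bits) {
  assert(Bits > 0 && Bits <= 64 && "invalid bit width");
  // Shift the value's sign bit (bit Bits-1) up to bit 63, then shift back
  // arithmetically so it is replicated through the high bits.
  return int64_t(X << (64 - Bits)) >> (64 - Bits);
}
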
unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); if (BitWidth < 64) Offset = SignExtend64(Offset, BitWidth); unsigned Opc; if (GV->isThreadLocal()) Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress; else Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(GV); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) return SDValue(E, 0); auto *N = newSDNode( Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(FI); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(FI, VT, isTarget); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent jump tables"); unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(JTI); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(JTI, VT, isTarget, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) Alignment = MF->getFunction().optForSize() ? getDataLayout().getABITypeAlignment(C->getType()) : getDataLayout().getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); ID.AddPointer(C); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) Alignment = getDataLayout().getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? 
ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); C->addSelectionDAGCSEId(ID); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, unsigned char TargetFlags) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None); ID.AddInteger(Index); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None); ID.AddPointer(MBB); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(MBB); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getValueType(EVT VT) { if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >= ValueTypeNodes.size()) ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1); SDNode *&N = VT.isExtended() ? ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy]; if (N) return SDValue(N, 0); N = newSDNode(VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) { SDNode *&N = ExternalSymbols[Sym]; if (N) return SDValue(N, 0); N = newSDNode(false, Sym, 0, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) { SDNode *&N = MCSymbols[Sym]; if (N) return SDValue(N, 0); N = newSDNode(Sym, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, unsigned char TargetFlags) { SDNode *&N = TargetExternalSymbols[std::pair(Sym, TargetFlags)]; if (N) return SDValue(N, 0); N = newSDNode(true, Sym, TargetFlags, VT); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { if ((unsigned)Cond >= CondCodeNodes.size()) CondCodeNodes.resize(Cond+1); if (!CondCodeNodes[Cond]) { auto *N = newSDNode(Cond); CondCodeNodes[Cond] = N; InsertNode(N); } return SDValue(CondCodeNodes[Cond], 0); } /// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that /// point at N1 to point at N2 and indices that point at N2 to point at N1. static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef M) { std::swap(N1, N2); ShuffleVectorSDNode::commuteMask(M); } SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef Mask) { assert(VT.getVectorNumElements() == Mask.size() && "Must have the same number of vector elements as mask elements!"); assert(VT == N1.getValueType() && VT == N2.getValueType() && "Invalid VECTOR_SHUFFLE"); // Canonicalize shuffle undef, undef -> undef if (N1.isUndef() && N2.isUndef()) return getUNDEF(VT); // Validate that all indices in Mask are within the range of the elements // input to the shuffle. int NElts = Mask.size(); assert(llvm::all_of(Mask, [&](int M) { return M < (NElts * 2) && M >= -1; }) && "Index out of range"); // Copy the mask so we can do any needed cleanup. 
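
// Illustrative sketch (editorial, not part of this patch): what the mask
// commutation performed by commuteShuffle above means. After swapping the two
// input vectors, lanes that selected from the first input must select from
// the second and vice versa; -1 (undef) lanes are left alone.
#include <vector>

static void commuteToyMask(std::vector<int> &Mask) {
  int NElts = static_cast<int>(Mask.size());
  for (int &M : Mask) {
    if (M < 0)
      continue; // undef lane
    M = (M < NElts) ? M + NElts : M - NElts;
  }
}
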
SmallVector MaskVec(Mask.begin(), Mask.end()); // Canonicalize shuffle v, v -> v, undef if (N1 == N2) { N2 = getUNDEF(VT); for (int i = 0; i != NElts; ++i) if (MaskVec[i] >= NElts) MaskVec[i] -= NElts; } // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. if (N1.isUndef()) commuteShuffle(N1, N2, MaskVec); if (TLI->hasVectorBlend()) { // If shuffling a splat, try to blend the splat instead. We do this here so // that even when this arises during lowering we don't have to re-handle it. auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) { BitVector UndefElements; SDValue Splat = BV->getSplatValue(&UndefElements); if (!Splat) return; for (int i = 0; i < NElts; ++i) { if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts)) continue; // If this input comes from undef, mark it as such. if (UndefElements[MaskVec[i] - Offset]) { MaskVec[i] = -1; continue; } // If we can blend a non-undef lane, use that instead. if (!UndefElements[i]) MaskVec[i] = i + Offset; } }; if (auto *N1BV = dyn_cast(N1)) BlendSplat(N1BV, 0); if (auto *N2BV = dyn_cast(N2)) BlendSplat(N2BV, NElts); } // Canonicalize all index into lhs, -> shuffle lhs, undef // Canonicalize all index into rhs, -> shuffle rhs, undef bool AllLHS = true, AllRHS = true; bool N2Undef = N2.isUndef(); for (int i = 0; i != NElts; ++i) { if (MaskVec[i] >= NElts) { if (N2Undef) MaskVec[i] = -1; else AllLHS = false; } else if (MaskVec[i] >= 0) { AllRHS = false; } } if (AllLHS && AllRHS) return getUNDEF(VT); if (AllLHS && !N2Undef) N2 = getUNDEF(VT); if (AllRHS) { N1 = getUNDEF(VT); commuteShuffle(N1, N2, MaskVec); } // Reset our undef status after accounting for the mask. N2Undef = N2.isUndef(); // Re-check whether both sides ended up undef. if (N1.isUndef() && N2Undef) return getUNDEF(VT); // If Identity shuffle return that node. bool Identity = true, AllSame = true; for (int i = 0; i != NElts; ++i) { if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false; if (MaskVec[i] != MaskVec[0]) AllSame = false; } if (Identity && NElts) return N1; // Shuffling a constant splat doesn't change the result. if (N2Undef) { SDValue V = N1; // Look through any bitcasts. We check that these don't change the number // (and size) of elements and just changes their types. while (V.getOpcode() == ISD::BITCAST) V = V->getOperand(0); // A splat should always show up as a build vector node. if (auto *BV = dyn_cast(V)) { BitVector UndefElements; SDValue Splat = BV->getSplatValue(&UndefElements); // If this is a splat of an undef, shuffling it is also undef. if (Splat && Splat.isUndef()) return getUNDEF(VT); bool SameNumElts = V.getValueType().getVectorNumElements() == VT.getVectorNumElements(); // We only have a splat which can skip shuffles if there is a splatted // value and no undef lanes rearranged by the shuffle. if (Splat && UndefElements.none()) { // Splat of , return , provided that the // number of elements match or the value splatted is a zero constant. if (SameNumElts) return N1; if (auto *C = dyn_cast(Splat)) if (C->isNullValue()) return N1; } // If the shuffle itself creates a splat, build the vector directly. if (AllSame && SameNumElts) { EVT BuildVT = BV->getValueType(0); const SDValue &Splatted = BV->getOperand(MaskVec[0]); SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted); // We may have jumped through bitcasts, so the type of the // BUILD_VECTOR may not match the type of the shuffle. 
if (BuildVT != VT) NewBV = getNode(ISD::BITCAST, dl, VT, NewBV); return NewBV; } } } FoldingSetNodeID ID; SDValue Ops[2] = { N1, N2 }; AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops); for (int i = 0; i != NElts; ++i) ID.AddInteger(MaskVec[i]); void* IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); // Allocate the mask array for the node out of the BumpPtrAllocator, since // SDNode doesn't have access to it. This memory will be "leaked" when // the node is deallocated, but recovered when the NodeAllocator is released. int *MaskAlloc = OperandAllocator.Allocate(NElts); std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc); auto *N = newSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), MaskAlloc); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V = SDValue(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) { EVT VT = SV.getValueType(0); SmallVector MaskVec(SV.getMask().begin(), SV.getMask().end()); ShuffleVectorSDNode::commuteMask(MaskVec); SDValue Op0 = SV.getOperand(0); SDValue Op1 = SV.getOperand(1); return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec); } SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::Register, getVTList(VT), None); ID.AddInteger(RegNo); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(RegNo, VT); N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None); ID.AddPointer(RegMask); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(RegMask); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label) { return getLabelNode(ISD::EH_LABEL, dl, Root, Label); } SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl, SDValue Root, MCSymbol *Label) { FoldingSetNodeID ID; SDValue Ops[] = { Root }; AddNodeIDNode(ID, Opcode, getVTList(MVT::Other), Ops); ID.AddPointer(Label); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), Label); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset, bool isTarget, unsigned char TargetFlags) { unsigned Opc = isTarget ? 
ISD::TargetBlockAddress : ISD::BlockAddress; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(BA); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(Opc, VT, BA, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getSrcValue(const Value *V) { assert((!V || V->getType()->isPointerTy()) && "SrcValue is not a pointer?"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None); ID.AddPointer(V); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(V); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getMDNode(const MDNode *MD) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None); ID.AddPointer(MD); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); auto *N = newSDNode(MD); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) { if (VT == V.getValueType()) return V; return getNode(ISD::BITCAST, SDLoc(V), VT, V); } SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS) { SDValue Ops[] = {Ptr}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops); ID.AddInteger(SrcAS); ID.AddInteger(DestAS); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VT, SrcAS, DestAS); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } /// getShiftAmountOperand - Return the specified value casted to /// the target's desired shift amount type. 
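
// Illustrative sketch (editorial, not part of this patch): getShiftAmountOperand
// below adjusts the shift amount to the width the target expects by zero
// extending or truncating it. The equivalent adjustment on a plain unsigned
// value; adjustToWidth is an invented name.
#include <cstdint>

static uint64_t adjustToWidth(uint64_t X, unsigned ToBits) {
  // Widening is a no-op for an unsigned value; narrowing masks off the bits
  // that would not fit in the destination width.
  return ToBits >= 64 ? X : (X & ((uint64_t(1) << ToBits) - 1));
}
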
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { EVT OpTy = Op.getValueType(); EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout()); if (OpTy == ShTy || OpTy.isVector()) return Op; return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } SDValue SelectionDAG::expandVAArg(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); const Value *V = cast(Node->getOperand(2))->getValue(); EVT VT = Node->getValueType(0); SDValue Tmp1 = Node->getOperand(0); SDValue Tmp2 = Node->getOperand(1); unsigned Align = Node->getConstantOperandVal(3); SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1, Tmp2, MachinePointerInfo(V)); SDValue VAList = VAListLoad; if (Align > TLI.getMinStackArgumentAlignment()) { assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, getConstant(Align - 1, dl, VAList.getValueType())); VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, getConstant(-(int64_t)Align, dl, VAList.getValueType())); } // Increment the pointer, VAList, to the next vaarg Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, getConstant(getDataLayout().getTypeAllocSize( VT.getTypeForEVT(*getContext())), dl, VAList.getValueType())); // Store the incremented VAList to the legalized pointer Tmp1 = getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo()); } SDValue SelectionDAG::expandVACopy(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); // This defaults to loading a pointer from the input and storing it to the // output, returning the chain. const Value *VD = cast(Node->getOperand(3))->getValue(); const Value *VS = cast(Node->getOperand(4))->getValue(); SDValue Tmp1 = getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0), Node->getOperand(2), MachinePointerInfo(VS)); return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), MachinePointerInfo(VD)); } SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); unsigned ByteSize = VT.getStoreSize(); Type *Ty = VT.getTypeForEVT(*getContext()); unsigned StackAlign = std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); } SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize()); Type *Ty1 = VT1.getTypeForEVT(*getContext()); Type *Ty2 = VT2.getTypeForEVT(*getContext()); const DataLayout &DL = getDataLayout(); unsigned Align = std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2)); MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(Bytes, Align, false); return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); } SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond, const SDLoc &dl) { EVT OpVT = N1.getValueType(); // These setcc operations always fold. 
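
// Illustrative sketch (editorial, not part of this patch): the align-up
// computation used by expandVAArg above, p' = (p + Align - 1) & -Align, shown
// on a plain address value. It requires Align to be a power of two, which the
// assert in expandVAArg also enforces.
#include <cassert>
#include <cstdint>

static uint64_t alignAddressUp(uint64_t Addr, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 &&
         "Align must be a power of 2");
  return (Addr + Align - 1) & ~(Align - 1);
}
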
switch (Cond) { default: break; case ISD::SETFALSE: case ISD::SETFALSE2: return getBoolConstant(false, dl, VT, OpVT); case ISD::SETTRUE: case ISD::SETTRUE2: return getBoolConstant(true, dl, VT, OpVT); case ISD::SETOEQ: case ISD::SETOGT: case ISD::SETOGE: case ISD::SETOLT: case ISD::SETOLE: case ISD::SETONE: case ISD::SETO: case ISD::SETUO: case ISD::SETUEQ: case ISD::SETUNE: assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!"); break; } if (ConstantSDNode *N2C = dyn_cast(N2)) { const APInt &C2 = N2C->getAPIntValue(); if (ConstantSDNode *N1C = dyn_cast(N1)) { const APInt &C1 = N1C->getAPIntValue(); switch (Cond) { default: llvm_unreachable("Unknown integer setcc!"); case ISD::SETEQ: return getBoolConstant(C1 == C2, dl, VT, OpVT); case ISD::SETNE: return getBoolConstant(C1 != C2, dl, VT, OpVT); case ISD::SETULT: return getBoolConstant(C1.ult(C2), dl, VT, OpVT); case ISD::SETUGT: return getBoolConstant(C1.ugt(C2), dl, VT, OpVT); case ISD::SETULE: return getBoolConstant(C1.ule(C2), dl, VT, OpVT); case ISD::SETUGE: return getBoolConstant(C1.uge(C2), dl, VT, OpVT); case ISD::SETLT: return getBoolConstant(C1.slt(C2), dl, VT, OpVT); case ISD::SETGT: return getBoolConstant(C1.sgt(C2), dl, VT, OpVT); case ISD::SETLE: return getBoolConstant(C1.sle(C2), dl, VT, OpVT); case ISD::SETGE: return getBoolConstant(C1.sge(C2), dl, VT, OpVT); } } } if (ConstantFPSDNode *N1C = dyn_cast(N1)) { if (ConstantFPSDNode *N2C = dyn_cast(N2)) { APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF()); switch (Cond) { default: break; case ISD::SETEQ: if (R==APFloat::cmpUnordered) return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::SETOEQ: return getBoolConstant(R==APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETNE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::SETONE: return getBoolConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpLessThan, dl, VT, OpVT); case ISD::SETLT: if (R==APFloat::cmpUnordered) return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::SETOLT: return getBoolConstant(R==APFloat::cmpLessThan, dl, VT, OpVT); case ISD::SETGT: if (R==APFloat::cmpUnordered) return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::SETOGT: return getBoolConstant(R==APFloat::cmpGreaterThan, dl, VT, OpVT); case ISD::SETLE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::SETOLE: return getBoolConstant(R==APFloat::cmpLessThan || R==APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETGE: if (R==APFloat::cmpUnordered) return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::SETOGE: return getBoolConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETO: return getBoolConstant(R!=APFloat::cmpUnordered, dl, VT, OpVT); case ISD::SETUO: return getBoolConstant(R==APFloat::cmpUnordered, dl, VT, OpVT); case ISD::SETUEQ: return getBoolConstant(R==APFloat::cmpUnordered || R==APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETUNE: return getBoolConstant(R!=APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETULT: return getBoolConstant(R==APFloat::cmpUnordered || R==APFloat::cmpLessThan, dl, VT, OpVT); case ISD::SETUGT: return getBoolConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpUnordered, dl, VT, OpVT); case ISD::SETULE: return getBoolConstant(R!=APFloat::cmpGreaterThan, dl, VT, OpVT); case ISD::SETUGE: return getBoolConstant(R!=APFloat::cmpLessThan, dl, VT, OpVT); } } else { // Ensure that the constant occurs on the RHS. 
ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond); MVT CompVT = N1.getValueType().getSimpleVT(); if (!TLI->isCondCodeLegal(SwappedCond, CompVT)) return SDValue(); return getSetCC(dl, VT, N2, N1, SwappedCond); } } // Could not fold it. return SDValue(); } /// See if the specified operand can be simplified with the knowledge that only /// the bits specified by Mask are used. SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &Mask) { switch (V.getOpcode()) { default: break; case ISD::Constant: { const ConstantSDNode *CV = cast(V.getNode()); assert(CV && "Const value should be ConstSDNode."); const APInt &CVal = CV->getAPIntValue(); APInt NewVal = CVal & Mask; if (NewVal != CVal) return getConstant(NewVal, SDLoc(V), V.getValueType()); break; } case ISD::OR: case ISD::XOR: // If the LHS or RHS don't contribute bits to the or, drop them. if (MaskedValueIsZero(V.getOperand(0), Mask)) return V.getOperand(1); if (MaskedValueIsZero(V.getOperand(1), Mask)) return V.getOperand(0); break; case ISD::SRL: // Only look at single-use SRLs. if (!V.getNode()->hasOneUse()) break; if (ConstantSDNode *RHSC = dyn_cast(V.getOperand(1))) { // See if we can recursively simplify the LHS. unsigned Amt = RHSC->getZExtValue(); // Watch out for shift count overflow though. if (Amt >= Mask.getBitWidth()) break; APInt NewMask = Mask << Amt; if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask)) return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } break; case ISD::AND: { // X & -1 -> X (ignoring bits which aren't demanded). ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1)); if (AndVal && Mask.isSubsetOf(AndVal->getAPIntValue())) return V.getOperand(0); break; } case ISD::ANY_EXTEND: { SDValue Src = V.getOperand(0); unsigned SrcBitWidth = Src.getScalarValueSizeInBits(); // Being conservative here - only peek through if we only demand bits in the // non-extended source (even though the extended bits are technically undef). if (Mask.getActiveBits() > SrcBitWidth) break; APInt SrcMask = Mask.trunc(SrcBitWidth); if (SDValue DemandedSrc = GetDemandedBits(Src, SrcMask)) return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc); break; } } return SDValue(); } /// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We /// use this predicate to simplify operations downstream. bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use /// this predicate to simplify operations downstream. Mask is known to be zero /// for bits that V cannot have. bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { KnownBits Known; computeKnownBits(Op, Known, Depth); return Mask.isSubsetOf(Known.Zero); } /// Helper function that checks to see if a node is a constant or a /// build vector of splat constants at least within the demanded elts. 
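
// Illustrative sketch (editorial, not part of this patch): the demanded-bits
// bookkeeping in the ISD::SRL case of GetDemandedBits above. If only the bits
// in Mask of (X >> Amt) are used, then only the bits in (Mask << Amt) of X are
// used, so the source can be simplified against that shifted mask.
#include <cstdint>

static uint64_t demandedBitsOfShiftedSource(uint64_t DemandedOfResult,
                                            unsigned Amt) {
  // Guard against shift amounts that cover the whole width, as the DAG code
  // does before recursing.
  return Amt >= 64 ? 0 : (DemandedOfResult << Amt);
}
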
static ConstantSDNode *isConstOrDemandedConstSplat(SDValue N, const APInt &DemandedElts) { if (ConstantSDNode *CN = dyn_cast(N)) return CN; if (N.getOpcode() != ISD::BUILD_VECTOR) return nullptr; EVT VT = N.getValueType(); ConstantSDNode *Cst = nullptr; unsigned NumElts = VT.getVectorNumElements(); assert(DemandedElts.getBitWidth() == NumElts && "Unexpected vector size"); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; ConstantSDNode *C = dyn_cast(N.getOperand(i)); if (!C || (Cst && Cst->getAPIntValue() != C->getAPIntValue()) || C->getValueType(0) != VT.getScalarType()) return nullptr; Cst = C; } return Cst; } /// If a SHL/SRA/SRL node has a constant or splat constant shift amount that /// is less than the element bit-width of the shift node, return it. static const APInt *getValidShiftAmountConstant(SDValue V) { if (ConstantSDNode *SA = isConstOrConstSplat(V.getOperand(1))) { // Shifting more than the bitwidth is not valid. const APInt &ShAmt = SA->getAPIntValue(); if (ShAmt.ult(V.getScalarValueSizeInBits())) return &ShAmt; } return nullptr; } /// Determine which bits of Op are known to be either zero or one and return /// them in Known. For vectors, the known bits are those that are shared by /// every vector element. void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, unsigned Depth) const { EVT VT = Op.getValueType(); APInt DemandedElts = VT.isVector() ? APInt::getAllOnesValue(VT.getVectorNumElements()) : APInt(1, 1); computeKnownBits(Op, Known, DemandedElts, Depth); } /// Determine which bits of Op are known to be either zero or one and return /// them in Known. The DemandedElts argument allows us to only collect the known /// bits that are shared by the requested vector elements. void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, const APInt &DemandedElts, unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); Known = KnownBits(BitWidth); // Don't know anything. if (auto *C = dyn_cast(Op)) { // We know all of the bits for a constant! Known.One = C->getAPIntValue(); Known.Zero = ~Known.One; return; } if (auto *C = dyn_cast(Op)) { // We know all of the bits for a constant fp! Known.One = C->getValueAPF().bitcastToAPInt(); Known.Zero = ~Known.One; return; } if (Depth == 6) return; // Limit search depth. KnownBits Known2; unsigned NumElts = DemandedElts.getBitWidth(); if (!DemandedElts) return; // No demanded elts, better to assume we don't know anything. unsigned Opcode = Op.getOpcode(); switch (Opcode) { case ISD::BUILD_VECTOR: // Collect the known bits that are shared by every demanded vector element. assert(NumElts == Op.getValueType().getVectorNumElements() && "Unexpected vector size"); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { if (!DemandedElts[i]) continue; SDValue SrcOp = Op.getOperand(i); computeKnownBits(SrcOp, Known2, Depth + 1); // BUILD_VECTOR can implicitly truncate sources, we must handle this. if (SrcOp.getValueSizeInBits() != BitWidth) { assert(SrcOp.getValueSizeInBits() > BitWidth && "Expected BUILD_VECTOR implicit truncation"); Known2 = Known2.trunc(BitWidth); } // Known bits are the values that are shared by every demanded element. Known.One &= Known2.One; Known.Zero &= Known2.Zero; // If we don't know any bits, early out. if (Known.isUnknown()) break; } break; case ISD::VECTOR_SHUFFLE: { // Collect the known bits that are shared by every vector element referenced // by the shuffle. 
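
// Illustrative sketch (editorial, not part of this patch): the KnownBits
// bookkeeping used throughout computeKnownBits above. Every bit of a constant
// is known, and combining two possible sources keeps only the facts that hold
// for both of them. ToyKnownBits is an invented stand-in for llvm::KnownBits.
#include <cstdint>

struct ToyKnownBits {
  uint64_t Zero; // bits known to be 0
  uint64_t One;  // bits known to be 1
};

static ToyKnownBits knownBitsOfConstant(uint64_t C) {
  return {~C, C};
}

// A bit stays "known" only if both sources agree on its value.
static ToyKnownBits intersectKnown(ToyKnownBits A, ToyKnownBits B) {
  return {A.Zero & B.Zero, A.One & B.One};
}
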
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); Known.Zero.setAllBits(); Known.One.setAllBits(); const ShuffleVectorSDNode *SVN = cast(Op); assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; int M = SVN->getMaskElt(i); if (M < 0) { // For UNDEF elements, we don't know anything about the common state of // the shuffle result. Known.resetAll(); DemandedLHS.clearAllBits(); DemandedRHS.clearAllBits(); break; } if ((unsigned)M < NumElts) DemandedLHS.setBit((unsigned)M % NumElts); else DemandedRHS.setBit((unsigned)M % NumElts); } // Known bits are the values that are shared by every demanded element. if (!!DemandedLHS) { SDValue LHS = Op.getOperand(0); computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } // If we don't know any bits, early out. if (Known.isUnknown()) break; if (!!DemandedRHS) { SDValue RHS = Op.getOperand(1); computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } break; } case ISD::CONCAT_VECTORS: { // Split DemandedElts and test each of the demanded subvectors. Known.Zero.setAllBits(); Known.One.setAllBits(); EVT SubVectorVT = Op.getOperand(0).getValueType(); unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); unsigned NumSubVectors = Op.getNumOperands(); for (unsigned i = 0; i != NumSubVectors; ++i) { APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts); DemandedSub = DemandedSub.trunc(NumSubVectorElts); if (!!DemandedSub) { SDValue Sub = Op.getOperand(i); computeKnownBits(Sub, Known2, DemandedSub, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } // If we don't know any bits, early out. if (Known.isUnknown()) break; } break; } case ISD::INSERT_SUBVECTOR: { // If we know the element index, demand any elements from the subvector and // the remainder from the src its inserted into, otherwise demand them all. SDValue Src = Op.getOperand(0); SDValue Sub = Op.getOperand(1); ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(2)); unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) { Known.One.setAllBits(); Known.Zero.setAllBits(); uint64_t Idx = SubIdx->getZExtValue(); APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx); if (!!DemandedSubElts) { computeKnownBits(Sub, Known, DemandedSubElts, Depth + 1); if (Known.isUnknown()) break; // early-out. } APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts); APInt DemandedSrcElts = DemandedElts & ~SubMask; if (!!DemandedSrcElts) { computeKnownBits(Src, Known2, DemandedSrcElts, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } } else { computeKnownBits(Sub, Known, Depth + 1); if (Known.isUnknown()) break; // early-out. computeKnownBits(Src, Known2, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } break; } case ISD::EXTRACT_SUBVECTOR: { // If we know the element index, just demand that subvector elements, // otherwise demand them all. SDValue Src = Op.getOperand(0); ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { // Offset the demanded elts by the subvector index. 
      uint64_t Idx = SubIdx->getZExtValue();
-      APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+      APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
      computeKnownBits(Src, Known, DemandedSrc, Depth + 1);
    } else {
      computeKnownBits(Src, Known, Depth + 1);
    }
    break;
  }
  case ISD::BITCAST: {
    SDValue N0 = Op.getOperand(0);
    EVT SubVT = N0.getValueType();
    unsigned SubBitWidth = SubVT.getScalarSizeInBits();

    // Ignore bitcasts from unsupported types.
    if (!(SubVT.isInteger() || SubVT.isFloatingPoint()))
      break;

    // Fast handling of 'identity' bitcasts.
    if (BitWidth == SubBitWidth) {
      computeKnownBits(N0, Known, DemandedElts, Depth + 1);
      break;
    }

    bool IsLE = getDataLayout().isLittleEndian();

    // Bitcast 'small element' vector to 'large element' scalar/vector.
    if ((BitWidth % SubBitWidth) == 0) {
      assert(N0.getValueType().isVector() && "Expected bitcast from vector");

      // Collect known bits for the (larger) output by collecting the known
      // bits from each set of sub elements and shift these into place.
      // We need to separately call computeKnownBits for each set of
      // sub elements as the knownbits for each is likely to be different.
      unsigned SubScale = BitWidth / SubBitWidth;
      APInt SubDemandedElts(NumElts * SubScale, 0);
      for (unsigned i = 0; i != NumElts; ++i)
        if (DemandedElts[i])
          SubDemandedElts.setBit(i * SubScale);

      for (unsigned i = 0; i != SubScale; ++i) {
        computeKnownBits(N0, Known2, SubDemandedElts.shl(i), Depth + 1);
        unsigned Shifts = IsLE ? i : SubScale - 1 - i;
        Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * Shifts);
        Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * Shifts);
      }
    }

    // Bitcast 'large element' scalar/vector to 'small element' vector.
    if ((SubBitWidth % BitWidth) == 0) {
      assert(Op.getValueType().isVector() && "Expected bitcast to vector");

      // Collect known bits for the (smaller) output by collecting the known
      // bits from the overlapping larger input elements and extracting the
      // sub sections we actually care about.
      unsigned SubScale = SubBitWidth / BitWidth;
      APInt SubDemandedElts(NumElts / SubScale, 0);
      for (unsigned i = 0; i != NumElts; ++i)
        if (DemandedElts[i])
          SubDemandedElts.setBit(i / SubScale);

      computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1);

      Known.Zero.setAllBits(); Known.One.setAllBits();
      for (unsigned i = 0; i != NumElts; ++i)
        if (DemandedElts[i]) {
          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
          unsigned Offset = (Shifts % SubScale) * BitWidth;
          Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
          Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
          // If we don't know any bits, early out.
          if (Known.isUnknown())
            break;
        }
    }
    break;
  }
  case ISD::AND:
    // If either the LHS or the RHS are Zero, the result is zero.
    computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

    // Output known-1 bits are only known if set in both the LHS & RHS.
    Known.One &= Known2.One;
    // Output known-0 are known to be clear if zero in either the LHS | RHS.
    Known.Zero |= Known2.Zero;
    break;
  case ISD::OR:
    computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);

    // Output known-0 bits are only known if clear in both the LHS & RHS.
    Known.Zero &= Known2.Zero;
    // Output known-1 are known to be set if set in either the LHS | RHS.
Known.One |= Known2.One; break; case ISD::XOR: { computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); Known.Zero = KnownZeroOut; break; } case ISD::MUL: { computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If low bits are zero in either operand, output low known-0 bits. // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. unsigned TrailZ = Known.countMinTrailingZeros() + Known2.countMinTrailingZeros(); unsigned LeadZ = std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; Known.resetAll(); Known.Zero.setLowBits(std::min(TrailZ, BitWidth)); Known.Zero.setHighBits(std::min(LeadZ, BitWidth)); break; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); unsigned LeadZ = Known2.countMinLeadingZeros(); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); if (RHSMaxLeadingZeros != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; } case ISD::SELECT: case ISD::VSELECT: computeKnownBits(Op.getOperand(2), Known, DemandedElts, Depth+1); // If we don't know any bits, early out. if (Known.isUnknown()) break; computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth+1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; break; case ISD::SELECT_CC: computeKnownBits(Op.getOperand(3), Known, DemandedElts, Depth+1); // If we don't know any bits, early out. if (Known.isUnknown()) break; computeKnownBits(Op.getOperand(2), Known2, DemandedElts, Depth+1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; break; case ISD::SMULO: case ISD::UMULO: case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. // If we know the result of a setcc has the top bits zero, use this info. // We know that we have an integer-based boolean since these operations // are only available for integer. if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) Known.Zero.setBitsFrom(1); break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) Known.Zero.setBitsFrom(1); break; case ISD::SHL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); unsigned Shift = ShAmt->getZExtValue(); Known.Zero <<= Shift; Known.One <<= Shift; // Low bits are known zero. 
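
// Illustrative sketch (editorial, not part of this patch): known bits through
// a left shift by a constant, as in the ISD::SHL case above. Both known masks
// move with the value and the vacated low bits become known zeros.
// KnownMasks and knownBitsShl are invented names for this example.
#include <cstdint>

struct KnownMasks {
  uint64_t Zero; // bits known to be 0
  uint64_t One;  // bits known to be 1
};

static KnownMasks knownBitsShl(KnownMasks In, unsigned Shift) {
  if (Shift >= 64)
    return {~uint64_t(0), 0}; // everything shifted out: result is zero
  uint64_t LowZeros = (uint64_t(1) << Shift) - 1;
  return {(In.Zero << Shift) | LowZeros, In.One << Shift};
}
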
Known.Zero.setLowBits(Shift); } break; case ISD::SRL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); unsigned Shift = ShAmt->getZExtValue(); Known.Zero.lshrInPlace(Shift); Known.One.lshrInPlace(Shift); // High bits are known zero. Known.Zero.setHighBits(Shift); } else if (auto *BV = dyn_cast(Op.getOperand(1))) { // If the shift amount is a vector of constants see if we can bound // the number of upper zero bits. unsigned ShiftAmountMin = BitWidth; for (unsigned i = 0; i != BV->getNumOperands(); ++i) { if (auto *C = dyn_cast(BV->getOperand(i))) { const APInt &ShAmt = C->getAPIntValue(); if (ShAmt.ult(BitWidth)) { ShiftAmountMin = std::min(ShiftAmountMin, ShAmt.getZExtValue()); continue; } } // Don't know anything. ShiftAmountMin = 0; break; } Known.Zero.setHighBits(ShiftAmountMin); } break; case ISD::SRA: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); unsigned Shift = ShAmt->getZExtValue(); // Sign extend known zero/one bit (else is unknown). Known.Zero.ashrInPlace(Shift); Known.One.ashrInPlace(Shift); } break; case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(Op.getOperand(1))->getVT(); unsigned EBits = EVT.getScalarSizeInBits(); // Sign extension. Compute the demanded bits in the result that are not // present in the input. APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits); APInt InSignMask = APInt::getSignMask(EBits); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits); // If the sign extended bits are demanded, we know that the sign // bit is demanded. InSignMask = InSignMask.zext(BitWidth); if (NewBits.getBoolValue()) InputDemandedBits |= InSignMask; computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); Known.One &= InputDemandedBits; Known.Zero &= InputDemandedBits; // If the sign bit of the input is known set or clear, then we know the // top bits of the result. if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear Known.Zero |= NewBits; Known.One &= ~NewBits; } else if (Known.One.intersects(InSignMask)) { // Input sign bit known set Known.One |= NewBits; Known.Zero &= ~NewBits; } else { // Input sign bit unknown Known.Zero &= ~NewBits; Known.One &= ~NewBits; } break; } case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. unsigned PossibleTZ = Known2.countMaxTrailingZeros(); unsigned LowBits = Log2_32(PossibleTZ) + 1; Known.Zero.setBitsFrom(LowBits); break; } case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. unsigned PossibleLZ = Known2.countMaxLeadingZeros(); unsigned LowBits = Log2_32(PossibleLZ) + 1; Known.Zero.setBitsFrom(LowBits); break; } case ISD::CTPOP: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we know some of the bits are zero, they can't be one. unsigned PossibleOnes = Known2.countMaxPopulation(); Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1); break; } case ISD::LOAD: { LoadSDNode *LD = cast(Op); // If this is a ZEXTLoad and we are looking at the loaded value. 
if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) { EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); Known.Zero.setBitsFrom(MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { if (LD->getExtensionType() == ISD::NON_EXTLOAD) computeKnownBitsFromRangeMetadata(*Ranges, Known); } break; } case ISD::ZERO_EXTEND_VECTOR_INREG: { EVT InVT = Op.getOperand(0).getValueType(); APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements()); computeKnownBits(Op.getOperand(0), Known, InDemandedElts, Depth + 1); Known = Known.zext(BitWidth); Known.Zero.setBitsFrom(InVT.getScalarSizeInBits()); break; } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); Known = Known.zext(BitWidth); Known.Zero.setBitsFrom(InVT.getScalarSizeInBits()); break; } // TODO ISD::SIGN_EXTEND_VECTOR_INREG case ISD::SIGN_EXTEND: { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); // If the sign bit is known to be zero or one, then sext will extend // it to the top bits, else it will just zext. Known = Known.sext(BitWidth); break; } case ISD::ANY_EXTEND: { computeKnownBits(Op.getOperand(0), Known, Depth+1); Known = Known.zext(BitWidth); break; } case ISD::TRUNCATE: { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); Known = Known.trunc(BitWidth); break; } case ISD::AssertZext: { EVT VT = cast(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); computeKnownBits(Op.getOperand(0), Known, Depth+1); Known.Zero |= (~InMask); Known.One &= (~Known.Zero); break; } case ISD::FGETSIGN: // All bits are zero except the low bit. Known.Zero.setBitsFrom(1); break; case ISD::USUBO: case ISD::SSUBO: if (Op.getResNo() == 1) { // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) Known.Zero.setBitsFrom(1); break; } LLVM_FALLTHROUGH; case ISD::SUB: case ISD::SUBC: { if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is // positive if we can prove that X is >= 0 and < 16. if (CLHS->getAPIntValue().isNonNegative()) { unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is // from [0-C]. if ((Known2.Zero & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. Known.Zero.setHighBits(NLZ2); } } } // If low bits are know to be zero in both operands, then we know they are // going to be 0 in the result. Both addition and complement operations // preserve the low zero bits. 
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); unsigned KnownZeroLow = Known2.countMinTrailingZeros(); if (KnownZeroLow == 0) break; computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); Known.Zero.setLowBits(KnownZeroLow); break; } case ISD::UADDO: case ISD::SADDO: case ISD::ADDCARRY: if (Op.getResNo() == 1) { // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) Known.Zero.setBitsFrom(1); break; } LLVM_FALLTHROUGH; case ISD::ADD: case ISD::ADDC: case ISD::ADDE: { // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the // low 3 bits clear. // Output known-0 bits are also known if the top bits of each input are // known to be clear. For example, if one input has the top 10 bits clear // and the other has the top 8 bits clear, we know the top 7 bits of the // output must be clear. computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); unsigned KnownZeroHigh = Known2.countMinLeadingZeros(); unsigned KnownZeroLow = Known2.countMinTrailingZeros(); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros()); KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) { // With ADDE and ADDCARRY, a carry bit may be added in, so we can only // use this information if we know (at least) that the low two bits are // clear. We then return to the caller that the low bit is unknown but // that other bits are known zero. if (KnownZeroLow >= 2) Known.Zero.setBits(1, KnownZeroLow); break; } Known.Zero.setLowBits(KnownZeroLow); if (KnownZeroHigh > 1) Known.Zero.setHighBits(KnownZeroHigh - 1); break; } case ISD::SREM: if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // The low bits of the first operand are unchanged by the srem. Known.Zero = Known2.Zero & LowBits; Known.One = Known2.One & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits)) Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0)) Known.One |= ~LowBits; assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?"); } } break; case ISD::UREM: { if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // The upper bits are all zero, the lower ones are unchanged. Known.Zero = Known2.Zero | ~LowBits; Known.One = Known2.One & LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. 
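// Illustrative aside (standalone sketch, not from the patched file): the UREM
// fallback above relies on a % b being no larger than a and strictly smaller
// than b (for b != 0), so the remainder can never have fewer leading zero bits
// than either operand.  Exhaustive check over 8-bit values:
#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 1; B < 256; ++B)
      assert(A % B <= A && A % B < B);   // remainder bounded by both operands
  return 0;
}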
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); uint32_t Leaders = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; } case ISD::EXTRACT_ELEMENT: { computeKnownBits(Op.getOperand(0), Known, Depth+1); const unsigned Index = Op.getConstantOperandVal(1); const unsigned BitWidth = Op.getValueSizeInBits(); // Remove low part of known bits mask Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth); Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth); // Remove high part of known bit mask Known = Known.trunc(BitWidth); break; } case ISD::EXTRACT_VECTOR_ELT: { SDValue InVec = Op.getOperand(0); SDValue EltNo = Op.getOperand(1); EVT VecVT = InVec.getValueType(); const unsigned BitWidth = Op.getValueSizeInBits(); const unsigned EltBitWidth = VecVT.getScalarSizeInBits(); const unsigned NumSrcElts = VecVT.getVectorNumElements(); // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know // anything about the extended bits. if (BitWidth > EltBitWidth) Known = Known.trunc(EltBitWidth); ConstantSDNode *ConstEltNo = dyn_cast(EltNo); if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) { // If we know the element index, just demand that vector element. unsigned Idx = ConstEltNo->getZExtValue(); APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); computeKnownBits(InVec, Known, DemandedElt, Depth + 1); } else { // Unknown element index, so ignore DemandedElts and demand them all. computeKnownBits(InVec, Known, Depth + 1); } if (BitWidth > EltBitWidth) Known = Known.zext(BitWidth); break; } case ISD::INSERT_VECTOR_ELT: { SDValue InVec = Op.getOperand(0); SDValue InVal = Op.getOperand(1); SDValue EltNo = Op.getOperand(2); ConstantSDNode *CEltNo = dyn_cast(EltNo); if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { // If we know the element index, split the demand between the // source vector and the inserted element. Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth); unsigned EltIdx = CEltNo->getZExtValue(); // If we demand the inserted element then add its common known bits. if (DemandedElts[EltIdx]) { computeKnownBits(InVal, Known2, Depth + 1); Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth()); Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth()); } // If we demand the source vector then add its common known bits, ensuring // that we don't demand the inserted element. APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx)); if (!!VectorElts) { computeKnownBits(InVec, Known2, VectorElts, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } } else { // Unknown element index, so ignore DemandedElts and demand them all. 
computeKnownBits(InVec, Known, Depth + 1); computeKnownBits(InVal, Known2, Depth + 1); Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth()); Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth()); } break; } case ISD::BITREVERSE: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); Known.Zero = Known2.Zero.reverseBits(); Known.One = Known2.One.reverseBits(); break; } case ISD::BSWAP: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); Known.Zero = Known2.Zero.byteSwap(); Known.One = Known2.One.byteSwap(); break; } case ISD::ABS: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If the source's MSB is zero then we know the rest of the bits already. if (Known2.isNonNegative()) { Known.Zero = Known2.Zero; Known.One = Known2.One; break; } // We only know that the absolute values's MSB will be zero iff there is // a set bit that isn't the sign bit (otherwise it could be INT_MIN). Known2.One.clearSignBit(); if (Known2.One.getBoolValue()) { Known.Zero = APInt::getSignMask(BitWidth); break; } break; } case ISD::UMIN: { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); // UMIN - we know that the result will have the maximum of the // known zero leading bits of the inputs. unsigned LeadZero = Known.countMinLeadingZeros(); LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros()); Known.Zero &= Known2.Zero; Known.One &= Known2.One; Known.Zero.setHighBits(LeadZero); break; } case ISD::UMAX: { computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); // UMAX - we know that the result will have the maximum of the // known one leading bits of the inputs. unsigned LeadOne = Known.countMinLeadingOnes(); LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes()); Known.Zero &= Known2.Zero; Known.One &= Known2.One; Known.One.setHighBits(LeadOne); break; } case ISD::SMIN: case ISD::SMAX: { // If we have a clamp pattern, we know that the number of sign bits will be // the minimum of the clamp min/max range. bool IsMax = (Opcode == ISD::SMAX); ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr; if ((CstLow = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts))) if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX)) CstHigh = isConstOrDemandedConstSplat(Op.getOperand(0).getOperand(1), DemandedElts); if (CstLow && CstHigh) { if (!IsMax) std::swap(CstLow, CstHigh); const APInt &ValueLow = CstLow->getAPIntValue(); const APInt &ValueHigh = CstHigh->getAPIntValue(); if (ValueLow.sle(ValueHigh)) { unsigned LowSignBits = ValueLow.getNumSignBits(); unsigned HighSignBits = ValueHigh.getNumSignBits(); unsigned MinSignBits = std::min(LowSignBits, HighSignBits); if (ValueLow.isNegative() && ValueHigh.isNegative()) { Known.One.setHighBits(MinSignBits); break; } if (ValueLow.isNonNegative() && ValueHigh.isNonNegative()) { Known.Zero.setHighBits(MinSignBits); break; } } } // Fallback - just get the shared known bits of the operands. 
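// Illustrative aside (standalone sketch, not from the patched file): the ABS
// case above in 8-bit form.  If some bit other than the sign bit is known to
// be one, the value cannot be INT8_MIN, so |v| has its sign bit clear.
#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V < 128; ++V) {
    uint8_t Bits = static_cast<uint8_t>(V);
    if (Bits & 0x7F) {                               // a non-sign bit is set
      uint8_t Abs = static_cast<uint8_t>(V < 0 ? -V : V);
      assert((Abs & 0x80) == 0);                     // so the MSB of |V| is 0
    }
  }
  return 0;
}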
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); if (Known.isUnknown()) break; // Early-out computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); Known.Zero &= Known2.Zero; Known.One &= Known2.One; break; } case ISD::FrameIndex: case ISD::TargetFrameIndex: TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth); break; default: if (Opcode < ISD::BUILTIN_OP_END) break; LLVM_FALLTHROUGH; case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth); break; } assert(!Known.hasConflict() && "Bits known to be one AND zero?"); } SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, SDValue N1) const { // X + 0 never overflow if (isNullConstant(N1)) return OFK_Never; KnownBits N1Known; computeKnownBits(N1, N1Known); if (N1Known.Zero.getBoolValue()) { KnownBits N0Known; computeKnownBits(N0, N0Known); bool overflow; (void)(~N0Known.Zero).uadd_ov(~N1Known.Zero, overflow); if (!overflow) return OFK_Never; } // mulhi + 1 never overflow if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && (~N1Known.Zero & 0x01) == ~N1Known.Zero) return OFK_Never; if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { KnownBits N0Known; computeKnownBits(N0, N0Known); if ((~N0Known.Zero & 0x01) == ~N0Known.Zero) return OFK_Never; } return OFK_Sometime; } bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { EVT OpVT = Val.getValueType(); unsigned BitWidth = OpVT.getScalarSizeInBits(); // Is the constant a known power of 2? if (ConstantSDNode *Const = dyn_cast(Val)) return Const->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2(); // A left-shift of a constant one will have exactly one bit set because // shifting the bit off the end is undefined. if (Val.getOpcode() == ISD::SHL) { auto *C = isConstOrConstSplat(Val.getOperand(0)); if (C && C->getAPIntValue() == 1) return true; } // Similarly, a logical right-shift of a constant sign-bit will have exactly // one bit set. if (Val.getOpcode() == ISD::SRL) { auto *C = isConstOrConstSplat(Val.getOperand(0)); if (C && C->getAPIntValue().isSignMask()) return true; } // Are all operands of a build vector constant powers of two? if (Val.getOpcode() == ISD::BUILD_VECTOR) if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) { if (ConstantSDNode *C = dyn_cast(E)) return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2(); return false; })) return true; // More could be done here, though the above checks are enough // to handle some common cases. // Fall back to computeKnownBits to catch other known cases. KnownBits Known; computeKnownBits(Val, Known); return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1); } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { EVT VT = Op.getValueType(); APInt DemandedElts = VT.isVector() ? 
APInt::getAllOnesValue(VT.getVectorNumElements()) : APInt(1, 1); return ComputeNumSignBits(Op, DemandedElts, Depth); } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, unsigned Depth) const { EVT VT = Op.getValueType(); assert((VT.isInteger() || VT.isFloatingPoint()) && "Invalid VT!"); unsigned VTBits = VT.getScalarSizeInBits(); unsigned NumElts = DemandedElts.getBitWidth(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; if (auto *C = dyn_cast(Op)) { const APInt &Val = C->getAPIntValue(); return Val.getNumSignBits(); } if (Depth == 6) return 1; // Limit search depth. if (!DemandedElts) return 1; // No demanded elts, better to assume we don't know anything. unsigned Opcode = Op.getOpcode(); switch (Opcode) { default: break; case ISD::AssertSext: Tmp = cast(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp+1; case ISD::AssertZext: Tmp = cast(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp; case ISD::BUILD_VECTOR: Tmp = VTBits; for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) { if (!DemandedElts[i]) continue; SDValue SrcOp = Op.getOperand(i); Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1); // BUILD_VECTOR can implicitly truncate sources, we must handle this. if (SrcOp.getValueSizeInBits() != VTBits) { assert(SrcOp.getValueSizeInBits() > VTBits && "Expected BUILD_VECTOR implicit truncation"); unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1); } Tmp = std::min(Tmp, Tmp2); } return Tmp; case ISD::VECTOR_SHUFFLE: { // Collect the minimum number of sign bits that are shared by every vector // element referenced by the shuffle. APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); const ShuffleVectorSDNode *SVN = cast(Op); assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); for (unsigned i = 0; i != NumElts; ++i) { int M = SVN->getMaskElt(i); if (!DemandedElts[i]) continue; // For UNDEF elements, we don't know anything about the common state of // the shuffle result. if (M < 0) return 1; if ((unsigned)M < NumElts) DemandedLHS.setBit((unsigned)M % NumElts); else DemandedRHS.setBit((unsigned)M % NumElts); } Tmp = std::numeric_limits::max(); if (!!DemandedLHS) Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); if (!!DemandedRHS) { Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); Tmp = std::min(Tmp, Tmp2); } // If we don't know anything, early out and try computeKnownBits fall-back. if (Tmp == 1) break; assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); return Tmp; } case ISD::BITCAST: { SDValue N0 = Op.getOperand(0); EVT SrcVT = N0.getValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); // Ignore bitcasts from unsupported types.. if (!(SrcVT.isInteger() || SrcVT.isFloatingPoint())) break; // Fast handling of 'identity' bitcasts. if (VTBits == SrcBits) return ComputeNumSignBits(N0, DemandedElts, Depth + 1); // Bitcast 'large element' scalar/vector to 'small element' vector. // TODO: Handle cases other than 'sign splat' when we have a use case. // Requires handling of DemandedElts and Endianness. 
if ((SrcBits % VTBits) == 0) { assert(Op.getValueType().isVector() && "Expected bitcast to vector"); Tmp = ComputeNumSignBits(N0, Depth + 1); if (Tmp == SrcBits) return VTBits; } break; } case ISD::SIGN_EXTEND: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp; case ISD::SIGN_EXTEND_INREG: // Max of the input and what this extends. Tmp = cast(Op.getOperand(1))->getVT().getScalarSizeInBits(); Tmp = VTBits-Tmp+1; Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); return std::max(Tmp, Tmp2); case ISD::SIGN_EXTEND_VECTOR_INREG: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt DemandedSrcElts = DemandedElts.zext(SrcVT.getVectorNumElements()); Tmp = VTBits - SrcVT.getScalarSizeInBits(); return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp; } case ISD::SRA: Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); // SRA X, C -> adds C sign bits. if (ConstantSDNode *C = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)) { APInt ShiftVal = C->getAPIntValue(); ShiftVal += Tmp; Tmp = ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); } return Tmp; case ISD::SHL: if (ConstantSDNode *C = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)) { // shl destroys sign bits. Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); if (C->getAPIntValue().uge(VTBits) || // Bad shift. C->getAPIntValue().uge(Tmp)) break; // Shifted all sign bits out. return Tmp - C->getZExtValue(); } break; case ISD::AND: case ISD::OR: case ISD::XOR: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); if (Tmp != 1) { Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses // computeKnownBits, and pick whichever answer is better. } break; case ISD::SELECT: case ISD::VSELECT: Tmp = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1); return std::min(Tmp, Tmp2); case ISD::SELECT_CC: Tmp = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth+1); return std::min(Tmp, Tmp2); case ISD::SMIN: case ISD::SMAX: { // If we have a clamp pattern, we know that the number of sign bits will be // the minimum of the clamp min/max range. bool IsMax = (Opcode == ISD::SMAX); ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr; if ((CstLow = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts))) if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX)) CstHigh = isConstOrDemandedConstSplat(Op.getOperand(0).getOperand(1), DemandedElts); if (CstLow && CstHigh) { if (!IsMax) std::swap(CstLow, CstHigh); if (CstLow->getAPIntValue().sle(CstHigh->getAPIntValue())) { Tmp = CstLow->getAPIntValue().getNumSignBits(); Tmp2 = CstHigh->getAPIntValue().getNumSignBits(); return std::min(Tmp, Tmp2); } } // Fallback - just get the minimum number of sign bits of the operands. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); if (Tmp == 1) return 1; // Early out. 
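// Illustrative aside (standalone sketch, not from the patched file): a scalar
// model of the SRA/SHL cases above.  numSignBits is an ad-hoc helper, not an
// LLVM API; it counts how many copies of the sign bit sit at the top of a
// 32-bit value.  An arithmetic shift right by C adds C copies (saturating at
// the bit width), a shift left removes up to C of them.
#include <cassert>
#include <cstdint>

static unsigned numSignBits(int32_t X) {
  uint32_t U = static_cast<uint32_t>(X < 0 ? ~X : X);
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(U & Bit); Bit >>= 1)
    ++N;                        // leading bits equal to the sign bit
  return N;                     // 32 only for 0 and -1
}

int main() {
  int32_t X = -5;                                        // 0xFFFFFFFB
  assert(numSignBits(X) == 29);
  assert(numSignBits(X >> 3) == 32);     // SRA by 3: 29 + 3, capped at 32
                                         // (arithmetic shift on LLVM hosts)
  int32_t Shl = static_cast<int32_t>(static_cast<uint32_t>(X) << 2);
  assert(numSignBits(Shl) == 27);        // SHL by 2 destroys 2 sign bits
  return 0;
}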
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1); return std::min(Tmp, Tmp2); } case ISD::UMIN: case ISD::UMAX: Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1); return std::min(Tmp, Tmp2); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) break; // The boolean result conforms to getBooleanContents. Fall through. // If setcc returns 0/-1, all bits are sign bits. // We know that we have an integer-based boolean since these operations // are only available for integer. if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; case ISD::SETCC: // If setcc returns 0/-1, all bits are sign bits. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) return VTBits; break; case ISD::ROTL: case ISD::ROTR: if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { unsigned RotAmt = C->getAPIntValue().urem(VTBits); // Handle rotate right by N like a rotate left by 32-N. if (Opcode == ISD::ROTR) RotAmt = (VTBits - RotAmt) % VTBits; // If we aren't rotating out all of the known-in sign bits, return the // number that are left. This handles rotl(sext(x), 1) for example. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt); } break; case ISD::ADD: case ISD::ADDC: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. // Special case decrementing a value (ADD X, -1): if (ConstantSDNode *CRHS = dyn_cast(Op.getOperand(1))) if (CRHS->isAllOnesValue()) { KnownBits Known; computeKnownBits(Op.getOperand(0), Known, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((Known.Zero | 1).isAllOnesValue()) return VTBits; // If we are subtracting one from a positive number, there is no carry // out of the result. if (Known.isNonNegative()) return Tmp; } Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp2 == 1) return 1; return std::min(Tmp, Tmp2)-1; case ISD::SUB: Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1); if (Tmp2 == 1) return 1; // Handle NEG. if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) if (CLHS->isNullValue()) { KnownBits Known; computeKnownBits(Op.getOperand(1), Known, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((Known.Zero | 1).isAllOnesValue()) return VTBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. if (Known.isNonNegative()) return Tmp2; // Otherwise, we treat this like a SUB. } // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. return std::min(Tmp, Tmp2)-1; case ISD::TRUNCATE: { // Check if the sign bits of source go down as far as the truncated value. 
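// Illustrative aside (standalone sketch, not from the patched file): the
// truncation rule that follows, in scalar form -- sign bits extending below
// the bits being dropped survive the truncation, leaving
// NumSrcSignBits - (NumSrcBits - VTBits) of them.
#include <cassert>
#include <cstdint>

int main() {
  int32_t X = -42;                            // 26 sign bits as an i32
  int16_t T = static_cast<int16_t>(X);        // drop the top 16 bits
  assert(T == -42);                           // 26 > 16, so the value survives
  assert((static_cast<uint16_t>(T) >> 6) == 0x3FF);  // >= 26 - 16 = 10 sign bits
  return 0;
}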
unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits(); unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1); if (NumSrcSignBits > (NumSrcBits - VTBits)) return NumSrcSignBits - (NumSrcBits - VTBits); break; } case ISD::EXTRACT_ELEMENT: { const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1); const int BitWidth = Op.getValueSizeInBits(); const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth; // Get reverse index (starting from 1), Op1 value indexes elements from // little end. Sign starts at big end. const int rIndex = Items - 1 - Op.getConstantOperandVal(1); // If the sign portion ends in our element the subtraction gives correct // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } case ISD::INSERT_VECTOR_ELT: { SDValue InVec = Op.getOperand(0); SDValue InVal = Op.getOperand(1); SDValue EltNo = Op.getOperand(2); unsigned NumElts = InVec.getValueType().getVectorNumElements(); ConstantSDNode *CEltNo = dyn_cast(EltNo); if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { // If we know the element index, split the demand between the // source vector and the inserted element. unsigned EltIdx = CEltNo->getZExtValue(); // If we demand the inserted element then get its sign bits. Tmp = std::numeric_limits::max(); if (DemandedElts[EltIdx]) { // TODO - handle implicit truncation of inserted elements. if (InVal.getScalarValueSizeInBits() != VTBits) break; Tmp = ComputeNumSignBits(InVal, Depth + 1); } // If we demand the source vector then get its sign bits, and determine // the minimum. APInt VectorElts = DemandedElts; VectorElts.clearBit(EltIdx); if (!!VectorElts) { Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1); Tmp = std::min(Tmp, Tmp2); } } else { // Unknown element index, so ignore DemandedElts and demand them all. Tmp = ComputeNumSignBits(InVec, Depth + 1); Tmp2 = ComputeNumSignBits(InVal, Depth + 1); Tmp = std::min(Tmp, Tmp2); } assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); return Tmp; } case ISD::EXTRACT_VECTOR_ELT: { SDValue InVec = Op.getOperand(0); SDValue EltNo = Op.getOperand(1); EVT VecVT = InVec.getValueType(); const unsigned BitWidth = Op.getValueSizeInBits(); const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); const unsigned NumSrcElts = VecVT.getVectorNumElements(); // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know // anything about sign bits. But if the sizes match we can derive knowledge // about sign bits from the vector operand. if (BitWidth != EltBitWidth) break; // If we know the element index, just demand that vector element, else for // an unknown element index, ignore DemandedElts and demand them all. APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts); ConstantSDNode *ConstEltNo = dyn_cast(EltNo); if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1); } case ISD::EXTRACT_SUBVECTOR: { // If we know the element index, just demand that subvector elements, // otherwise demand them all. SDValue Src = Op.getOperand(0); ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { // Offset the demanded elts by the subvector index. 
      uint64_t Idx = SubIdx->getZExtValue();
-     APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+     APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
      return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
    }
    return ComputeNumSignBits(Src, Depth + 1);
  }
  case ISD::CONCAT_VECTORS:
    // Determine the minimum number of sign bits across all demanded
    // elts of the input vectors. Early out if the result is already 1.
    Tmp = std::numeric_limits<unsigned>::max();
    EVT SubVectorVT = Op.getOperand(0).getValueType();
    unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
    unsigned NumSubVectors = Op.getNumOperands();
    for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
      APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
      DemandedSub = DemandedSub.trunc(NumSubVectorElts);
      if (!DemandedSub)
        continue;
      Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
      Tmp = std::min(Tmp, Tmp2);
    }
    assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
    return Tmp;
  }

  // If we are looking at the loaded value of the SDNode.
  if (Op.getResNo() == 0) {
    // Handle LOADX separately here. EXTLOAD case will fallthrough.
    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
      unsigned ExtType = LD->getExtensionType();
      switch (ExtType) {
      default: break;
      case ISD::SEXTLOAD:    // '17' bits known
        Tmp = LD->getMemoryVT().getScalarSizeInBits();
        return VTBits-Tmp+1;
      case ISD::ZEXTLOAD:    // '16' bits known
        Tmp = LD->getMemoryVT().getScalarSizeInBits();
        return VTBits-Tmp;
      }
    }
  }

  // Allow the target to implement this method for its nodes.
  if (Opcode >= ISD::BUILTIN_OP_END ||
      Opcode == ISD::INTRINSIC_WO_CHAIN ||
      Opcode == ISD::INTRINSIC_W_CHAIN ||
      Opcode == ISD::INTRINSIC_VOID) {
    unsigned NumBits =
        TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
    if (NumBits > 1)
      FirstAnswer = std::max(FirstAnswer, NumBits);
  }

  // Finally, if we can prove that the top bits of the result are 0's or 1's,
  // use this information.
  KnownBits Known;
  computeKnownBits(Op, Known, DemandedElts, Depth);
  APInt Mask;
  if (Known.isNonNegative()) {        // sign bit is 0
    Mask = Known.Zero;
  } else if (Known.isNegative()) {    // sign bit is 1;
    Mask = Known.One;
  } else {
    // Nothing known.
    return FirstAnswer;
  }

  // Okay, we know that the sign bit in Mask is set.  Use CLZ to determine
  // the number of identical bits in the top of the input value.
  Mask = ~Mask;
  Mask <<= Mask.getBitWidth()-VTBits;
  // Return # leading zeros.  We use 'min' here in case Val was zero before
  // shifting.  We don't want to return '64' as for an i32 "0".
  return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
}

bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
  if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
      !isa<ConstantSDNode>(Op.getOperand(1)))
    return false;

  if (Op.getOpcode() == ISD::OR &&
      !MaskedValueIsZero(Op.getOperand(0),
                     cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue()))
    return false;

  return true;
}

bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
  // If we're told that NaNs won't happen, assume they won't.
  if (getTarget().Options.NoNaNsFPMath)
    return true;

  if (Op->getFlags().hasNoNaNs())
    return true;

  // If the value is a constant, we can obviously see if it is a NaN or not.
  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
    return !C->getValueAPF().isNaN();

  // TODO: Recognize more cases here.

  return false;
}

bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
  assert(Op.getValueType().isFloatingPoint() &&
         "Floating point type expected");

  // If the value is a constant, we can obviously see if it is a zero or not.
// TODO: Add BuildVector support. if (const ConstantFPSDNode *C = dyn_cast(Op)) return !C->isZero(); return false; } bool SelectionDAG::isKnownNeverZero(SDValue Op) const { assert(!Op.getValueType().isFloatingPoint() && "Floating point types unsupported - use isKnownNeverZeroFloat"); // If the value is a constant, we can obviously see if it is a zero or not. if (ISD::matchUnaryPredicate( Op, [](ConstantSDNode *C) { return !C->isNullValue(); })) return true; // TODO: Recognize more cases here. switch (Op.getOpcode()) { default: break; case ISD::OR: if (isKnownNeverZero(Op.getOperand(1)) || isKnownNeverZero(Op.getOperand(0))) return true; break; } return false; } bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { // Check the obvious case. if (A == B) return true; // For for negative and positive zero. if (const ConstantFPSDNode *CA = dyn_cast(A)) if (const ConstantFPSDNode *CB = dyn_cast(B)) if (CA->isZero() && CB->isZero()) return true; // Otherwise they may not be equal. return false; } // FIXME: unify with llvm::haveNoCommonBitsSet. // FIXME: could also handle masked merge pattern (X & ~M) op (Y & M) bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); KnownBits AKnown, BKnown; computeKnownBits(A, AKnown); computeKnownBits(B, BKnown); return (AKnown.Zero | BKnown.Zero).isAllOnesValue(); } static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT, ArrayRef Ops, SelectionDAG &DAG) { assert(!Ops.empty() && "Can't concatenate an empty list of vectors!"); assert(llvm::all_of(Ops, [Ops](SDValue Op) { return Ops[0].getValueType() == Op.getValueType(); }) && "Concatenation of vectors with inconsistent value types!"); assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) == VT.getVectorNumElements() && "Incorrect element count in vector concatenation!"); if (Ops.size() == 1) return Ops[0]; // Concat of UNDEFs is UNDEF. if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) return DAG.getUNDEF(VT); // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be // simplified to one big BUILD_VECTOR. // FIXME: Add support for SCALAR_TO_VECTOR as well. EVT SVT = VT.getScalarType(); SmallVector Elts; for (SDValue Op : Ops) { EVT OpVT = Op.getValueType(); if (Op.isUndef()) Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT)); else if (Op.getOpcode() == ISD::BUILD_VECTOR) Elts.append(Op->op_begin(), Op->op_end()); else return SDValue(); } // BUILD_VECTOR requires all inputs to be of the same type, find the // maximum type and extend them all. for (SDValue Op : Elts) SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); if (SVT.bitsGT(VT.getScalarType())) for (SDValue &Op : Elts) Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT) ? DAG.getZExtOrTrunc(Op, DL, SVT) : DAG.getSExtOrTrunc(Op, DL, SVT); SDValue V = DAG.getBuildVector(VT, DL, Elts); NewSDValueDbgMsg(V, "New node fold concat vectors: ", &DAG); return V; } /// Gets or creates the specified node. 
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, getVTList(VT), None); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) return SDValue(E, 0); auto *N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), getVTList(VT)); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V = SDValue(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, const SDNodeFlags Flags) { // Constant fold unary operations with an integer constant operand. Even // opaque constant will be folded, because the folding of unary operations // doesn't create new constants with different values. Nevertheless, the // opaque flag is preserved during folding to prevent future folding with // other constants. if (ConstantSDNode *C = dyn_cast(Operand)) { const APInt &Val = C->getAPIntValue(); switch (Opcode) { default: break; case ISD::SIGN_EXTEND: return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::TRUNCATE: return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: { APFloat apf(EVTToAPFloatSemantics(VT), APInt::getNullValue(VT.getSizeInBits())); (void)apf.convertFromAPInt(Val, Opcode==ISD::SINT_TO_FP, APFloat::rmNearestTiesToEven); return getConstantFP(apf, DL, VT); } case ISD::BITCAST: if (VT == MVT::f16 && C->getValueType(0) == MVT::i16) return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT); if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT); if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT); break; case ISD::ABS: return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::BITREVERSE: return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTPOP: return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::FP16_TO_FP: { bool Ignored; APFloat FPV(APFloat::IEEEhalf(), (Val.getBitWidth() == 16) ? Val : Val.trunc(16)); // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)FPV.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &Ignored); return getConstantFP(FPV, DL, VT); } } } // Constant fold unary operations with a floating point constant operand. 
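// Illustrative aside (standalone sketch, not from the patched file): the
// BITCAST folds above (and the FP-to-integer ones below) just reinterpret the
// constant's bits in the destination format, which in portable C++ is a
// memcpy-based bit cast.  bitcastToF32 is an ad-hoc helper, not an LLVM API.
#include <cassert>
#include <cstdint>
#include <cstring>

static float bitcastToF32(uint32_t Bits) {
  float F;
  static_assert(sizeof(F) == sizeof(Bits), "i32 <-> f32 only");
  std::memcpy(&F, &Bits, sizeof(F));    // reinterpret, don't convert
  return F;
}

int main() {
  assert(bitcastToF32(0x3F800000u) == 1.0f);   // IEEE-754 single for 1.0
  assert(bitcastToF32(0x00000000u) == 0.0f);
  return 0;
}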
if (ConstantFPSDNode *C = dyn_cast(Operand)) { APFloat V = C->getValueAPF(); // make copy switch (Opcode) { case ISD::FNEG: V.changeSign(); return getConstantFP(V, DL, VT); case ISD::FABS: V.clearSign(); return getConstantFP(V, DL, VT); case ISD::FCEIL: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, DL, VT); break; } case ISD::FTRUNC: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, DL, VT); break; } case ISD::FFLOOR: { APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative); if (fs == APFloat::opOK || fs == APFloat::opInexact) return getConstantFP(V, DL, VT); break; } case ISD::FP_EXTEND: { bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, DL, VT); } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool ignored; APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT); // FIXME need to be more flexible about rounding mode. APFloat::opStatus s = V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored); if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual break; return getConstant(IntVal, DL, VT); } case ISD::BITCAST: if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT); else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32) return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT); else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); break; case ISD::FP_TO_FP16: { bool Ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); return getConstant(V.bitcastToAPInt(), DL, VT); } } } // Constant fold unary operations with a vector integer or float operand. if (BuildVectorSDNode *BV = dyn_cast(Operand)) { if (BV->isConstant()) { switch (Opcode) { default: // FIXME: Entirely reasonable to perform folding of other unary // operations here as the need arises. break; case ISD::FNEG: case ISD::FABS: case ISD::FCEIL: case ISD::FTRUNC: case ISD::FFLOOR: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::TRUNCATE: case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::ABS: case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: { SDValue Ops = { Operand }; if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) return Fold; } } } } unsigned OpOpcode = Operand.getNode()->getOpcode(); switch (Opcode) { case ISD::TokenFactor: case ISD::MERGE_VALUES: case ISD::CONCAT_VECTORS: return Operand; // Factor, merge or concat of one node? No need. case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node"); case ISD::FP_EXTEND: assert(VT.isFloatingPoint() && Operand.getValueType().isFloatingPoint() && "Invalid FP cast!"); if (Operand.getValueType() == VT) return Operand; // noop conversion. 
assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid fpext node, dst < src!"); if (Operand.isUndef()) return getUNDEF(VT); break; case ISD::SIGN_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid SIGN_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); break; case ISD::ZERO_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ZERO_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); break; case ISD::ANY_EXTEND: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ANY_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid anyext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunc x)) -> x if (OpOpcode == ISD::TRUNCATE) { SDValue OpOp = Operand.getOperand(0); if (OpOp.getValueType() == VT) { transferDbgValues(Operand, OpOp); return OpOp; } } break; case ISD::TRUNCATE: assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid TRUNCATE!"); if (Operand.getValueType() == VT) return Operand; // noop truncate assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. 
if (Operand.getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); if (Operand.getOperand(0).getValueType().bitsGT(VT)) return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); return Operand.getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::ABS: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid ABS!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BSWAP: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid BSWAP!"); assert((VT.getScalarSizeInBits() % 16 == 0) && "BSWAP types must be a multiple of 16 bits!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BITREVERSE: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid BITREVERSE!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BITCAST: // Basic sanity checking. assert(VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST between types of different sizes!"); if (VT == Operand.getValueType()) return Operand; // noop conversion. if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: assert(VT.isVector() && !Operand.getValueType().isVector() && (VT.getVectorElementType() == Operand.getValueType() || (VT.getVectorElementType().isInteger() && Operand.getValueType().isInteger() && VT.getVectorElementType().bitsLE(Operand.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && isa(Operand.getOperand(1)) && Operand.getConstantOperandVal(1) == 0 && Operand.getOperand(0).getValueType() == VT) return Operand.getOperand(0); break; case ISD::FNEG: // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if ((getTarget().Options.UnsafeFPMath || Flags.hasNoSignedZeros()) && OpOpcode == ISD::FSUB) return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), Operand.getOperand(0), Flags); if (OpOpcode == ISD::FNEG) // --X -> X return Operand.getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) return getNode(ISD::FABS, DL, VT, Operand.getOperand(0)); break; } SDNode *N; SDVTList VTs = getVTList(VT); SDValue Ops[] = {Operand}; if (VT != MVT::Glue) { // Don't CSE flag producing nodes FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { E->intersectFlagsWith(Flags); return SDValue(E, 0); } N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); createOperands(N, Ops); } InsertNode(N); SDValue V = SDValue(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } static std::pair FoldValue(unsigned Opcode, const APInt &C1, const APInt &C2) { switch (Opcode) { case ISD::ADD: return std::make_pair(C1 + C2, true); case ISD::SUB: return std::make_pair(C1 - C2, true); case ISD::MUL: return std::make_pair(C1 * C2, true); case ISD::AND: return std::make_pair(C1 & C2, true); case ISD::OR: return std::make_pair(C1 | C2, true); case ISD::XOR: return std::make_pair(C1 ^ C2, true); case ISD::SHL: return std::make_pair(C1 << C2, true); case ISD::SRL: return 
std::make_pair(C1.lshr(C2), true); case ISD::SRA: return std::make_pair(C1.ashr(C2), true); case ISD::ROTL: return std::make_pair(C1.rotl(C2), true); case ISD::ROTR: return std::make_pair(C1.rotr(C2), true); case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true); case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true); case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true); case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true); case ISD::UDIV: if (!C2.getBoolValue()) break; return std::make_pair(C1.udiv(C2), true); case ISD::UREM: if (!C2.getBoolValue()) break; return std::make_pair(C1.urem(C2), true); case ISD::SDIV: if (!C2.getBoolValue()) break; return std::make_pair(C1.sdiv(C2), true); case ISD::SREM: if (!C2.getBoolValue()) break; return std::make_pair(C1.srem(C2), true); } return std::make_pair(APInt(1, 0), false); } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, const ConstantSDNode *Cst1, const ConstantSDNode *Cst2) { if (Cst1->isOpaque() || Cst2->isOpaque()) return SDValue(); std::pair Folded = FoldValue(Opcode, Cst1->getAPIntValue(), Cst2->getAPIntValue()); if (!Folded.second) return SDValue(); return getConstant(Folded.first, DL, VT); } SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, const GlobalAddressSDNode *GA, const SDNode *N2) { if (GA->getOpcode() != ISD::GlobalAddress) return SDValue(); if (!TLI->isOffsetFoldingLegal(GA)) return SDValue(); const ConstantSDNode *Cst2 = dyn_cast(N2); if (!Cst2) return SDValue(); int64_t Offset = Cst2->getSExtValue(); switch (Opcode) { case ISD::ADD: break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } return getGlobalAddress(GA->getGlobal(), SDLoc(Cst2), VT, GA->getOffset() + uint64_t(Offset)); } bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef Ops) { switch (Opcode) { case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: { // If a divisor is zero/undef or any element of a divisor vector is // zero/undef, the whole op is undef. assert(Ops.size() == 2 && "Div/rem should have 2 operands"); SDValue Divisor = Ops[1]; if (Divisor.isUndef() || isNullConstant(Divisor)) return true; return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) && llvm::any_of(Divisor->op_values(), [](SDValue V) { return V.isUndef() || isNullConstant(V); }); // TODO: Handle signed overflow. } // TODO: Handle oversized shifts. default: return false; } } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, SDNode *Cst1, SDNode *Cst2) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)})) return getUNDEF(VT); // Handle the case of two scalars. if (const ConstantSDNode *Scalar1 = dyn_cast(Cst1)) { if (const ConstantSDNode *Scalar2 = dyn_cast(Cst2)) { SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2); assert((!Folded || !VT.isVector()) && "Can't fold vectors ops with scalar operands"); return Folded; } } // fold (add Sym, c) -> Sym+c if (GlobalAddressSDNode *GA = dyn_cast(Cst1)) return FoldSymbolOffset(Opcode, VT, GA, Cst2); if (TLI->isCommutativeBinOp(Opcode)) if (GlobalAddressSDNode *GA = dyn_cast(Cst2)) return FoldSymbolOffset(Opcode, VT, GA, Cst1); // For vectors extract each constant element into Inputs so we can constant // fold them individually. 
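// Illustrative aside (standalone sketch, not from the patched file): the
// per-element folding described in the comment above, modelled on plain
// std::vector lanes.  foldLanes is an ad-hoc helper, not an LLVM API: fold
// lane-by-lane and rebuild the vector from the scalar results.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> foldLanes(const std::vector<uint32_t> &A,
                                       const std::vector<uint32_t> &B,
                                       uint32_t (*Fold)(uint32_t, uint32_t)) {
  assert(A.size() == B.size() && "Out of sync!");
  std::vector<uint32_t> Out;
  for (size_t I = 0, E = A.size(); I != E; ++I)
    Out.push_back(Fold(A[I], B[I]));      // one scalar fold per lane
  return Out;
}

int main() {
  std::vector<uint32_t> R =
      foldLanes({1, 2, 3, 4}, {10, 20, 30, 40},
                [](uint32_t X, uint32_t Y) { return X + Y; });
  assert((R == std::vector<uint32_t>{11, 22, 33, 44}));
  return 0;
}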
BuildVectorSDNode *BV1 = dyn_cast(Cst1); BuildVectorSDNode *BV2 = dyn_cast(Cst2); if (!BV1 || !BV2) return SDValue(); assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!"); EVT SVT = VT.getScalarType(); EVT LegalSVT = SVT; if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) { LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(SVT)) return SDValue(); } SmallVector Outputs; for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) { SDValue V1 = BV1->getOperand(I); SDValue V2 = BV2->getOperand(I); if (SVT.isInteger()) { if (V1->getValueType(0).bitsGT(SVT)) V1 = getNode(ISD::TRUNCATE, DL, SVT, V1); if (V2->getValueType(0).bitsGT(SVT)) V2 = getNode(ISD::TRUNCATE, DL, SVT, V2); } if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT) return SDValue(); // Fold one vector element. SDValue ScalarResult = getNode(Opcode, DL, SVT, V1, V2); if (LegalSVT != SVT) ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); // Scalar folding only succeeded if the result is a constant or UNDEF. if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant && ScalarResult.getOpcode() != ISD::ConstantFP) return SDValue(); Outputs.push_back(ScalarResult); } assert(VT.getVectorNumElements() == Outputs.size() && "Vector size mismatch!"); // We may have a vector type but a scalar result. Create a splat. Outputs.resize(VT.getVectorNumElements(), Outputs.back()); // Build a big vector out of the scalar elements we generated. return getBuildVector(VT, SDLoc(), Outputs); } // TODO: Merge with FoldConstantArithmetic SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef Ops, const SDNodeFlags Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); if (isUndef(Opcode, Ops)) return getUNDEF(VT); // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? if (!VT.isVector()) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); auto IsScalarOrSameVectorSize = [&](const SDValue &Op) { return !Op.getValueType().isVector() || Op.getValueType().getVectorNumElements() == NumElts; }; auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) { BuildVectorSDNode *BV = dyn_cast(Op); return (Op.isUndef()) || (Op.getOpcode() == ISD::CONDCODE) || (BV && BV->isConstant()); }; // All operands must be vector types with the same number of elements as // the result type and must be either UNDEF or a build vector of constant // or UNDEF scalars. if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) || !llvm::all_of(Ops, IsScalarOrSameVectorSize)) return SDValue(); // If we are comparing vectors, then the result needs to be a i1 boolean // that is then sign-extended back to the legal result type. EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType()); // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. EVT LegalSVT = VT.getScalarType(); if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) { LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(VT.getScalarType())) return SDValue(); } // Constant fold each scalar lane separately. 
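// Illustrative aside (standalone sketch, not from the patched file): folding a
// single lane can fail -- FoldValue above refuses to fold UDIV/UREM/SDIV/SREM
// by zero -- and a failed lane abandons the whole fold.  A scalar analogue of
// that "maybe folds" shape using std::optional (foldUDiv is an ad-hoc name,
// not an LLVM API):
#include <cassert>
#include <cstdint>
#include <optional>

static std::optional<uint32_t> foldUDiv(uint32_t C1, uint32_t C2) {
  if (C2 == 0)
    return std::nullopt;       // leave the node alone; it may become undef later
  return C1 / C2;
}

int main() {
  assert(foldUDiv(20, 5).value() == 4);
  assert(!foldUDiv(20, 0).has_value());
  return 0;
}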
SmallVector ScalarResults; for (unsigned i = 0; i != NumElts; i++) { SmallVector ScalarOps; for (SDValue Op : Ops) { EVT InSVT = Op.getValueType().getScalarType(); BuildVectorSDNode *InBV = dyn_cast(Op); if (!InBV) { // We've checked that this is UNDEF or a constant of some kind. if (Op.isUndef()) ScalarOps.push_back(getUNDEF(InSVT)); else ScalarOps.push_back(Op); continue; } SDValue ScalarOp = InBV->getOperand(i); EVT ScalarVT = ScalarOp.getValueType(); // Build vector (integer) scalar operands may need implicit // truncation - do this before constant folding. if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp); ScalarOps.push_back(ScalarOp); } // Constant fold the scalar operands. SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); // Legalize the (integer) scalar constant if necessary. if (LegalSVT != SVT) ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); // Scalar folding only succeeded if the result is a constant or UNDEF. if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant && ScalarResult.getOpcode() != ISD::ConstantFP) return SDValue(); ScalarResults.push_back(ScalarResult); } SDValue V = getBuildVector(VT, DL, ScalarResults); NewSDValueDbgMsg(V, "New node fold constant vector: ", this); return V; } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, const SDNodeFlags Flags) { ConstantSDNode *N1C = dyn_cast(N1); ConstantSDNode *N2C = dyn_cast(N2); ConstantFPSDNode *N1CFP = dyn_cast(N1); ConstantFPSDNode *N2CFP = dyn_cast(N2); // Canonicalize constant to RHS if commutative. if (TLI->isCommutativeBinOp(Opcode)) { if (N1C && !N2C) { std::swap(N1C, N2C); std::swap(N1, N2); } else if (N1CFP && !N2CFP) { std::swap(N1CFP, N2CFP); std::swap(N1, N2); } } switch (Opcode) { default: break; case ISD::TokenFactor: assert(VT == MVT::Other && N1.getValueType() == MVT::Other && N2.getValueType() == MVT::Other && "Invalid token factor!"); // Fold trivial token factors. if (N1.getOpcode() == ISD::EntryToken) return N2; if (N2.getOpcode() == ISD::EntryToken) return N1; if (N1 == N2) return N1; break; case ISD::CONCAT_VECTORS: { // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. SDValue Ops[] = {N1, N2}; if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; } case ISD::AND: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's // worth handling here. if (N2C && N2C->isNullValue()) return N2; if (N2C && N2C->isAllOnesValue()) // X & -1 -> X return N1; break; case ISD::OR: case ISD::XOR: case ISD::ADD: case ISD::SUB: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so // it's worth handling here. 
if (N2C && N2C->isNullValue()) return N1; break; case ISD::UDIV: case ISD::UREM: case ISD::MULHU: case ISD::MULHS: case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); break; case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match. assert(N1.getValueType() == VT && N1.getValueType().isFloatingPoint() && N2.getValueType().isFloatingPoint() && "Invalid FCOPYSIGN!"); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: assert(VT == N1.getValueType() && "Shift operators return type must be the same as their first arg"); assert(VT.isInteger() && N2.getValueType().isInteger() && "Shifts only work on integers"); assert((!VT.isVector() || VT == N2.getValueType()) && "Vector shift amounts must be in the same as their first arg"); // Verify that the shift amount VT is bit enough to hold valid shift // amounts. This catches things like trying to shift an i1024 value by an // i8, which is easy to fall into in generic code that uses // TLI.getShiftAmount(). assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) && "Invalid use of small shift amount with oversized value!"); // Always fold shifts of i1 values so the code generator doesn't need to // handle them. Since we know the size of the shift has to be less than the // size of the value, the shift/rotate count is guaranteed to be zero. if (VT == MVT::i1) return N1; if (N2C && N2C->isNullValue()) return N1; break; case ISD::FP_ROUND_INREG: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg round!"); assert(VT.isFloatingPoint() && EVT.isFloatingPoint() && "Cannot FP_ROUND_INREG integer types"); assert(EVT.isVector() == VT.isVector() && "FP_ROUND_INREG type should be vector iff the operand " "type is vector!"); assert((!EVT.isVector() || EVT.getVectorNumElements() == VT.getVectorNumElements()) && "Vector element counts must match in FP_ROUND_INREG"); assert(EVT.bitsLE(VT) && "Not rounding down!"); (void)EVT; if (cast(N2)->getVT() == VT) return N1; // Not actually rounding. break; } case ISD::FP_ROUND: assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && VT.bitsLE(N1.getValueType()) && N2C && (N2C->getZExtValue() == 0 || N2C->getZExtValue() == 1) && "Invalid FP_ROUND!"); if (N1.getValueType() == VT) return N1; // noop conversion. break; case ISD::AssertSext: case ISD::AssertZext: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg extend!"); assert(VT.isInteger() && EVT.isInteger() && "Cannot *_EXTEND_INREG FP types"); assert(!EVT.isVector() && "AssertSExt/AssertZExt type should be the vector element type " "rather than the vector type!"); assert(EVT.bitsLE(VT) && "Not extending!"); if (VT == EVT) return N1; // noop assertion. 
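// Illustrative aside (standalone sketch, not from the patched file): the
// shift-amount assertion earlier in this operator switch requires the shift
// amount type to have at least ceil(log2(value width)) bits, e.g. 6 bits to
// express every valid shift of an i64.  log2Ceil is an ad-hoc helper mirroring
// that check, not an LLVM API.
#include <cassert>

static unsigned log2Ceil(unsigned Value) {
  assert(Value != 0);
  unsigned Bits = 0;
  while ((1ull << Bits) < Value)   // smallest Bits with 2^Bits >= Value
    ++Bits;
  return Bits;
}

int main() {
  assert(log2Ceil(64) == 6);       // i64 shift amounts need >= 6 bits
  assert(log2Ceil(32) == 5);
  assert(log2Ceil(1) == 0);
  return 0;
}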
break; } case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(N2)->getVT(); assert(VT == N1.getValueType() && "Not an inreg extend!"); assert(VT.isInteger() && EVT.isInteger() && "Cannot *_EXTEND_INREG FP types"); assert(EVT.isVector() == VT.isVector() && "SIGN_EXTEND_INREG type should be vector iff the operand " "type is vector!"); assert((!EVT.isVector() || EVT.getVectorNumElements() == VT.getVectorNumElements()) && "Vector element counts must match in SIGN_EXTEND_INREG"); assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) { unsigned FromBits = EVT.getScalarSizeInBits(); Val <<= Val.getBitWidth() - FromBits; Val.ashrInPlace(Val.getBitWidth() - FromBits); return getConstant(Val, DL, ConstantVT); }; if (N1C) { const APInt &Val = N1C->getAPIntValue(); return SignExtendInReg(Val, VT); } if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { SmallVector Ops; llvm::EVT OpVT = N1.getOperand(0).getValueType(); for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); if (Op.isUndef()) { Ops.push_back(getUNDEF(OpVT)); continue; } ConstantSDNode *C = cast(Op); APInt Val = C->getAPIntValue(); Ops.push_back(SignExtendInReg(Val, OpVT)); } return getBuildVector(VT, DL, Ops); } break; } case ISD::EXTRACT_VECTOR_ELT: assert(VT.getSizeInBits() >= N1.getValueType().getScalarSizeInBits() && "The result of EXTRACT_VECTOR_ELT must be at least as wide as the \ element type of the vector."); // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF. if (N1.isUndef()) return getUNDEF(VT); // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements())) return getUNDEF(VT); // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is // expanding copies of large vectors from registers. if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) { unsigned Factor = N1.getOperand(0).getValueType().getVectorNumElements(); return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(N2C->getZExtValue() / Factor), getConstant(N2C->getZExtValue() % Factor, DL, N2.getValueType())); } // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is // expanding large vector constants. if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) { SDValue Elt = N1.getOperand(N2C->getZExtValue()); if (VT != Elt.getValueType()) // If the vector element type is not legal, the BUILD_VECTOR operands // are promoted and implicitly truncated, and the result implicitly // extended. Make that explicit here. Elt = getAnyExtOrTrunc(Elt, DL, VT); return Elt; } // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector // operations are lowered to scalars. if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) { // If the indices are the same, return the inserted element else // if the indices are known different, extract the element from // the original vector. SDValue N1Op2 = N1.getOperand(2); ConstantSDNode *N1Op2C = dyn_cast(N1Op2); if (N1Op2C && N2C) { if (N1Op2C->getZExtValue() == N2C->getZExtValue()) { if (VT == N1.getOperand(1).getValueType()) return N1.getOperand(1); else return getSExtOrTrunc(N1.getOperand(1), DL, VT); } return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2); } } // EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed // when vector types are scalarized and v1iX is legal. 
// vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx) if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getValueType().getVectorNumElements() == 1) { return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N1.getOperand(1)); } break; case ISD::EXTRACT_ELEMENT: assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!"); assert(!N1.getValueType().isVector() && !VT.isVector() && (N1.getValueType().isInteger() == VT.isInteger()) && N1.getValueType() != VT && "Wrong types for EXTRACT_ELEMENT!"); // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding // 64-bit integers into 32-bit parts. Instead of building the extract of // the BUILD_PAIR, only to have legalize rip it apart, just do it now. if (N1.getOpcode() == ISD::BUILD_PAIR) return N1.getOperand(N2C->getZExtValue()); // EXTRACT_ELEMENT of a constant int is also very common. if (N1C) { unsigned ElementSize = VT.getSizeInBits(); unsigned Shift = ElementSize * N2C->getZExtValue(); APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift); return getConstant(ShiftedVal.trunc(ElementSize), DL, VT); } break; case ISD::EXTRACT_SUBVECTOR: if (VT.isSimple() && N1.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && "Extract subvector VTs must be a vectors!"); assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() && "Extract subvector VTs must have the same element type!"); assert(VT.getSimpleVT() <= N1.getSimpleValueType() && "Extract subvector must be from larger vector to smaller vector!"); if (N2C) { assert((VT.getVectorNumElements() + N2C->getZExtValue() <= N1.getValueType().getVectorNumElements()) && "Extract subvector overflow!"); } // Trivial extraction. if (VT.getSimpleVT() == N1.getSimpleValueType()) return N1; // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF. if (N1.isUndef()) return getUNDEF(VT); // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of // the concat have the same type as the extract. if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0 && VT == N1.getOperand(0).getValueType()) { unsigned Factor = VT.getVectorNumElements(); return N1.getOperand(N2C->getZExtValue() / Factor); } // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created // during shuffle legalization. if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) && VT == N1.getOperand(1).getValueType()) return N1.getOperand(1); } break; } // Perform trivial constant folding. if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode())) return SV; // Constant fold FP operations. 
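  // Together with the FoldConstantArithmetic call above, the folds below mean
  // that getNode() on two constant operands normally never creates a binary
  // operator node at all. A minimal sketch (assuming a SelectionDAG &DAG and
  // an SDLoc DL are in scope):
  //
  //   SDValue A = DAG.getConstant(2, DL, MVT::i32);
  //   SDValue B = DAG.getConstant(3, DL, MVT::i32);
  //   SDValue S = DAG.getNode(ISD::ADD, DL, MVT::i32, A, B);
  //   // S is simply the constant 5; no ADD node is built.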
bool HasFPExceptions = TLI->hasFloatingPointExceptions(); if (N1CFP) { if (N2CFP) { APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF(); APFloat::opStatus s; switch (Opcode) { case ISD::FADD: s = V1.add(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s != APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); break; case ISD::FSUB: s = V1.subtract(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); break; case ISD::FMUL: s = V1.multiply(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); break; case ISD::FDIV: s = V1.divide(V2, APFloat::rmNearestTiesToEven); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, DL, VT); } break; case ISD::FREM : s = V1.mod(V2); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, DL, VT); } break; case ISD::FCOPYSIGN: V1.copySign(V2); return getConstantFP(V1, DL, VT); default: break; } } if (Opcode == ISD::FP_ROUND) { APFloat V = N1CFP->getValueAPF(); // make copy bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, DL, VT); } } // Any FP binop with an undef operand is folded to NaN. This matches the // behavior of the IR optimizer. switch (Opcode) { case ISD::FADD: case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: if (N1.isUndef() || N2.isUndef()) return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT); } // Canonicalize an UNDEF to the RHS, even over a constant. if (N1.isUndef()) { if (TLI->isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { case ISD::FP_ROUND_INREG: case ISD::SIGN_EXTEND_INREG: case ISD::SUB: return getUNDEF(VT); // fold op(undef, arg2) -> undef case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: case ISD::SRA: case ISD::SRL: case ISD::SHL: return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0 } } } // Fold a bunch of operators when the RHS is undef. if (N2.isUndef()) { switch (Opcode) { case ISD::XOR: if (N1.isUndef()) // Handle undef ^ undef -> 0 special case. This is a common // idiom (misuse). return getConstant(0, DL, VT); LLVM_FALLTHROUGH; case ISD::ADD: case ISD::ADDC: case ISD::ADDE: case ISD::SUB: case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: case ISD::SRA: case ISD::SRL: case ISD::SHL: return getUNDEF(VT); // fold op(arg1, undef) -> undef case ISD::MUL: case ISD::AND: return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0 case ISD::OR: return getAllOnesConstant(DL, VT); } } // Memoize this node if possible. 
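  // For non-glue results, two getNode() calls with the same opcode, value
  // types and operands are CSE'd to the very same node by the FoldingSet
  // lookup below. A minimal sketch (assuming non-constant i32 values X and Y,
  // a SelectionDAG &DAG and an SDLoc DL are in scope):
  //
  //   SDValue A = DAG.getNode(ISD::ADD, DL, MVT::i32, X, Y);
  //   SDValue B = DAG.getNode(ISD::ADD, DL, MVT::i32, X, Y);
  //   assert(A == B && "same opcode/types/operands give the same node");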
SDNode *N; SDVTList VTs = getVTList(VT); SDValue Ops[] = {N1, N2}; if (VT != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { E->intersectFlagsWith(Flags); return SDValue(E, 0); } N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); createOperands(N, Ops); } InsertNode(N); SDValue V = SDValue(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, const SDNodeFlags Flags) { // Perform various simplifications. switch (Opcode) { case ISD::FMA: { assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); assert(N1.getValueType() == VT && N2.getValueType() == VT && N3.getValueType() == VT && "FMA types must match!"); ConstantFPSDNode *N1CFP = dyn_cast(N1); ConstantFPSDNode *N2CFP = dyn_cast(N2); ConstantFPSDNode *N3CFP = dyn_cast(N3); if (N1CFP && N2CFP && N3CFP) { APFloat V1 = N1CFP->getValueAPF(); const APFloat &V2 = N2CFP->getValueAPF(); const APFloat &V3 = N3CFP->getValueAPF(); APFloat::opStatus s = V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven); if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp) return getConstantFP(V1, DL, VT); } break; } case ISD::CONCAT_VECTORS: { // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. SDValue Ops[] = {N1, N2, N3}; if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; } case ISD::SETCC: { // Use FoldSetCC to simplify SETCC's. if (SDValue V = FoldSetCC(VT, N1, N2, cast(N3)->get(), DL)) return V; // Vector constant folding. SDValue Ops[] = {N1, N2, N3}; if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) { NewSDValueDbgMsg(V, "New node vector constant folding: ", this); return V; } break; } case ISD::SELECT: if (ConstantSDNode *N1C = dyn_cast(N1)) { if (N1C->getZExtValue()) return N2; // select true, X, Y -> X return N3; // select false, X, Y -> Y } if (N2 == N3) return N2; // select C, X, X -> X break; case ISD::VECTOR_SHUFFLE: llvm_unreachable("should use getVectorShuffle constructor!"); case ISD::INSERT_VECTOR_ELT: { ConstantSDNode *N3C = dyn_cast(N3); // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements()) return getUNDEF(VT); break; } case ISD::INSERT_SUBVECTOR: { SDValue Index = N3; if (VT.isSimple() && N1.getValueType().isSimple() && N2.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && N2.getValueType().isVector() && "Insert subvector VTs must be a vectors"); assert(VT == N1.getValueType() && "Dest and insert subvector source types must match!"); assert(N2.getSimpleValueType() <= N1.getSimpleValueType() && "Insert subvector must be from smaller vector to larger vector!"); if (isa(Index)) { assert((N2.getValueType().getVectorNumElements() + cast(Index)->getZExtValue() <= VT.getVectorNumElements()) && "Insert subvector overflow!"); } // Trivial insertion. if (VT.getSimpleVT() == N2.getSimpleValueType()) return N2; } break; } case ISD::BITCAST: // Fold bit_convert nodes from a type to themselves. if (N1.getValueType() == VT) return N1; break; } // Memoize node if it doesn't produce a flag. 
SDNode *N; SDVTList VTs = getVTList(VT); SDValue Ops[] = {N1, N2, N3}; if (VT != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { E->intersectFlagsWith(Flags); return SDValue(E, 0); } N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); createOperands(N, Ops); } InsertNode(N); SDValue V = SDValue(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; return getNode(Opcode, DL, VT, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; return getNode(Opcode, DL, VT, Ops); } /// getStackArgumentTokenFactor - Compute a TokenFactor to force all /// the incoming stack arguments to be loaded from the stack. SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) { SmallVector ArgChains; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each stack argument. for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(), UE = getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) ArgChains.push_back(SDValue(L, 1)); // Build a tokenfactor for all the chains. return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } /// getMemsetValue - Vectorized representation of the memset value /// operand. static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(!Value.isUndef()); unsigned NumBits = VT.getScalarSizeInBits(); if (ConstantSDNode *C = dyn_cast(Value)) { assert(C->getAPIntValue().getBitWidth() == 8); APInt Val = APInt::getSplat(NumBits, C->getAPIntValue()); if (VT.isInteger()) return DAG.getConstant(Val, dl, VT); return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl, VT); } assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?"); EVT IntVT = VT.getScalarType(); if (!IntVT.isInteger()) IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits()); Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value); if (NumBits > 8) { // Use a multiplication with 0x010101... to extend the input to the // required length. APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); Value = DAG.getNode(ISD::MUL, dl, IntVT, Value, DAG.getConstant(Magic, dl, IntVT)); } if (VT != Value.getValueType() && !VT.isInteger()) Value = DAG.getBitcast(VT.getScalarType(), Value); if (VT != Value.getValueType()) Value = DAG.getSplatBuildVector(VT, dl, Value); return Value; } /// getMemsetStringVal - Similar to getMemsetValue. Except this is only /// used when a memcpy is turned into a memset when the source is a constant /// string ptr. static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, const TargetLowering &TLI, const ConstantDataArraySlice &Slice) { // Handle vector with all elements zero. 
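  // Both branches of this helper produce a plain constant: the nullptr case
  // below materializes zero, otherwise the loop further down packs the source
  // bytes into a single immediate. A worked instance of the little-endian
  // packing (illustrative only), for VT == MVT::i32 and a slice over "abcd":
  //
  //   Val = 0x61 | (0x62 << 8) | (0x63 << 16) | (0x64 << 24) == 0x64636261
  //
  // so a memcpy from that literal can be rewritten as one i32 immediate
  // store, provided shouldConvertConstantLoadToIntImm() agrees.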
if (Slice.Array == nullptr) { if (VT.isInteger()) return DAG.getConstant(0, dl, VT); else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) return DAG.getConstantFP(0.0, dl, VT); else if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getConstant(0, dl, EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts))); } else llvm_unreachable("Expected type!"); } assert(!VT.isVector() && "Can't handle vector type here!"); unsigned NumVTBits = VT.getSizeInBits(); unsigned NumVTBytes = NumVTBits / 8; unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length)); APInt Val(NumVTBits, 0); if (DAG.getDataLayout().isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Slice[i] << i*8; } else { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8; } // If the "cost" of materializing the integer immediate is less than the cost // of a load, then it is cost effective to turn the load into the immediate. Type *Ty = VT.getTypeForEVT(*DAG.getContext()); if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty)) return DAG.getConstant(Val, dl, VT); return SDValue(nullptr, 0); } SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL) { EVT VT = Base.getValueType(); return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT)); } /// Returns true if memcpy source is constant data. static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) { uint64_t SrcDelta = 0; GlobalAddressSDNode *G = nullptr; if (Src.getOpcode() == ISD::GlobalAddress) G = cast(Src); else if (Src.getOpcode() == ISD::ADD && Src.getOperand(0).getOpcode() == ISD::GlobalAddress && Src.getOperand(1).getOpcode() == ISD::Constant) { G = cast(Src.getOperand(0)); SrcDelta = cast(Src.getOperand(1))->getZExtValue(); } if (!G) return false; return getConstantDataArrayInfo(G->getGlobal(), Slice, 8, SrcDelta + G->getOffset()); } /// Determines the optimal series of memory ops to replace the memset / memcpy. /// Return true if the number of memory ops is below the threshold (Limit). /// It returns the types of the sequence of memory ops to perform /// memset / memcpy by reference. static bool FindOptimalMemOpLowering(std::vector &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, bool AllowOverlap, unsigned DstAS, unsigned SrcAS, SelectionDAG &DAG, const TargetLowering &TLI) { assert((SrcAlign == 0 || SrcAlign >= DstAlign) && "Expecting memcpy / memset source to meet alignment requirement!"); // If 'SrcAlign' is zero, that means the memory operation does not need to // load the value, i.e. memset or memcpy from constant string. Otherwise, // it's the inferred alignment of the source. 'DstAlign', on the other hand, // is the specified alignment of the memory operation. If it is zero, that // means it's possible to change the alignment of the destination. // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does // not need to be loaded. EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, IsMemset, ZeroMemset, MemcpyStrSrc, DAG.getMachineFunction()); if (VT == MVT::Other) { // Use the largest integer type whose alignment constraints are satisfied. // We only need to check DstAlign here as SrcAlign is always greater or // equal to DstAlign (or zero). 
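    // A concrete walk through the fallback below (illustrative; assumes a
    // destination alignment of 4 on a target that rejects misaligned i64
    // accesses):
    //
    //   start at MVT::i64 (8 bytes): 4 < 8 and misaligned i64 disallowed,
    //                                so step down
    //   try      MVT::i32 (4 bytes): 4 < 4 is false, loop stops, VT = MVT::i32
    //
    // i.e. the loop settles on the widest integer type whose natural size does
    // not exceed the known destination alignment, unless the target reports
    // that misaligned accesses of a wider type are acceptable.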
VT = MVT::i64; while (DstAlign && DstAlign < VT.getSizeInBits() / 8 && !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); assert(VT.isInteger()); // Find the largest legal integer type. MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); // If the type we've chosen is larger than the largest legal integer type // then use that instead. if (VT.bitsGT(LVT)) VT = LVT; } unsigned NumMemOps = 0; while (Size != 0) { unsigned VTSize = VT.getSizeInBits() / 8; while (VTSize > Size) { // For now, only use non-vector load / store's for the left-over pieces. EVT NewVT = VT; unsigned NewVTSize; bool Found = false; if (VT.isVector() || VT.isFloatingPoint()) { NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32; if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) && TLI.isSafeMemOpType(NewVT.getSimpleVT())) Found = true; else if (NewVT == MVT::i64 && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) && TLI.isSafeMemOpType(MVT::f64)) { // i64 is usually not legal on 32-bit targets, but f64 may be. NewVT = MVT::f64; Found = true; } } if (!Found) { do { NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1); if (NewVT == MVT::i8) break; } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT())); } NewVTSize = NewVT.getSizeInBits() / 8; // If the new VT cannot cover all of the remaining bits, then consider // issuing a (or a pair of) unaligned and overlapping load / store. // FIXME: Only does this for 64-bit or more since we don't have proper // cost model for unaligned load / store. bool Fast; if (NumMemOps && AllowOverlap && VTSize >= 8 && NewVTSize < Size && TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast) VTSize = Size; else { VT = NewVT; VTSize = NewVTSize; } } if (++NumMemOps > Limit) return false; MemOps.push_back(VT); Size -= VTSize; } return true; } static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { // On Darwin, -Os means optimize for size without hurting performance, so // only really optimize for size when -Oz (MinSize) is used. if (MF.getTarget().getTargetTriple().isOSDarwin()) return MF.getFunction().optForMinSize(); return MF.getFunction().optForSize(); } static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SmallVector &OutChains, unsigned From, unsigned To, SmallVector &OutLoadChains, SmallVector &OutStoreChains) { assert(OutLoadChains.size() && "Missing loads in memcpy inlining"); assert(OutStoreChains.size() && "Missing stores in memcpy inlining"); SmallVector GluedLoadChains; for (unsigned i = From; i < To; ++i) { OutChains.push_back(OutLoadChains[i]); GluedLoadChains.push_back(OutLoadChains[i]); } // Chain for all loads. SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, GluedLoadChains); for (unsigned i = From; i < To; ++i) { StoreSDNode *ST = dyn_cast(OutStoreChains[i]); SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(), ST->getBasePtr(), ST->getMemoryVT(), ST->getMemOperand()); OutChains.push_back(NewStore); } } static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Turn a memcpy of undef to nop. if (Src.isUndef()) return Chain; // Expand memcpy to a series of load and store ops if the size operand falls // below a certain threshold. 
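  // As a rough illustration (the exact sequence depends on the target's
  // getOptimalMemOpType and on the store limits queried below): a 16-byte
  // memcpy between 8-byte-aligned pointers on a 64-bit target typically yields
  // MemOps == {MVT::i64, MVT::i64}, i.e. two load/store pairs morally
  // equivalent to
  //
  //   SDValue V0 = DAG.getLoad(MVT::i64, dl, Chain, Src, SrcPtrInfo);
  //   SDValue S0 = DAG.getStore(Chain, dl, V0, Dst, DstPtrInfo);
  //   SDValue V1 = DAG.getLoad(MVT::i64, dl, Chain,
  //                            DAG.getMemBasePlusOffset(Src, 8, dl),
  //                            SrcPtrInfo.getWithOffset(8));
  //   SDValue S1 = DAG.getStore(Chain, dl, V1,
  //                             DAG.getMemBasePlusOffset(Dst, 8, dl),
  //                             DstPtrInfo.getWithOffset(8));
  //
  // with the store chains joined by a TokenFactor at the end of this function.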
// TODO: In the AlwaysInline case, if the size is big then generate a loop // rather than maybe a humongous number of loads and stores. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); LLVMContext &C = *DAG.getContext(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; ConstantDataArraySlice Slice; bool CopyFromConstant = isMemSrcFromConstant(Src, Slice); bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), (isZeroConstant ? 0 : SrcAlign), false, false, CopyFromConstant, true, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(C); unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty); // Don't promote to an alignment that would require dynamic stack // realignment. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) while (NewAlign > Align && DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign /= 2; if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign) MFI.setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } MachineMemOperand::Flags MMOFlags = isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; SmallVector OutLoadChains; SmallVector OutStoreChains; SmallVector OutChains; unsigned NumMemOps = MemOps.size(); uint64_t SrcOff = 0, DstOff = 0; for (unsigned i = 0; i != NumMemOps; ++i) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value, Store; if (VTSize > Size) { // Issuing an unaligned load / store pair that overlaps with the previous // pair. Adjust the offset accordingly. assert(i == NumMemOps-1 && i != 0); SrcOff -= VTSize - Size; DstOff -= VTSize - Size; } if (CopyFromConstant && (isZeroConstant || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single // instruction. It would require a load from a constantpool first. // We only handle zero vectors here. // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. ConstantDataArraySlice SubSlice; if (SrcOff < Slice.Length) { SubSlice = Slice; SubSlice.move(SrcOff); } else { // This is an out-of-bounds access and hence UB. Pretend we read zero. SubSlice.Array = nullptr; SubSlice.Offset = 0; SubSlice.Length = VTSize; } Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice); if (Value.getNode()) { Store = DAG.getStore(Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags); OutChains.push_back(Store); } } if (!Store.getNode()) { // The type might not be legal for the target. This should only happen // if the type is smaller than a legal type, as on PPC, so the right // thing to do is generate a LoadExt/StoreTrunc pair. These simplify // to Load/Store if NVT==VT. // FIXME does the case above also need this? 
EVT NVT = TLI.getTypeToTransformTo(C, VT); assert(NVT.bitsGE(VT)); bool isDereferenceable = SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); MachineMemOperand::Flags SrcMMOFlags = MMOFlags; if (isDereferenceable) SrcMMOFlags |= MachineMemOperand::MODereferenceable; Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl), SrcPtrInfo.getWithOffset(SrcOff), VT, MinAlign(SrcAlign, SrcOff), SrcMMOFlags); OutLoadChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags); OutStoreChains.push_back(Store); } SrcOff += VTSize; DstOff += VTSize; Size -= VTSize; } unsigned GluedLdStLimit = MaxLdStGlue == 0 ? TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue; unsigned NumLdStInMemcpy = OutStoreChains.size(); if (NumLdStInMemcpy) { // It may be that memcpy might be converted to memset if it's memcpy // of constants. In such a case, we won't have loads and stores, but // just stores. In the absence of loads, there is nothing to gang up. if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) { // If target does not care, just leave as it. for (unsigned i = 0; i < NumLdStInMemcpy; ++i) { OutChains.push_back(OutLoadChains[i]); OutChains.push_back(OutStoreChains[i]); } } else { // Ld/St less than/equal limit set by target. if (NumLdStInMemcpy <= GluedLdStLimit) { chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0, NumLdStInMemcpy, OutLoadChains, OutStoreChains); } else { unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit; unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit; unsigned GlueIter = 0; for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) { unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit; unsigned IndexTo = NumLdStInMemcpy - GlueIter; chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo, OutLoadChains, OutStoreChains); GlueIter += GluedLdStLimit; } // Residual ld/st. if (RemainingLdStInMemcpy) { chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0, RemainingLdStInMemcpy, OutLoadChains, OutStoreChains); } } } } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Turn a memmove of undef to nop. if (Src.isUndef()) return Chain; // Expand memmove to a series of load and store ops if the size operand falls // below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); LLVMContext &C = *DAG.getContext(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 
0 : Align), SrcAlign, false, false, false, false, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(C); unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign) MFI.setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } MachineMemOperand::Flags MMOFlags = isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; uint64_t SrcOff = 0, DstOff = 0; SmallVector LoadValues; SmallVector LoadChains; SmallVector OutChains; unsigned NumMemOps = MemOps.size(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value; bool isDereferenceable = SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); MachineMemOperand::Flags SrcMMOFlags = MMOFlags; if (isDereferenceable) SrcMMOFlags |= MachineMemOperand::MODereferenceable; Value = DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl), SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; } Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); OutChains.clear(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; SDValue Store; Store = DAG.getStore(Chain, dl, LoadValues[i], DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags); OutChains.push_back(Store); DstOff += VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } /// Lower the call to 'memset' intrinsic function into a series of store /// operations. /// /// \param DAG Selection DAG where lowered code is placed. /// \param dl Link to corresponding IR location. /// \param Chain Control flow dependency. /// \param Dst Pointer to destination memory location. /// \param Src Value of byte to write into the memory. /// \param Size Number of bytes to write. /// \param Align Alignment of the destination in bytes. /// \param isVol True if destination is volatile. /// \param DstPtrInfo IR information on the memory pointer. /// \returns New head in the control flow, if lowering was successful, empty /// SDValue otherwise. /// /// The function tries to replace 'llvm.memset' intrinsic with several store /// operations and value calculation code. This is usually profitable for small /// memory size. static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo) { // Turn a memset of undef to nop. if (Src.isUndef()) return Chain; // Expand memset to a series of load/store ops if the size operand // falls below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; bool IsZeroVal = isa(Src) && cast(Src)->isNullValue(); if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize), Size, (DstAlignCanChange ? 
0 : Align), 0, true, IsZeroVal, false, true, DstPtrInfo.getAddrSpace(), ~0u, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign) MFI.setObjectAlignment(FI->getIndex(), NewAlign); Align = NewAlign; } } SmallVector OutChains; uint64_t DstOff = 0; unsigned NumMemOps = MemOps.size(); // Find the largest store and generate the bit pattern for it. EVT LargestVT = MemOps[0]; for (unsigned i = 1; i < NumMemOps; i++) if (MemOps[i].bitsGT(LargestVT)) LargestVT = MemOps[i]; SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; if (VTSize > Size) { // Issuing an unaligned load / store pair that overlaps with the previous // pair. Adjust the offset accordingly. assert(i == NumMemOps-1 && i != 0); DstOff -= VTSize - Size; } // If this store is smaller than the largest store see whether we can get // the smaller value for free with a truncate. SDValue Value = MemSetValue; if (VT.bitsLT(LargestVT)) { if (!LargestVT.isVector() && !VT.isVector() && TLI.isTruncateFree(LargestVT, VT)) Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue); else Value = getMemsetValue(Src, VT, DAG, dl); } assert(Value.getValueType() == VT && "Value with wrong type."); SDValue Store = DAG.getStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), Align, isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone); OutChains.push_back(Store); DstOff += VT.getSizeInBits() / 8; Size -= VTSize; } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, unsigned AS) { // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all // pointer operands can be losslessly bitcasted to pointers of address space 0 if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) { report_fatal_error("cannot lower memory intrinsic in address space " + Twine(AS)); } } SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memcpy to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memcpy with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),Align, isVol, false, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memcpy with target-specific // code. If the target chooses to do this, this is the next best. if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemcpy( *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // If we really need inline code and the target declined to provide it, // use a (potentially long) sequence of loads and stores. 
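  // A minimal sketch of reaching this cascade from target lowering code
  // (illustrative; Chain, Dst, Src, dl and a SelectionDAG &DAG are assumed to
  // be in scope):
  //
  //   SDValue NewChain = DAG.getMemcpy(
  //       Chain, dl, Dst, Src, DAG.getIntPtrConstant(32, dl),
  //       /*Align=*/8, /*isVol=*/false, /*AlwaysInline=*/false,
  //       /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo());
  //
  // A constant size of 32 lets the inline expansion above fire when it is
  // within the target's limits; otherwise execution falls through to the
  // library call emitted below.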
if (AlwaysInline) { assert(ConstantSize && "AlwaysInline requires a constant size!"); return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, true, DstPtrInfo, SrcPtrInfo); } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc // memcpy is not guaranteed to be safe. libc memcpys aren't required to // respect volatile, so they may do things like read or write memory // beyond the given memory regions. But fixing this isn't easy, and most // people don't care. // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), Dst.getValueType().getTypeForEVT(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, unsigned DstAlign, SDValue Src, unsigned SrcAlign, SDValue Size, Type *SizeTy, unsigned ElemSz, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Ty = SizeTy; Entry.Node = Size; Args.push_back(Entry); RTLIB::Libcall LibraryCall = RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz); if (LibraryCall == RTLIB::UNKNOWN_LIBCALL) report_fatal_error("Unsupported element size"); TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(LibraryCall), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(LibraryCall), TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memmove to loads and stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memmove with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, false, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memmove with target-specific // code. If the target chooses to do this, this is the next best. 
if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemmove( *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); // FIXME: If the memmove is volatile, lowering it to plain libc memmove may // not be safe. See memcpy above for more details. // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), Dst.getValueType().getTypeForEVT(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, unsigned DstAlign, SDValue Src, unsigned SrcAlign, SDValue Size, Type *SizeTy, unsigned ElemSz, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Ty = SizeTy; Entry.Node = Size; Args.push_back(Entry); RTLIB::Libcall LibraryCall = RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz); if (LibraryCall == RTLIB::UNKNOWN_LIBCALL) report_fatal_error("Unsupported element size"); TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(LibraryCall), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(LibraryCall), TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool isTailCall, MachinePointerInfo DstPtrInfo) { assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memset to stores first. // For cases within the target-specified limits, this is the best choice. ConstantSDNode *ConstantSize = dyn_cast(Size); if (ConstantSize) { // Memset with size zero? Just return the original chain. if (ConstantSize->isNullValue()) return Chain; SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; } // Then check to see if we should lower the memset with target-specific // code. If the target chooses to do this, this is the next best. if (TSI) { SDValue Result = TSI->EmitTargetCodeForMemset( *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; } checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); // Emit a library call. 
Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = IntPtrTy; Args.push_back(Entry); Entry.Node = Src; Entry.Ty = Src.getValueType().getTypeForEVT(*getContext()); Args.push_back(Entry); Entry.Node = Size; Entry.Ty = IntPtrTy; Args.push_back(Entry); // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), Dst.getValueType().getTypeForEVT(*getContext()), getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, unsigned DstAlign, SDValue Value, SDValue Size, Type *SizeTy, unsigned ElemSz, bool isTailCall, MachinePointerInfo DstPtrInfo) { // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = getDataLayout().getIntPtrType(*getContext()); Entry.Node = Dst; Args.push_back(Entry); Entry.Ty = Type::getInt8Ty(*getContext()); Entry.Node = Value; Args.push_back(Entry); Entry.Ty = SizeTy; Entry.Node = Size; Args.push_back(Entry); RTLIB::Libcall LibraryCall = RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz); if (LibraryCall == RTLIB::UNKNOWN_LIBCALL) report_fatal_error("Unsupported element size"); TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(TLI->getLibcallCallingConv(LibraryCall), Type::getVoidTy(*getContext()), getExternalSymbol(TLI->getLibcallName(LibraryCall), TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO) { FoldingSetNodeID ID; ID.AddInteger(MemVT.getRawBits()); AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void* IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getAtomicCmpSwap( unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, unsigned Alignment, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. 
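  // This entry point is typically reached while lowering an IR cmpxchg; a
  // minimal sketch of a call (illustrative only; Chain, Ptr, Cmp, Swp, dl and
  // a SelectionDAG &DAG with i32 operands are assumed to be in scope):
  //
  //   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i1, MVT::Other);
  //   SDValue CAS = DAG.getAtomicCmpSwap(
  //       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MVT::i32, VTs, Chain, Ptr,
  //       Cmp, Swp, MachinePointerInfo(), /*Alignment=*/4,
  //       AtomicOrdering::SequentiallyConsistent,
  //       AtomicOrdering::SequentiallyConsistent, SyncScope::System);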
auto Flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad | MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment, AAMDNodes(), nullptr, SSID, SuccessOrdering, FailureOrdering); return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO); } SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachineMemOperand *MMO) { assert(Opcode == ISD::ATOMIC_CMP_SWAP || Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); SDValue Ops[] = {Chain, Ptr, Cmp, Swp}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO); } SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, const Value *PtrVal, unsigned Alignment, AtomicOrdering Ordering, SyncScope::ID SSID) { if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); // An atomic store does not load. An atomic load does not store. // (An atomicrmw obviously both loads and stores.) // For now, atomics are considered to be volatile always, and they are // chained as such. // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. auto Flags = MachineMemOperand::MOVolatile; if (Opcode != ISD::ATOMIC_STORE) Flags |= MachineMemOperand::MOLoad; if (Opcode != ISD::ATOMIC_LOAD) Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags, MemVT.getStoreSize(), Alignment, AAMDNodes(), nullptr, SSID, Ordering); return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO); } SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO) { assert((Opcode == ISD::ATOMIC_LOAD_ADD || Opcode == ISD::ATOMIC_LOAD_SUB || Opcode == ISD::ATOMIC_LOAD_AND || Opcode == ISD::ATOMIC_LOAD_CLR || Opcode == ISD::ATOMIC_LOAD_OR || Opcode == ISD::ATOMIC_LOAD_XOR || Opcode == ISD::ATOMIC_LOAD_NAND || Opcode == ISD::ATOMIC_LOAD_MIN || Opcode == ISD::ATOMIC_LOAD_MAX || Opcode == ISD::ATOMIC_LOAD_UMIN || Opcode == ISD::ATOMIC_LOAD_UMAX || Opcode == ISD::ATOMIC_SWAP || Opcode == ISD::ATOMIC_STORE) && "Invalid Atomic Op"); EVT VT = Val.getValueType(); SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr, Val}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO); } SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO) { assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op"); SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr}; return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO); } /// getMergeValues - Create a MERGE_VALUES node from the given operands. 
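///
/// A minimal usage sketch (assuming SDValue results Lo and Hi, a chain Ch and
/// an SDLoc dl are in scope):
/// \code
///   SDValue Parts[] = {Lo, Hi, Ch};
///   SDValue Merged = DAG.getMergeValues(Parts, dl);
///   // Merged has one result per operand, carrying the value types of Lo,
///   // Hi and Ch; a single-element list is returned unchanged.
/// \endcode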
SDValue SelectionDAG::getMergeValues(ArrayRef Ops, const SDLoc &dl) { if (Ops.size() == 1) return Ops[0]; SmallVector VTs; VTs.reserve(Ops.size()); for (unsigned i = 0; i < Ops.size(); ++i) VTs.push_back(Ops[i].getValueType()); return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops); } SDValue SelectionDAG::getMemIntrinsicNode( unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, MachineMemOperand::Flags Flags, unsigned Size) { if (Align == 0) // Ensure that codegen never sees alignment 0 Align = getEVTAlignment(MemVT); if (!Size) Size = MemVT.getStoreSize(); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, Size, Align); return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachineMemOperand *MMO) { assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || Opcode == ISD::LIFETIME_START || Opcode == ISD::LIFETIME_END || ((int)Opcode <= std::numeric_limits::max() && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); // Memoize the node unless it returns a flag. MemIntrinsicSDNode *N; if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(getSyntheticNodeSubclassData( Opcode, dl.getIROrder(), VTList, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } N = newSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { N = newSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, MMO); createOperands(N, Ops); } InsertNode(N); return SDValue(N, 0); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info, SelectionDAG &DAG, SDValue Ptr, int64_t Offset = 0) { // If this is FI+Offset, we can model it. if (const FrameIndexSDNode *FI = dyn_cast(Ptr)) return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI->getIndex(), Offset); // If this is (FI+Offset1)+Offset2, we can model it. if (Ptr.getOpcode() != ISD::ADD || !isa(Ptr.getOperand(1)) || !isa(Ptr.getOperand(0))) return Info; int FI = cast(Ptr.getOperand(0))->getIndex(); return MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FI, Offset + cast(Ptr.getOperand(1))->getSExtValue()); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info, SelectionDAG &DAG, SDValue Ptr, SDValue OffsetOp) { // If the 'Offset' value isn't a constant, we can't handle this. 
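// For instance, a pointer of the form (add FrameIndex<1>, Constant<8>) is
// handled by the overload above, which produces the equivalent of
//
//   MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), /*FI=*/1,
//                                     /*Offset=*/8);
//
// (the frame index and offset here are purely illustrative).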
  if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
    return InferPointerInfo(Info, DAG, Ptr, OffsetNode->getSExtValue());
  if (OffsetOp.isUndef())
    return InferPointerInfo(Info, DAG, Ptr);
  return Info;
}

SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
                              EVT VT, const SDLoc &dl, SDValue Chain,
                              SDValue Ptr, SDValue Offset,
                              MachinePointerInfo PtrInfo, EVT MemVT,
                              unsigned Alignment,
                              MachineMemOperand::Flags MMOFlags,
                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");
  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
    Alignment = getEVTAlignment(MemVT);

  MMOFlags |= MachineMemOperand::MOLoad;
  assert((MMOFlags & MachineMemOperand::MOStore) == 0);
  // If we don't have a PtrInfo, infer the trivial frame index case to simplify
  // clients.
  if (PtrInfo.V.isNull())
    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);

  MachineFunction &MF = getMachineFunction();
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
  return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
}

SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
                              EVT VT, const SDLoc &dl, SDValue Chain,
                              SDValue Ptr, SDValue Offset, EVT MemVT,
                              MachineMemOperand *MMO) {
  if (VT == MemVT) {
    ExtType = ISD::NON_EXTLOAD;
  } else if (ExtType == ISD::NON_EXTLOAD) {
    assert(VT == MemVT && "Non-extending load from different memory type!");
  } else {
    // Extending load.
    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
           "Should only be an extending load, not truncating!");
    assert(VT.isInteger() == MemVT.isInteger() &&
           "Cannot convert from FP to Int or Int -> FP!");
    assert(VT.isVector() == MemVT.isVector() &&
           "Cannot use an ext load to convert to or from a vector!");
    assert((!VT.isVector() ||
            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
           "Cannot use an ext load to change the number of vector elements!");
  }

  bool Indexed = AM != ISD::UNINDEXED;
  assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");

  SDVTList VTs = Indexed ?
getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Offset }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::LOAD, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ExtType, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, unsigned Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, const MDNode *Ranges) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges); } SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, VT, MMO); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo, MemVT, Alignment, MMOFlags, AAInfo); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, MemVT, MMO); } SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM) { LoadSDNode *LD = cast(OrigLoad); assert(LD->getOffset().isUndef() && "Load is already a indexed load!"); // Don't propagate the invariant or dereferenceable flags. 
auto MMOFlags = LD->getMemOperand()->getFlags() & ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl, LD->getChain(), Base, Offset, LD->getPointerInfo(), LD->getMemoryVT(), LD->getAlignment(), MMOFlags, LD->getAAInfo()); } SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, unsigned Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(Val.getValueType()); MMOFlags |= MachineMemOperand::MOStore; assert((MMOFlags & MachineMemOperand::MOLoad) == 0); if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo); return getStore(Chain, dl, Val, Ptr, MMO); } SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachineMemOperand *MMO) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); EVT VT = Val.getValueType(); SDVTList VTs = getVTList(MVT::Other); SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, ISD::UNINDEXED, false, VT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED, false, VT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(SVT); MMOFlags |= MachineMemOperand::MOStore; assert((MMOFlags & MachineMemOperand::MOLoad) == 0); if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo); return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO); } SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, EVT SVT, MachineMemOperand *MMO) { EVT VT = Val.getValueType(); assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (VT == SVT) return getStore(Chain, dl, Val, Ptr, MMO); assert(SVT.getScalarType().bitsLT(VT.getScalarType()) && "Should only be a truncating store, not extending!"); assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!"); assert(VT.isVector() == SVT.isVector() && "Cannot use trunc store to convert to or from a vector!"); assert((!VT.isVector() || VT.getVectorNumElements() == SVT.getVectorNumElements()) && "Cannot use trunc store to change the number of vector elements!"); SDVTList VTs = getVTList(MVT::Other); SDValue 
Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(SVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED, true, SVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM) { StoreSDNode *ST = cast(OrigStore); assert(ST->getOffset().isUndef() && "Store is already a indexed store!"); SDVTList VTs = getVTList(Base.getValueType(), MVT::Other); SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ST->isTruncatingStore(), ST->getMemoryVT(), ST->getMemOperand()); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::LoadExtType ExtTy, bool isExpanding) { SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, ExtTy, isExpanding, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, ExtTy, isExpanding, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, bool IsTruncating, bool IsCompressing) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); EVT VT = Val.getValueType(); SDVTList VTs = getVTList(MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Val }; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, IsTruncating, IsCompressing, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, IsTruncating, IsCompressing, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getMaskedGather(SDVTList 
VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO) { assert(Ops.size() == 6 && "Incompatible number of operands"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, VT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, VT, MMO); createOperands(N, Ops); assert(N->getValue().getValueType() == N->getValueType(0) && "Incompatible type of the PassThru value in MaskedGatherSDNode"); assert(N->getMask().getValueType().getVectorNumElements() == N->getValueType(0).getVectorNumElements() && "Vector width mismatch between mask and data"); assert(N->getIndex().getValueType().getVectorNumElements() == N->getValueType(0).getVectorNumElements() && "Vector width mismatch between index and data"); assert(isa(N->getScale()) && cast(N->getScale())->getAPIntValue().isPowerOf2() && "Scale should be a constant power of 2"); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO) { assert(Ops.size() == 6 && "Incompatible number of operands"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( dl.getIROrder(), VTs, VT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, VT, MMO); createOperands(N, Ops); assert(N->getMask().getValueType().getVectorNumElements() == N->getValue().getValueType().getVectorNumElements() && "Vector width mismatch between mask and data"); assert(N->getIndex().getValueType().getVectorNumElements() == N->getValue().getValueType().getVectorNumElements() && "Vector width mismatch between index and data"); assert(isa(N->getScale()) && cast(N->getScale())->getAPIntValue().isPowerOf2() && "Scale should be a constant power of 2"); CSEMap.InsertNode(N, IP); InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue SV, unsigned Align) { SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) }; return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef Ops) { switch (Ops.size()) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, static_cast(Ops[0])); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } // Copy from an SDUse array into an SDValue array for use with // the regular getNode logic. 
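// Illustrative sketch (not part of the original file): the SDUse overload
// above lets a caller rebuild a node straight from another node's operand
// list, since SDNode::ops() yields SDUse objects rather than SDValues.
//
//   SDValue Re = DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
//                            N->ops());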
SmallVector NewOps(Ops.begin(), Ops.end()); return getNode(Opcode, DL, VT, NewOps); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef Ops, const SDNodeFlags Flags) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, Ops[0], Flags); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } switch (Opcode) { default: break; case ISD::CONCAT_VECTORS: // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; case ISD::SELECT_CC: assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && "LHS and RHS of condition must have same type!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "True and False arms of SelectCC must have same type!"); assert(Ops[2].getValueType() == VT && "select_cc node must be of same type as true and false value!"); break; case ISD::BR_CC: assert(NumOps == 5 && "BR_CC takes 5 operands!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "LHS/RHS of comparison should match types!"); break; } // Memoize nodes. SDNode *N; SDVTList VTs = getVTList(VT); if (VT != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) return SDValue(E, 0); N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); createOperands(N, Ops); } InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, ArrayRef ResultTys, ArrayRef Ops) { return getNode(Opcode, DL, getVTList(ResultTys), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, ArrayRef Ops) { if (VTList.NumVTs == 1) return getNode(Opcode, DL, VTList.VTs[0], Ops); #if 0 switch (Opcode) { // FIXME: figure out how to safely handle things like // int foo(int x) { return 1 << (x & 255); } // int bar() { return foo(256); } case ISD::SRA_PARTS: case ISD::SRL_PARTS: case ISD::SHL_PARTS: if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG && cast(N3.getOperand(1))->getVT() != MVT::i1) return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); else if (N3.getOpcode() == ISD::AND) if (ConstantSDNode *AndRHS = dyn_cast(N3.getOperand(1))) { // If the and is only masking out bits that cannot effect the shift, // eliminate the and. unsigned NumBits = VT.getScalarSizeInBits()*2; if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1) return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); } break; } #endif // Memoize the node unless it returns a flag. 
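  // Illustrative sketch (not part of the original file): because nodes are
  // memoized through the FoldingSet, requesting the same (opcode, types,
  // operands) combination twice yields the same node; only nodes whose last
  // result is MVT::Glue are exempt. Assuming X and Y are existing i32 values:
  //
  //   SDValue A = DAG.getNode(ISD::ADD, DL, MVT::i32, X, Y);
  //   SDValue B = DAG.getNode(ISD::ADD, DL, MVT::i32, X, Y);
  //   assert(A == B && "CSE returns the node built the first time");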
SDNode *N; if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) return SDValue(E, 0); N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList); createOperands(N, Ops); } InsertNode(N); SDValue V(N, 0); NewSDValueDbgMsg(V, "Creating new node: ", this); return V; } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList) { return getNode(Opcode, DL, VTList, None); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1) { SDValue Ops[] = { N1 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1, SDValue N2) { SDValue Ops[] = { N1, N2 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3) { SDValue Ops[] = { N1, N2, N3 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; return getNode(Opcode, DL, VTList, Ops); } SDVTList SelectionDAG::getVTList(EVT VT) { return makeVTList(SDNode::getValueTypeList(VT), 1); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) { FoldingSetNodeID ID; ID.AddInteger(2U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(2); Array[0] = VT1; Array[1] = VT2; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) { FoldingSetNodeID ID; ID.AddInteger(3U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(3); Array[0] = VT1; Array[1] = VT2; Array[2] = VT3; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) { FoldingSetNodeID ID; ID.AddInteger(4U); ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); ID.AddInteger(VT4.getRawBits()); void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(4); Array[0] = VT1; Array[1] = VT2; Array[2] = VT3; Array[3] = VT4; Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } SDVTList SelectionDAG::getVTList(ArrayRef VTs) { unsigned NumVTs = VTs.size(); FoldingSetNodeID ID; ID.AddInteger(NumVTs); for (unsigned index = 0; index < NumVTs; index++) { ID.AddInteger(VTs[index].getRawBits()); } void *IP = nullptr; SDVTListNode *Result = 
VTListMap.FindNodeOrInsertPos(ID, IP); if (!Result) { EVT *Array = Allocator.Allocate(NumVTs); std::copy(VTs.begin(), VTs.end(), Array); Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs); VTListMap.InsertNode(Result, IP); } return Result->getSDVTList(); } /// UpdateNodeOperands - *Mutate* the specified node in-place to have the /// specified operands. If the resultant node already exists in the DAG, /// this does not modify the specified node, instead it returns the node that /// already exists. If the resultant node does not exist in the DAG, the /// input node is returned. As a degenerate case, if you specify the same /// input operands as the node already has, the input node is returned. SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) { assert(N->getNumOperands() == 1 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op == N->getOperand(0)) return N; // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. N->OperandList[0].set(Op); updateDivergence(N); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { assert(N->getNumOperands() == 2 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1)) return N; // No operands changed, just return the input node. // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. if (N->OperandList[0] != Op1) N->OperandList[0].set(Op1); if (N->OperandList[1] != Op2) N->OperandList[1].set(Op2); updateDivergence(N); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) { SDValue Ops[] = { Op1, Op2, Op3 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4) { SDValue Ops[] = { Op1, Op2, Op3, Op4 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5) { SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 }; return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, ArrayRef Ops) { unsigned NumOps = Ops.size(); assert(N->getNumOperands() == NumOps && "Update with wrong number of operands"); // If no operands changed just return the input node. if (std::equal(Ops.begin(), Ops.end(), N->op_begin())) return N; // See if the modified node already exists. void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) InsertPos = nullptr; // Now we update the operands. 
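  // Illustrative sketch (not part of the original file): callers of the
  // UpdateNodeOperands() family must use the returned node, because the
  // mutation can collide with a node that already exists; NewOp0 and NewOp1
  // are hypothetical replacement operands.
  //
  //   SDNode *Res = DAG.UpdateNodeOperands(N, NewOp0, NewOp1);
  //   if (Res != N) {
  //     // An equivalent node already existed; N itself was left untouched.
  //   }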
for (unsigned i = 0; i != NumOps; ++i) if (N->OperandList[i] != Ops[i]) N->OperandList[i].set(Ops[i]); updateDivergence(N); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; } /// DropOperands - Release the operands and set this node to have /// zero operands. void SDNode::DropOperands() { // Unlike the code in MorphNodeTo that does this, we don't need to // watch for dead nodes here. for (op_iterator I = op_begin(), E = op_end(); I != E; ) { SDUse &Use = *I++; Use.set(SDValue()); } } /// SelectNodeTo - These are wrappers around MorphNodeTo that accept a /// machine opcode. /// SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT) { SDVTList VTs = getVTList(VT); return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2) { SDVTList VTs = getVTList(VT1, VT2); return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,ArrayRef Ops) { SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops); // Reset the NodeID to -1. New->setNodeId(-1); if (New != N) { ReplaceAllUsesWith(N, New); RemoveDeadNode(N); } return New; } /// UpdateSDLocOnMergeSDNode - If the opt level is -O0 then it throws away /// the line number information on the merged node since it is not possible to /// preserve the information that operation is associated with multiple lines. /// This will make the debugger working better at -O0, were there is a higher /// probability having other instructions associated with that line. /// /// For IROrder, we keep the smaller of the two SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) { DebugLoc NLoc = N->getDebugLoc(); if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) { N->setDebugLoc(DebugLoc()); } unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder()); N->setIROrder(Order); return N; } /// MorphNodeTo - This *mutates* the specified node to have the specified /// return type, opcode, and operands. 
/// /// Note that MorphNodeTo returns the resultant node. If there is already a /// node of the specified opcode and operands, it returns that node instead of /// the current one. Note that the SDLoc need not be the same. /// /// Using MorphNodeTo is faster than creating a new node and swapping it in /// with ReplaceAllUsesWith both because it often avoids allocating a new /// node, and because it doesn't require CSE recalculation for any of /// the node's users. /// /// However, note that MorphNodeTo recursively deletes dead nodes from the DAG. /// As a consequence it isn't appropriate to use from within the DAG combiner or /// the legalizer which maintain worklists that would need to be updated when /// deleting things. SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef Ops) { // If an identical node already exists, use it. void *IP = nullptr; if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, VTs, Ops); if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP)) return UpdateSDLocOnMergeSDNode(ON, SDLoc(N)); } if (!RemoveNodeFromCSEMaps(N)) IP = nullptr; // Start the morphing. N->NodeType = Opc; N->ValueList = VTs.VTs; N->NumValues = VTs.NumVTs; // Clear the operands list, updating used nodes to remove this from their // use list. Keep track of any operands that become dead as a result. SmallPtrSet DeadNodeSet; for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) { SDUse &Use = *I++; SDNode *Used = Use.getNode(); Use.set(SDValue()); if (Used->use_empty()) DeadNodeSet.insert(Used); } // For MachineNode, initialize the memory references information. if (MachineSDNode *MN = dyn_cast(N)) MN->setMemRefs(nullptr, nullptr); // Swap for an appropriately sized array from the recycler. removeOperands(N); createOperands(N, Ops); // Delete any nodes that are still dead after adding the uses for the // new operands. if (!DeadNodeSet.empty()) { SmallVector DeadNodes; for (SDNode *N : DeadNodeSet) if (N->use_empty()) DeadNodes.push_back(N); RemoveDeadNodes(DeadNodes); } if (IP) CSEMap.InsertNode(N, IP); // Memoize the new node. 
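  // Illustrative sketch (not part of the original file): instruction
  // selection usually reaches MorphNodeTo through the SelectNodeTo() wrappers
  // above; TargetOpc, VT, Op0 and Op1 are hypothetical.
  //
  //   SDNode *New = CurDAG->SelectNodeTo(N, TargetOpc, VT, Op0, Op1);
  //   // If an equivalent node already existed, New != N and SelectNodeTo has
  //   // already replaced and removed N.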
return N; } SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { unsigned OrigOpc = Node->getOpcode(); unsigned NewOpc; bool IsUnary = false; bool IsTernary = false; switch (OrigOpc) { default: llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); case ISD::STRICT_FADD: NewOpc = ISD::FADD; break; case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break; case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break; case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break; case ISD::STRICT_FREM: NewOpc = ISD::FREM; break; case ISD::STRICT_FMA: NewOpc = ISD::FMA; IsTernary = true; break; case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break; case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break; case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break; case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break; case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break; case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break; case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break; case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break; case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break; case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break; case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break; case ISD::STRICT_FNEARBYINT: NewOpc = ISD::FNEARBYINT; IsUnary = true; break; } // We're taking this node out of the chain, so we need to re-link things. SDValue InputChain = Node->getOperand(0); SDValue OutputChain = SDValue(Node, 1); ReplaceAllUsesOfValueWith(OutputChain, InputChain); SDVTList VTs = getVTList(Node->getOperand(1).getValueType()); SDNode *Res = nullptr; if (IsUnary) Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) }); else if (IsTernary) Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), Node->getOperand(2), Node->getOperand(3)}); else Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), Node->getOperand(2) }); // MorphNodeTo can operate in two ways: if an existing node with the // specified operands exists, it can just return it. Otherwise, it // updates the node in place to have the requested operands. if (Res == Node) { // If we updated the node in place, reset the node ID. To the isel, // this should be just like a newly allocated machine node. Res->setNodeId(-1); } else { ReplaceAllUsesWith(Node, Res); RemoveDeadNode(Node); } return Res; } /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. /// /// Note that getMachineNode returns the resultant node. If there is already a /// node of the specified opcode and operands, it returns that node instead of /// the current one. 
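// Illustrative sketch (not part of the original file): a minimal use of
// getMachineNode() from a target's Select() routine, shown with the generic
// IMPLICIT_DEF pseudo rather than a real target instruction; DL and VT are
// whatever location and type the caller already has.
//
//   MachineSDNode *Def =
//       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);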
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT) { SDVTList VTs = getVTList(VT); return getMachineNode(Opcode, dl, VTs, None); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1, EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2, Op3 }; return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1, EVT VT2, EVT VT3, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl, ArrayRef ResultTys, ArrayRef Ops) { SDVTList VTs = getVTList(ResultTys); return getMachineNode(Opcode, dl, VTs, Ops); } MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, ArrayRef Ops) { bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue; MachineSDNode *N; void *IP = nullptr; if (DoCSE) { FoldingSetNodeID ID; AddNodeIDNode(ID, ~Opcode, VTs, Ops); IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { return cast(UpdateSDLocOnMergeSDNode(E, DL)); } } // Allocate a new MachineSDNode. N = newSDNode(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); createOperands(N, Ops); if (DoCSE) CSEMap.InsertNode(N, IP); InsertNode(N); return N; } /// getTargetExtractSubreg - A convenience function for creating /// TargetOpcode::EXTRACT_SUBREG nodes. 
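// Illustrative sketch (not part of the original file): extracting and then
// re-inserting a subregister with the two helpers below; SubIdx stands in for
// a target-defined subregister index such as X86::sub_32bit, and Val64 is a
// hypothetical i64 value.
//
//   SDValue Lo     = DAG.getTargetExtractSubreg(SubIdx, DL, MVT::i32, Val64);
//   SDValue NewVal = DAG.getTargetInsertSubreg(SubIdx, DL, MVT::i64,
//                                              Val64, Lo);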
SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand) { SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32); SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, Operand, SRIdxVal); return SDValue(Subreg, 0); } /// getTargetInsertSubreg - A convenience function for creating /// TargetOpcode::INSERT_SUBREG nodes. SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg) { SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32); SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, Operand, Subreg, SRIdxVal); return SDValue(Result, 0); } /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef Ops, const SDNodeFlags Flags) { if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) { E->intersectFlagsWith(Flags); return E; } } return nullptr; } /// getDbgValue - Creates a SDDbgValue node. /// /// SDNode SDDbgValue *SelectionDAG::getDbgValue(DIVariable *Var, DIExpression *Expr, SDNode *N, unsigned R, bool IsIndirect, const DebugLoc &DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, N, R, IsIndirect, DL, O); } /// Constant SDDbgValue *SelectionDAG::getConstantDbgValue(DIVariable *Var, DIExpression *Expr, const Value *C, const DebugLoc &DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, DL, O); } /// FrameIndex SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var, DIExpression *Expr, unsigned FI, bool IsIndirect, const DebugLoc &DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, IsIndirect, DL, O, SDDbgValue::FRAMEIX); } /// VReg SDDbgValue *SelectionDAG::getVRegDbgValue(DIVariable *Var, DIExpression *Expr, unsigned VReg, bool IsIndirect, const DebugLoc &DL, unsigned O) { assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, VReg, IsIndirect, DL, O, SDDbgValue::VREG); } void SelectionDAG::transferDbgValues(SDValue From, SDValue To, unsigned OffsetInBits, unsigned SizeInBits, bool InvalidateDbg) { SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); assert(FromNode && ToNode && "Can't modify dbg values"); // PR35338 // TODO: assert(From != To && "Redundant dbg value transfer"); // TODO: assert(FromNode != ToNode && "Intranode dbg value transfer"); if (From == To || FromNode == ToNode) return; if (!FromNode->getHasDebugValue()) return; SmallVector ClonedDVs; for (SDDbgValue *Dbg : GetDbgValues(FromNode)) { if (Dbg->getKind() != SDDbgValue::SDNODE || Dbg->isInvalidated()) continue; // TODO: assert(!Dbg->isInvalidated() && "Transfer of invalid dbg value"); // Just transfer the dbg value attached to From. if (Dbg->getResNo() != From.getResNo()) continue; DIVariable *Var = Dbg->getVariable(); auto *Expr = Dbg->getExpression(); // If a fragment is requested, update the expression. 
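    // Illustrative sketch (not part of the original file): a fragment narrows
    // the debug expression to a bit range of the variable, e.g. when only the
    // low 32 bits of a 64-bit value are being transferred:
    //
    //   if (auto Frag = DIExpression::createFragmentExpression(Expr, 0, 32))
    //     Expr = *Frag;  // now describes only bits [0, 32) of the variable
    //
    // createFragmentExpression() can fail, in which case the dbg value is
    // simply not transferred.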
if (SizeInBits) { // When splitting a larger (e.g., sign-extended) value whose // lower bits are described with an SDDbgValue, do not attempt // to transfer the SDDbgValue to the upper bits. if (auto FI = Expr->getFragmentInfo()) if (OffsetInBits + SizeInBits > FI->SizeInBits) continue; auto Fragment = DIExpression::createFragmentExpression(Expr, OffsetInBits, SizeInBits); if (!Fragment) continue; Expr = *Fragment; } // Clone the SDDbgValue and move it to To. SDDbgValue *Clone = getDbgValue(Var, Expr, ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getDebugLoc(), Dbg->getOrder()); ClonedDVs.push_back(Clone); if (InvalidateDbg) Dbg->setIsInvalidated(); } for (SDDbgValue *Dbg : ClonedDVs) AddDbgValue(Dbg, ToNode, false); } void SelectionDAG::salvageDebugInfo(SDNode &N) { if (!N.getHasDebugValue()) return; SmallVector ClonedDVs; for (auto DV : GetDbgValues(&N)) { if (DV->isInvalidated()) continue; switch (N.getOpcode()) { default: break; case ISD::ADD: SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); if (!isConstantIntBuildVectorOrConstantInt(N0) && isConstantIntBuildVectorOrConstantInt(N1)) { uint64_t Offset = N.getConstantOperandVal(1); // Rewrite an ADD constant node into a DIExpression. Since we are // performing arithmetic to compute the variable's *value* in the // DIExpression, we need to mark the expression with a // DW_OP_stack_value. auto *DIExpr = DV->getExpression(); DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset, DIExpression::NoDeref, DIExpression::WithStackValue); SDDbgValue *Clone = getDbgValue(DV->getVariable(), DIExpr, N0.getNode(), N0.getResNo(), DV->isIndirect(), DV->getDebugLoc(), DV->getOrder()); ClonedDVs.push_back(Clone); DV->setIsInvalidated(); LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting"; N0.getNode()->dumprFull(this); dbgs() << " into " << *DIExpr << '\n'); } } } for (SDDbgValue *Dbg : ClonedDVs) AddDbgValue(Dbg, Dbg->getSDNode(), false); } /// Creates a SDDbgLabel node. SDDbgLabel *SelectionDAG::getDbgLabel(DILabel *Label, const DebugLoc &DL, unsigned O) { assert(cast(Label)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); return new (DbgInfo->getAlloc()) SDDbgLabel(Label, DL, O); } namespace { /// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node /// pointed to by a use iterator is deleted, increment the use iterator /// so that it doesn't dangle. /// class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { SDNode::use_iterator &UI; SDNode::use_iterator &UE; void NodeDeleted(SDNode *N, SDNode *E) override { // Increment the iterator as needed. while (UI != UE && N == *UI) ++UI; } public: RAUWUpdateListener(SelectionDAG &d, SDNode::use_iterator &ui, SDNode::use_iterator &ue) : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; } // end anonymous namespace /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version assumes From has a single result value. /// void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { SDNode *From = FromN.getNode(); assert(From->getNumValues() == 1 && FromN.getResNo() == 0 && "Cannot replace with this method!"); assert(From != To.getNode() && "Cannot replace uses of with self"); // Preserve Debug Values transferDbgValues(FromN, To); // Iterate over all the existing uses of From. New uses will be added // to the beginning of the use list, which we avoid visiting. 
// This specifically avoids visiting uses of From that arise while the // replacement is happening, because any such uses would be the result // of CSE: If an existing node looks like From after one of its operands // is replaced by To, we don't want to replace of all its users with To // too. See PR3018 for more info. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); ++UI; Use.set(To); if (To->isDivergent() != From->isDivergent()) updateDivergence(User); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (FromN == getRoot()) setRoot(To); } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version assumes that for each value of From, there is a /// corresponding value in To in the same position with the same type. /// void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) { #ifndef NDEBUG for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) assert((!From->hasAnyUseOfValue(i) || From->getValueType(i) == To->getValueType(i)) && "Cannot use this version of ReplaceAllUsesWith!"); #endif // Handle the trivial case. if (From == To) return; // Preserve Debug Info. Only do this if there's a use. for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) if (From->hasAnyUseOfValue(i)) { assert((i < To->getNumValues()) && "Invalid To location"); transferDbgValues(SDValue(From, i), SDValue(To, i)); } // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); ++UI; Use.setNode(To); if (To->isDivergent() != From->isDivergent()) updateDivergence(User); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot().getNode()) setRoot(SDValue(To, getRoot().getResNo())); } /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. /// /// This version can replace From with any result values. To must match the /// number and types of values returned by From. void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) { if (From->getNumValues() == 1) // Handle the simple case efficiently. 
return ReplaceAllUsesWith(SDValue(From, 0), To[0]); // Preserve Debug Info. for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) transferDbgValues(SDValue(From, i), *To); // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); const SDValue &ToOp = To[Use.getResNo()]; ++UI; Use.set(ToOp); if (To->getNode()->isDivergent() != From->isDivergent()) updateDivergence(User); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot().getNode()) setRoot(SDValue(To[getRoot().getResNo()])); } /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The Deleted /// vector is handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ // Handle the really simple, really trivial case efficiently. if (From == To) return; // Handle the simple, trivial, case efficiently. if (From.getNode()->getNumValues() == 1) { ReplaceAllUsesWith(From, To); return; } // Preserve Debug Info. transferDbgValues(From, To); // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From.getNode()->use_begin(), UE = From.getNode()->use_end(); RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; bool UserRemovedFromCSEMaps = false; // A user can appear in a use list multiple times, and when this // happens the uses are usually next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. do { SDUse &Use = UI.getUse(); // Skip uses of different values from the same node. if (Use.getResNo() != From.getResNo()) { ++UI; continue; } // If this node hasn't been modified yet, it's still in the CSE maps, // so remove its old self from the CSE maps. if (!UserRemovedFromCSEMaps) { RemoveNodeFromCSEMaps(User); UserRemovedFromCSEMaps = true; } ++UI; Use.set(To); if (To->isDivergent() != From->isDivergent()) updateDivergence(User); } while (UI != UE && *UI == User); // We are iterating over all uses of the From node, so if a use // doesn't use the specific value, no changes are made. if (!UserRemovedFromCSEMaps) continue; // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. if (From == getRoot()) setRoot(To); } namespace { /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith /// to record information about a use. struct UseMemo { SDNode *User; unsigned Index; SDUse *Use; }; /// operator< - Sort Memos by User. 
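// Illustrative sketch (not part of the original file): the single-value
// ReplaceAllUsesOfValueWith() defined above is the usual way to rewrite just
// one result of a multi-result node, e.g. redirecting only the chain output
// of a load LD to a hypothetical NewChain:
//
//   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);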
bool operator<(const UseMemo &L, const UseMemo &R) { return (intptr_t)L.User < (intptr_t)R.User; } } // end anonymous namespace void SelectionDAG::updateDivergence(SDNode * N) { if (TLI->isSDNodeAlwaysUniform(N)) return; bool IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA); for (auto &Op : N->ops()) { if (Op.Val.getValueType() != MVT::Other) IsDivergent |= Op.getNode()->isDivergent(); } if (N->SDNodeBits.IsDivergent != IsDivergent) { N->SDNodeBits.IsDivergent = IsDivergent; for (auto U : N->uses()) { updateDivergence(U); } } } void SelectionDAG::CreateTopologicalOrder(std::vector& Order) { DenseMap Degree; Order.reserve(AllNodes.size()); for (auto & N : allnodes()) { unsigned NOps = N.getNumOperands(); Degree[&N] = NOps; if (0 == NOps) Order.push_back(&N); } for (std::vector::iterator I = Order.begin(); I!=Order.end();++I) { SDNode * N = *I; for (auto U : N->uses()) { unsigned &UnsortedOps = Degree[U]; if (0 == --UnsortedOps) Order.push_back(U); } } } void SelectionDAG::VerifyDAGDiverence() { std::vector TopoOrder; CreateTopologicalOrder(TopoOrder); const TargetLowering &TLI = getTargetLoweringInfo(); DenseMap DivergenceMap; for (auto &N : allnodes()) { DivergenceMap[&N] = false; } for (auto N : TopoOrder) { bool IsDivergent = DivergenceMap[N]; bool IsSDNodeDivergent = TLI.isSDNodeSourceOfDivergence(N, FLI, DA); for (auto &Op : N->ops()) { if (Op.Val.getValueType() != MVT::Other) IsSDNodeDivergent |= DivergenceMap[Op.getNode()]; } if (!IsDivergent && IsSDNodeDivergent && !TLI.isSDNodeAlwaysUniform(N)) { DivergenceMap[N] = true; } } for (auto &N : allnodes()) { (void)N; assert(DivergenceMap[&N] == N.isDivergent() && "Divergence bit inconsistency detected\n"); } } /// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The same value /// may appear in both the From and To list. The Deleted vector is /// handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To, unsigned Num){ // Handle the simple, trivial case efficiently. if (Num == 1) return ReplaceAllUsesOfValueWith(*From, *To); transferDbgValues(*From, *To); // Read up all the uses and make records of them. This helps // processing new uses that are introduced during the // replacement process. SmallVector Uses; for (unsigned i = 0; i != Num; ++i) { unsigned FromResNo = From[i].getResNo(); SDNode *FromNode = From[i].getNode(); for (SDNode::use_iterator UI = FromNode->use_begin(), E = FromNode->use_end(); UI != E; ++UI) { SDUse &Use = UI.getUse(); if (Use.getResNo() == FromResNo) { UseMemo Memo = { *UI, i, &Use }; Uses.push_back(Memo); } } } // Sort the uses, so that all the uses from a given User are together. llvm::sort(Uses.begin(), Uses.end()); for (unsigned UseIndex = 0, UseIndexEnd = Uses.size(); UseIndex != UseIndexEnd; ) { // We know that this user uses some value of From. If it is the right // value, update it. SDNode *User = Uses[UseIndex].User; // This node is about to morph, remove its old self from the CSE maps. RemoveNodeFromCSEMaps(User); // The Uses array is sorted, so all the uses for a given User // are next to each other in the list. // To help reduce the number of CSE recomputations, process all // the uses of this user that we can find this way. 
do { unsigned i = Uses[UseIndex].Index; SDUse &Use = *Uses[UseIndex].Use; ++UseIndex; Use.set(To[i]); } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); } } /// AssignTopologicalOrder - Assign a unique node id for each node in the DAG /// based on their topological order. It returns the maximum id and a vector /// of the SDNodes* in assigned order by reference. unsigned SelectionDAG::AssignTopologicalOrder() { unsigned DAGSize = 0; // SortedPos tracks the progress of the algorithm. Nodes before it are // sorted, nodes after it are unsorted. When the algorithm completes // it is at the end of the list. allnodes_iterator SortedPos = allnodes_begin(); // Visit all the nodes. Move nodes with no operands to the front of // the list immediately. Annotate nodes that do have operands with their // operand count. Before we do this, the Node Id fields of the nodes // may contain arbitrary values. After, the Node Id fields for nodes // before SortedPos will contain the topological sort index, and the // Node Id fields for nodes At SortedPos and after will contain the // count of outstanding operands. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) { SDNode *N = &*I++; checkForCycles(N, this); unsigned Degree = N->getNumOperands(); if (Degree == 0) { // A node with no uses, add it to the result array immediately. N->setNodeId(DAGSize++); allnodes_iterator Q(N); if (Q != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q)); assert(SortedPos != AllNodes.end() && "Overran node list"); ++SortedPos; } else { // Temporarily use the Node Id as scratch space for the degree count. N->setNodeId(Degree); } } // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. for (SDNode &Node : allnodes()) { SDNode *N = &Node; checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDNode *P = *UI; unsigned Degree = P->getNodeId(); assert(Degree != 0 && "Invalid node degree"); --Degree; if (Degree == 0) { // All of P's operands are sorted, so P may sorted now. P->setNodeId(DAGSize++); if (P->getIterator() != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P)); assert(SortedPos != AllNodes.end() && "Overran node list"); ++SortedPos; } else { // Update P's outstanding operand count. 
P->setNodeId(Degree); } } if (Node.getIterator() == SortedPos) { #ifndef NDEBUG allnodes_iterator I(N); SDNode *S = &*++I; dbgs() << "Overran sorted position:\n"; S->dumprFull(this); dbgs() << "\n"; dbgs() << "Checking if this is due to cycles\n"; checkForCycles(this, true); #endif llvm_unreachable(nullptr); } } assert(SortedPos == AllNodes.end() && "Topological sort incomplete!"); assert(AllNodes.front().getOpcode() == ISD::EntryToken && "First node in topological sort is not the entry token!"); assert(AllNodes.front().getNodeId() == 0 && "First node in topological sort has non-zero id!"); assert(AllNodes.front().getNumOperands() == 0 && "First node in topological sort has operands!"); assert(AllNodes.back().getNodeId() == (int)DAGSize-1 && "Last node in topologic sort has unexpected id!"); assert(AllNodes.back().use_empty() && "Last node in topologic sort has users!"); assert(DAGSize == allnodes_size() && "Node count mismatch!"); return DAGSize; } /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the /// value is produced by SD. void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) { if (SD) { assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue()); SD->setHasDebugValue(true); } DbgInfo->add(DB, SD, isParameter); } void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) { DbgInfo->add(DB); } SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp) { assert(isa(NewMemOp.getNode()) && "Expected a memop node"); // The new memory operation must have the same position as the old load in // terms of memory dependency. Create a TokenFactor for the old load and new // memory operation and update uses of the old load's output chain to use that // TokenFactor. SDValue OldChain = SDValue(OldLoad, 1); SDValue NewChain = SDValue(NewMemOp.getNode(), 1); if (!OldLoad->hasAnyUseOfValue(1)) return NewChain; SDValue TokenFactor = getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain); ReplaceAllUsesOfValueWith(OldChain, TokenFactor); UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain); return TokenFactor; } //===----------------------------------------------------------------------===// // SDNode Class //===----------------------------------------------------------------------===// bool llvm::isNullConstant(SDValue V) { ConstantSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isNullValue(); } bool llvm::isNullFPConstant(SDValue V) { ConstantFPSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isZero() && !Const->isNegative(); } bool llvm::isAllOnesConstant(SDValue V) { ConstantSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isAllOnesValue(); } bool llvm::isOneConstant(SDValue V) { ConstantSDNode *Const = dyn_cast(V); return Const != nullptr && Const->isOne(); } bool llvm::isBitwiseNot(SDValue V) { return V.getOpcode() == ISD::XOR && isAllOnesConstant(V.getOperand(1)); } ConstantSDNode *llvm::isConstOrConstSplat(SDValue N) { if (ConstantSDNode *CN = dyn_cast(N)) return CN; if (BuildVectorSDNode *BV = dyn_cast(N)) { BitVector UndefElements; ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements); // BuildVectors can truncate their operands. Ignore that case here. // FIXME: We blindly ignore splats which include undef which is overly // pessimistic. 
if (CN && UndefElements.none() && CN->getValueType(0) == N.getValueType().getScalarType()) return CN; } return nullptr; } ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N) { if (ConstantFPSDNode *CN = dyn_cast(N)) return CN; if (BuildVectorSDNode *BV = dyn_cast(N)) { BitVector UndefElements; ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements); if (CN && UndefElements.none()) return CN; } return nullptr; } HandleSDNode::~HandleSDNode() { DropOperands(); } GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL, const GlobalValue *GA, EVT VT, int64_t o, unsigned char TF) : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) { TheGlobal = GA; } AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl, EVT VT, unsigned SrcAS, unsigned DestAS) : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)), SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {} MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT memvt, MachineMemOperand *mmo) : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) { MemSDNodeBits.IsVolatile = MMO->isVolatile(); MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal(); MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable(); MemSDNodeBits.IsInvariant = MMO->isInvariant(); // We check here that the size of the memory operand fits within the size of // the MMO. This is because the MMO might indicate only a possible address // range instead of specifying the affected memory addresses precisely. assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } /// Profile - Gather unique data for the node. /// void SDNode::Profile(FoldingSetNodeID &ID) const { AddNodeIDNode(ID, this); } namespace { struct EVTArray { std::vector VTs; EVTArray() { VTs.reserve(MVT::LAST_VALUETYPE); for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i) VTs.push_back(MVT((MVT::SimpleValueType)i)); } }; } // end anonymous namespace static ManagedStatic> EVTs; static ManagedStatic SimpleVTArray; static ManagedStatic> VTMutex; /// getValueTypeList - Return a pointer to the specified value type. /// const EVT *SDNode::getValueTypeList(EVT VT) { if (VT.isExtended()) { sys::SmartScopedLock Lock(*VTMutex); return &(*EVTs->insert(VT).first); } else { assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE && "Value type out of range!"); return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy]; } } /// hasNUsesOfValue - Return true if there are exactly NUSES uses of the /// indicated value. This method ignores uses of other values defined by this /// operation. bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const { assert(Value < getNumValues() && "Bad value!"); // TODO: Only iterate over uses of a given value of the node for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) { if (UI.getUse().getResNo() == Value) { if (NUses == 0) return false; --NUses; } } // Found exactly the right number of uses? return NUses == 0; } /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool SDNode::hasAnyUseOfValue(unsigned Value) const { assert(Value < getNumValues() && "Bad value!"); for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) if (UI.getUse().getResNo() == Value) return true; return false; } /// isOnlyUserOf - Return true if this node is the only use of N. 
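// Illustrative sketch (not part of the original file): a common query built
// on the use-counting helpers above is whether a load's chain result is dead:
//
//   if (!LD->hasAnyUseOfValue(1)) {
//     // No user reads the chain (result #1); only the loaded value is used.
//   }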
bool SDNode::isOnlyUserOf(const SDNode *N) const { bool Seen = false; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { SDNode *User = *I; if (User == this) Seen = true; else return false; } return Seen; } /// Return true if the only users of N are contained in Nodes. bool SDNode::areOnlyUsersOf(ArrayRef Nodes, const SDNode *N) { bool Seen = false; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { SDNode *User = *I; if (llvm::any_of(Nodes, [&User](const SDNode *Node) { return User == Node; })) Seen = true; else return false; } return Seen; } /// isOperand - Return true if this node is an operand of N. bool SDValue::isOperandOf(const SDNode *N) const { for (const SDValue &Op : N->op_values()) if (*this == Op) return true; return false; } bool SDNode::isOperandOf(const SDNode *N) const { for (const SDValue &Op : N->op_values()) if (this == Op.getNode()) return true; return false; } /// reachesChainWithoutSideEffects - Return true if this operand (which must /// be a chain) reaches the specified operand without crossing any /// side-effecting instructions on any chain path. In practice, this looks /// through token factors and non-volatile loads. In order to remain efficient, /// this only looks a couple of nodes in, it does not do an exhaustive search. /// /// Note that we only need to examine chains when we're searching for /// side-effects; SelectionDAG requires that all side-effects are represented /// by chains, even if another operand would force a specific ordering. This /// constraint is necessary to allow transformations like splitting loads. bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, unsigned Depth) const { if (*this == Dest) return true; // Don't search too deeply, we just want to be able to see through // TokenFactor's etc. if (Depth == 0) return false; // If this is a token factor, all inputs to the TF happen in parallel. if (getOpcode() == ISD::TokenFactor) { // First, try a shallow search. if (is_contained((*this)->ops(), Dest)) { // We found the chain we want as an operand of this TokenFactor. // Essentially, we reach the chain without side-effects if we could // serialize the TokenFactor into a simple chain of operations with // Dest as the last operation. This is automatically true if the // chain has one use: there are no other ordering constraints. // If the chain has more than one use, we give up: some other // use of Dest might force a side-effect between Dest and the current // node. if (Dest.hasOneUse()) return true; } // Next, try a deep search: check whether every operand of the TokenFactor // reaches Dest. return llvm::all_of((*this)->ops(), [=](SDValue Op) { return Op.reachesChainWithoutSideEffects(Dest, Depth - 1); }); } // Loads don't have side effects, look through them. 
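#if 0
// Illustrative sketch (hypothetical helper): a typical query against
// reachesChainWithoutSideEffects, asking whether a new memory operation's
// chain reaches an old load's chain without crossing side effects
// (TokenFactors and non-volatile loads are looked through, up to the small
// default depth).
static bool chainsEquivalentSketch(LoadSDNode *OldLoad, SDValue NewMemOp) {
  SDValue OldChain = SDValue(OldLoad, 1);             // old load's chain result
  SDValue NewChain = SDValue(NewMemOp.getNode(), 1);  // new memop's chain result
  return NewChain.reachesChainWithoutSideEffects(OldChain);
}
#endif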
if (LoadSDNode *Ld = dyn_cast(*this)) { if (!Ld->isVolatile()) return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1); } return false; } bool SDNode::hasPredecessor(const SDNode *N) const { SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(this); return hasPredecessorHelper(N, Visited, Worklist); } void SDNode::intersectFlagsWith(const SDNodeFlags Flags) { this->Flags.intersectWith(Flags); } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { assert(N->getNumValues() == 1 && "Can't unroll a vector with multiple results!"); EVT VT = N->getValueType(0); unsigned NE = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); SmallVector Scalars; SmallVector Operands(N->getNumOperands()); // If ResNE is 0, fully unroll the vector op. if (ResNE == 0) ResNE = NE; else if (NE > ResNE) NE = ResNE; unsigned i; for (i= 0; i != NE; ++i) { for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { SDValue Operand = N->getOperand(j); EVT OperandVT = Operand.getValueType(); if (OperandVT.isVector()) { // A vector operand; extract a single element. EVT OperandEltVT = OperandVT.getVectorElementType(); Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand, getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout()))); } else { // A scalar operand; just use it as is. Operands[j] = Operand; } } switch (N->getOpcode()) { default: { Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, N->getFlags())); break; } case ISD::VSELECT: Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands)); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], getShiftAmountOperand(Operands[0].getValueType(), Operands[1]))); break; case ISD::SIGN_EXTEND_INREG: case ISD::FP_ROUND_INREG: { EVT ExtVT = cast(Operands[1])->getVT().getVectorElementType(); Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], getValueType(ExtVT))); } } } for (; i < ResNE; ++i) Scalars.push_back(getUNDEF(EltVT)); EVT VecVT = EVT::getVectorVT(*getContext(), EltVT, ResNE); return getBuildVector(VecVT, dl, Scalars); } bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const { if (LD->isVolatile() || Base->isVolatile()) return false; if (LD->isIndexed() || Base->isIndexed()) return false; if (LD->getChain() != Base->getChain()) return false; EVT VT = LD->getValueType(0); if (VT.getSizeInBits() / 8 != Bytes) return false; auto BaseLocDecomp = BaseIndexOffset::match(Base, *this); auto LocDecomp = BaseIndexOffset::match(LD, *this); int64_t Offset = 0; if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset)) return (Dist * Bytes == Offset); return false; } /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if /// it cannot be inferred. unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { // If this is a GlobalAddress + cst, return the alignment. const GlobalValue *GV; int64_t GVOffset = 0; if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned IdxWidth = getDataLayout().getIndexTypeSizeInBits(GV->getType()); KnownBits Known(IdxWidth); llvm::computeKnownBits(GV, Known, getDataLayout()); unsigned AlignBits = Known.countMinTrailingZeros(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); } // If this is a direct reference to a stack slot, use information about the // stack slot's alignment. 
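#if 0
// Illustrative sketch (hypothetical helper): how known-zero low bits become an
// alignment in InferPtrAlignment. A base with AlignBits trailing zero bits is
// (1 << AlignBits)-aligned; adding a constant offset can only lower that to
// MinAlign of the two, e.g. (16, 8) -> 8. Assumes llvm/Support/MathExtras.h.
static uint64_t alignFromKnownZerosSketch(unsigned AlignBits, uint64_t Offset) {
  uint64_t Align = AlignBits ? uint64_t(1) << AlignBits : 0;  // 4 bits -> 16
  return Align ? MinAlign(Align, Offset) : 0;                 // (16, 8) -> 8
}
#endif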
int FrameIdx = 1 << 31; int64_t FrameOffset = 0; if (FrameIndexSDNode *FI = dyn_cast(Ptr)) { FrameIdx = FI->getIndex(); } else if (isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { // Handle FI+Cst FrameIdx = cast(Ptr.getOperand(0))->getIndex(); FrameOffset = Ptr.getConstantOperandVal(1); } if (FrameIdx != (1 << 31)) { const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx), FrameOffset); return FIInfoAlign; } return 0; } /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type /// which is split (or expanded) into two not necessarily identical pieces. std::pair SelectionDAG::GetSplitDestVTs(const EVT &VT) const { // Currently all types are split in half. EVT LoVT, HiVT; if (!VT.isVector()) LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT); else LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext()); return std::make_pair(LoVT, HiVT); } /// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the /// low/high part. std::pair SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT) { assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <= N.getValueType().getVectorNumElements() && "More vector elements requested than available!"); SDValue Lo, Hi; Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout()))); Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N, getConstant(LoVT.getVectorNumElements(), DL, TLI->getVectorIdxTy(getDataLayout()))); return std::make_pair(Lo, Hi); } void SelectionDAG::ExtractVectorElements(SDValue Op, SmallVectorImpl &Args, unsigned Start, unsigned Count) { EVT VT = Op.getValueType(); if (Count == 0) Count = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); EVT IdxTy = TLI->getVectorIdxTy(getDataLayout()); SDLoc SL(Op); for (unsigned i = Start, e = Start + Count; i != e; ++i) { Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Op, getConstant(i, SL, IdxTy))); } } // getAddressSpace - Return the address space this GlobalAddress belongs to. unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); } Type *ConstantPoolSDNode::getType() const { if (isMachineConstantPoolEntry()) return Val.MachineCPVal->getType(); return Val.ConstVal->getType(); } bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits, bool IsBigEndian) const { EVT VT = getValueType(0); assert(VT.isVector() && "Expected a vector type"); unsigned VecWidth = VT.getSizeInBits(); if (MinSplatBits > VecWidth) return false; // FIXME: The widths are based on this node's type, but build vectors can // truncate their operands. SplatValue = APInt(VecWidth, 0); SplatUndef = APInt(VecWidth, 0); // Get the bits. Bits with undefined values (when the corresponding element // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared // in SplatValue. If any of the values are not constant, give up and return // false. unsigned int NumOps = getNumOperands(); assert(NumOps > 0 && "isConstantSplat has 0-size build vector"); unsigned EltWidth = VT.getScalarSizeInBits(); for (unsigned j = 0; j < NumOps; ++j) { unsigned i = IsBigEndian ? 
NumOps - 1 - j : j; SDValue OpVal = getOperand(i); unsigned BitPos = j * EltWidth; if (OpVal.isUndef()) SplatUndef.setBits(BitPos, BitPos + EltWidth); else if (auto *CN = dyn_cast(OpVal)) SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos); else if (auto *CN = dyn_cast(OpVal)) SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos); else return false; } // The build_vector is all constants or undefs. Find the smallest element // size that splats the vector. HasAnyUndefs = (SplatUndef != 0); // FIXME: This does not work for vectors with elements less than 8 bits. while (VecWidth > 8) { unsigned HalfSize = VecWidth / 2; APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize); APInt LowValue = SplatValue.trunc(HalfSize); APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize); APInt LowUndef = SplatUndef.trunc(HalfSize); // If the two halves do not match (ignoring undef bits), stop here. if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) || MinSplatBits > HalfSize) break; SplatValue = HighValue | LowValue; SplatUndef = HighUndef & LowUndef; VecWidth = HalfSize; } SplatBitSize = VecWidth; return true; } SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const { if (UndefElements) { UndefElements->clear(); UndefElements->resize(getNumOperands()); } SDValue Splatted; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { SDValue Op = getOperand(i); if (Op.isUndef()) { if (UndefElements) (*UndefElements)[i] = true; } else if (!Splatted) { Splatted = Op; } else if (Splatted != Op) { return SDValue(); } } if (!Splatted) { assert(getOperand(0).isUndef() && "Can only have a splat without a constant for all undefs."); return getOperand(0); } return Splatted; } ConstantSDNode * BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null(getSplatValue(UndefElements)); } ConstantFPSDNode * BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null(getSplatValue(UndefElements)); } int32_t BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const { if (ConstantFPSDNode *CN = dyn_cast_or_null(getSplatValue(UndefElements))) { bool IsExact; APSInt IntVal(BitWidth); const APFloat &APF = CN->getValueAPF(); if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) != APFloat::opOK || !IsExact) return -1; return IntVal.exactLogBase2(); } return -1; } bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP) return false; } return true; } bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { // Find the first non-undef value in the shuffle mask. unsigned i, e; for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i) /* search */; assert(i != e && "VECTOR_SHUFFLE node with all undef indices!"); // Make sure all remaining elements are either undef or the same as the first // non-undef value. for (int Idx = Mask[i]; i != e; ++i) if (Mask[i] >= 0 && Mask[i] != Idx) return false; return true; } // Returns the SDNode if it is a constant integer BuildVector // or constant integer. SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) { if (isa(N)) return N.getNode(); if (ISD::isBuildVectorOfConstantSDNodes(N.getNode())) return N.getNode(); // Treat a GlobalAddress supporting constant offset folding as a // constant integer. 
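#if 0
// Illustrative sketch (hypothetical helper): the halving loop of
// isConstantSplat on a concrete pattern. For concatenated bits
// 0x00FF00FF00FF00FF with no undefs, the halves match at 64 and 32 bits and
// first differ once the 16-bit value 0x00FF is split into 0x00 and 0xFF, so
// SplatBitSize ends up as 16.
static unsigned splatSizeSketch() {
  APInt Splat(64, 0x00FF00FF00FF00FFULL);
  unsigned Width = 64;
  while (Width > 8) {
    unsigned Half = Width / 2;
    APInt Hi = Splat.lshr(Half).trunc(Half);
    APInt Lo = Splat.trunc(Half);
    if (Hi != Lo)
      break;          // halves disagree -> smallest splat found
    Splat = Lo;
    Width = Half;
  }
  return Width;       // == 16 for this pattern
}
#endif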
if (GlobalAddressSDNode *GA = dyn_cast(N)) if (GA->getOpcode() == ISD::GlobalAddress && TLI->isOffsetFoldingLegal(GA)) return GA; return nullptr; } SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) { if (isa(N)) return N.getNode(); if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) return N.getNode(); return nullptr; } void SelectionDAG::createOperands(SDNode *Node, ArrayRef Vals) { assert(!Node->OperandList && "Node already has operands"); SDUse *Ops = OperandRecycler.allocate( ArrayRecycler::Capacity::get(Vals.size()), OperandAllocator); bool IsDivergent = false; for (unsigned I = 0; I != Vals.size(); ++I) { Ops[I].setUser(Node); Ops[I].setInitial(Vals[I]); if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence. IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent(); } Node->NumOperands = Vals.size(); Node->OperandList = Ops; IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA); if (!TLI->isSDNodeAlwaysUniform(Node)) Node->SDNodeBits.IsDivergent = IsDivergent; checkForCycles(Node); } #ifndef NDEBUG static void checkForCyclesHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallPtrSetImpl &Checked, const llvm::SelectionDAG *DAG) { // If this node has already been checked, don't check it again. if (Checked.count(N)) return; // If a node has already been visited on this depth-first walk, reject it as // a cycle. if (!Visited.insert(N).second) { errs() << "Detected cycle in SelectionDAG\n"; dbgs() << "Offending node:\n"; N->dumprFull(DAG); dbgs() << "\n"; abort(); } for (const SDValue &Op : N->op_values()) checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG); Checked.insert(N); Visited.erase(N); } #endif void llvm::checkForCycles(const llvm::SDNode *N, const llvm::SelectionDAG *DAG, bool force) { #ifndef NDEBUG bool check = force; #ifdef EXPENSIVE_CHECKS check = true; #endif // EXPENSIVE_CHECKS if (check) { assert(N && "Checking nonexistent SDNode"); SmallPtrSet visited; SmallPtrSet checked; checkForCyclesHelper(N, visited, checked, DAG); } #endif // !NDEBUG } void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) { checkForCycles(DAG->getRoot().getNode(), DAG, force); } Index: vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 338000) @@ -1,10302 +1,10323 @@ //===- SelectionDAGBuilder.cpp - Selection-DAG building -------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This implements routines for translating from LLVM IR into SelectionDAG IR. 
// //===----------------------------------------------------------------------===// #include "SelectionDAGBuilder.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include 
"llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "isel" /// LimitFloatPrecision - Generate low-precision inline sequences for /// some float libcalls (6, 8 or 12 bits). static unsigned LimitFloatPrecision; static cl::opt LimitFPPrecision("limit-float-precision", cl::desc("Generate low-precision inline sequences " "for some float libcalls"), cl::location(LimitFloatPrecision), cl::Hidden, cl::init(0)); static cl::opt SwitchPeelThreshold( "switch-peel-threshold", cl::Hidden, cl::init(66), cl::desc("Set the case probability threshold for peeling the case from a " "switch statement. A value greater than 100 will void this " "optimization")); // Limit the width of DAG chains. This is important in general to prevent // DAG-based analysis from blowing up. For example, alias analysis and // load clustering may not complete in reasonable time. It is difficult to // recognize and avoid this situation within each individual analysis, and // future analyses are likely to have the same behavior. Limiting DAG width is // the safe approach and will be especially important with global DAGs. // // MaxParallelChains default is arbitrarily high to avoid affecting // optimization, but could be lowered to improve compile time. Any ld-ld-st-st // sequence over this should have been converted to llvm.memcpy by the // frontend. It is easy to induce this behavior with .ll code such as: // %buffer = alloca [4096 x i8] // %data = load [4096 x i8]* %argPtr // store [4096 x i8] %data, [4096 x i8]* %buffer static const unsigned MaxParallelChains = 64; // Return the calling convention if the Value passed requires ABI mangling as it // is a parameter to a function or a return value from a function which is not // an intrinsic. static Optional getABIRegCopyCC(const Value *V) { if (auto *R = dyn_cast(V)) return R->getParent()->getParent()->getCallingConv(); if (auto *CI = dyn_cast(V)) { const bool IsInlineAsm = CI->isInlineAsm(); const bool IsIndirectFunctionCall = !IsInlineAsm && !CI->getCalledFunction(); // It is possible that the call instruction is an inline asm statement or an // indirect function call in which case the return value of // getCalledFunction() would be nullptr. const bool IsInstrinsicCall = !IsInlineAsm && !IsIndirectFunctionCall && CI->getCalledFunction()->getIntrinsicID() != Intrinsic::not_intrinsic; if (!IsInlineAsm && !IsInstrinsicCall) return CI->getCallingConv(); } return None; } static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, Optional CC); /// getCopyFromParts - Create a value that contains the specified legal parts /// combined into the value they represent. If the parts combine to a type /// larger than ValueVT then AssertOp can be used to specify whether the extra /// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT /// (ISD::AssertSext). 
static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, Optional CC = None, Optional AssertOp = None) { if (ValueVT.isVector()) return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V, CC); assert(NumParts > 0 && "No parts to assemble!"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; if (NumParts > 1) { // Assemble the value from multiple parts. if (ValueVT.isInteger()) { unsigned PartBits = PartVT.getSizeInBits(); unsigned ValueBits = ValueVT.getSizeInBits(); // Assemble the power of 2 part. unsigned RoundParts = NumParts & (NumParts - 1) ? 1 << Log2_32(NumParts) : NumParts; unsigned RoundBits = PartBits * RoundParts; EVT RoundVT = RoundBits == ValueBits ? ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits); SDValue Lo, Hi; EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2); if (RoundParts > 2) { Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT, V); Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2, PartVT, HalfVT, V); } else { Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]); Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]); } if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi); if (RoundParts < NumParts) { // Assemble the trailing non-power-of-2 part. unsigned OddParts = NumParts - RoundParts; EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits); Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT, OddVT, V, CC); // Combine the round and odd parts. Lo = Val; if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi); Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi, DAG.getConstant(Lo.getValueSizeInBits(), DL, TLI.getPointerTy(DAG.getDataLayout()))); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo); Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi); } } else if (PartVT.isFloatingPoint()) { // FP split into multiple FP parts (for ppcf128) assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 && "Unexpected split"); SDValue Lo, Hi; Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]); Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]); if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout())) std::swap(Lo, Hi); Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi); } else { // FP split into integer parts (soft fp) assert(ValueVT.isFloatingPoint() && PartVT.isInteger() && !PartVT.isVector() && "Unexpected split"); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC); } } // There is now one part, held in Val. Correct it to match ValueVT. // PartEVT is the type of the register class that holds the value. // ValueVT is the type of the inline asm operation. EVT PartEVT = Val.getValueType(); if (PartEVT == ValueVT) return Val; if (PartEVT.isInteger() && ValueVT.isFloatingPoint() && ValueVT.bitsLT(PartEVT)) { // For an FP value in an integer part, we need to truncate to the right // width first. PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val); } // Handle types that have the same size. 
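#if 0
// Illustrative sketch (hypothetical helper): the power-of-2 rounding used
// above, evaluated for NumParts == 3 with 32-bit parts. 3 & 2 != 0, so
// RoundParts = 1 << Log2_32(3) == 2 and RoundBits == 64; parts 0 and 1 form
// the i64 "round" half, and the single odd part is extended, shifted left by
// 64 and OR'd in.
static unsigned roundPartsSketch(unsigned NumParts) {       // e.g. 3
  return (NumParts & (NumParts - 1)) ? 1u << Log2_32(NumParts)
                                     : NumParts;            // -> 2
}
#endif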
if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits()) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); // Handle types with different sizes. if (PartEVT.isInteger() && ValueVT.isInteger()) { if (ValueVT.bitsLT(PartEVT)) { // For a truncate, see if we have any information to // indicate whether the truncated bits will always be // zero or sign-extension. if (AssertOp.hasValue()) Val = DAG.getNode(*AssertOp, DL, PartEVT, Val, DAG.getValueType(ValueVT)); return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); } return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val); } if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { // FP_ROUND's are always exact here. if (ValueVT.bitsLT(Val.getValueType())) return DAG.getNode( ISD::FP_ROUND, DL, ValueVT, Val, DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()))); return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val); } llvm_unreachable("Unknown mismatch!"); } static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V, const Twine &ErrMsg) { const Instruction *I = dyn_cast_or_null(V); if (!V) return Ctx.emitError(ErrMsg); const char *AsmError = ", possible invalid constraint for vector type"; if (const CallInst *CI = dyn_cast(I)) if (isa(CI->getCalledValue())) return Ctx.emitError(I, ErrMsg + AsmError); return Ctx.emitError(I, ErrMsg); } /// getCopyFromPartsVector - Create a value that contains the specified legal /// parts combined into the value they represent. If the parts combine to a /// type larger than ValueVT then AssertOp can be used to specify whether the /// extra bits are known to be zero (ISD::AssertZext) or sign extended from /// ValueVT (ISD::AssertSext). static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, Optional CallConv) { assert(ValueVT.isVector() && "Not a vector value"); assert(NumParts > 0 && "No parts to assemble!"); const bool IsABIRegCopy = CallConv.hasValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; // Handle a multi-element vector. if (NumParts > 1) { EVT IntermediateVT; MVT RegisterVT; unsigned NumIntermediates; unsigned NumRegs; if (IsABIRegCopy) { NumRegs = TLI.getVectorTypeBreakdownForCallingConv( *DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); } else { NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); } assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); assert(RegisterVT.getSizeInBits() == Parts[0].getSimpleValueType().getSizeInBits() && "Part type sizes don't match!"); // Assemble the parts into intermediate operands. SmallVector Ops(NumIntermediates); if (NumIntermediates == NumParts) { // If the register was not expanded, truncate or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, PartVT, IntermediateVT, V); } else if (NumParts > 0) { // If the intermediate type was expanded, build the intermediate // operands from the parts. 
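#if 0
// Illustrative sketch (hypothetical helper and target): reassembling a wide
// vector from its register parts. On a target whose widest legal vector
// register held v4i16, a v8i16 value would be broken into two v4i16
// intermediates, which are glued back together with CONCAT_VECTORS (scalar
// intermediates would use BUILD_VECTOR instead).
static SDValue concatIntermediatesSketch(SelectionDAG &DAG, const SDLoc &DL,
                                         SDValue Int0, SDValue Int1) {
  SDValue Ops[] = {Int0, Int1};
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Ops);
}
#endif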
assert(NumParts % NumIntermediates == 0 && "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, PartVT, IntermediateVT, V); } // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the // intermediate operands. EVT BuiltVectorTy = EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(), (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() * NumParts : NumIntermediates)); Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, BuiltVectorTy, Ops); } // There is now one part, held in Val. Correct it to match ValueVT. EVT PartEVT = Val.getValueType(); if (PartEVT == ValueVT) return Val; if (PartEVT.isVector()) { // If the element type of the source/dest vectors are the same, but the // parts vector has more elements than the value vector, then we have a // vector widening case (e.g. <2 x float> -> <4 x float>). Extract the // elements we want. if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) { assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() && "Cannot narrow, it would be a lossy transformation"); return DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } // Vector/Vector bitcast. if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() && "Cannot handle this kind of promotion"); // Promoted vector extract return DAG.getAnyExtOrTrunc(Val, DL, ValueVT); } // Trivial bitcast if the types are the same size and the destination // vector type is legal. if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() && TLI.isTypeLegal(ValueVT)) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); if (ValueVT.getVectorNumElements() != 1) { // Certain ABIs require that vectors are passed as integers. For vectors // are the same size, this is an obvious bitcast. if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) { return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); } else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) { // Bitcast Val back the original type and extract the corresponding // vector we want. unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits(); EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(), ValueVT.getVectorElementType(), Elts); Val = DAG.getBitcast(WiderVecType, Val); return DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } diagnosePossiblyInvalidConstraint( *DAG.getContext(), V, "non-trivial scalar-to-vector conversion"); return DAG.getUNDEF(ValueVT); } // Handle cases such as i8 -> <1 x i1> EVT ValueSVT = ValueVT.getVectorElementType(); if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT) : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT); return DAG.getBuildVector(ValueVT, DL, Val); } static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V, Optional CallConv); /// getCopyToParts - Create a series of nodes that contain the specified value /// split into legal parts. 
If the parts contain more bits than Val, then, for /// integers, ExtendKind can be used to specify how to generate the extra bits. static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V, Optional CallConv = None, ISD::NodeType ExtendKind = ISD::ANY_EXTEND) { EVT ValueVT = Val.getValueType(); // Handle the vector case separately. if (ValueVT.isVector()) return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V, CallConv); unsigned PartBits = PartVT.getSizeInBits(); unsigned OrigNumParts = NumParts; assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) && "Copying to an illegal type!"); if (NumParts == 0) return; assert(!ValueVT.isVector() && "Vector case handled elsewhere"); EVT PartEVT = PartVT; if (PartEVT == ValueVT) { assert(NumParts == 1 && "No-op copy with multiple parts!"); Parts[0] = Val; return; } if (NumParts * PartBits > ValueVT.getSizeInBits()) { // If the parts cover more bits than the value has, promote the value. if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { assert(NumParts == 1 && "Do not know what to promote to!"); Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val); } else { if (ValueVT.isFloatingPoint()) { // FP values need to be bitcast, then extended if they are being put // into a larger container. ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); } assert((PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"); ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Val = DAG.getNode(ExtendKind, DL, ValueVT, Val); if (PartVT == MVT::x86mmx) Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } } else if (PartBits == ValueVT.getSizeInBits()) { // Different types of the same size. assert(NumParts == 1 && PartEVT != ValueVT); Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } else if (NumParts * PartBits < ValueVT.getSizeInBits()) { // If the parts cover less bits than value has, truncate the value. assert((PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"); ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); if (PartVT == MVT::x86mmx) Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } // The value may have changed - recompute ValueVT. ValueVT = Val.getValueType(); assert(NumParts * PartBits == ValueVT.getSizeInBits() && "Failed to tile the value with PartVT!"); if (NumParts == 1) { if (PartEVT != ValueVT) { diagnosePossiblyInvalidConstraint(*DAG.getContext(), V, "scalar-to-vector conversion failed"); Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } Parts[0] = Val; return; } // Expand the value into multiple parts. if (NumParts & (NumParts - 1)) { // The number of parts is not a power of 2. Split off and copy the tail. assert(PartVT.isInteger() && ValueVT.isInteger() && "Do not know what to expand to!"); unsigned RoundParts = 1 << Log2_32(NumParts); unsigned RoundBits = RoundParts * PartBits; unsigned OddParts = NumParts - RoundParts; SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val, DAG.getIntPtrConstant(RoundBits, DL)); getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V, CallConv); if (DAG.getDataLayout().isBigEndian()) // The odd parts were reversed by getCopyToParts - unreverse them. 
std::reverse(Parts + RoundParts, Parts + NumParts); NumParts = RoundParts; ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); } // The number of parts is a power of 2. Repeatedly bisect the value using // EXTRACT_ELEMENT. Parts[0] = DAG.getNode(ISD::BITCAST, DL, EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()), Val); for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) { for (unsigned i = 0; i < NumParts; i += StepSize) { unsigned ThisBits = StepSize * PartBits / 2; EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits); SDValue &Part0 = Parts[i]; SDValue &Part1 = Parts[i+StepSize/2]; Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, ThisVT, Part0, DAG.getIntPtrConstant(1, DL)); Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, ThisVT, Part0, DAG.getIntPtrConstant(0, DL)); if (ThisBits == PartBits && ThisVT != PartVT) { Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0); Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1); } } } if (DAG.getDataLayout().isBigEndian()) std::reverse(Parts, Parts + OrigNumParts); } /// getCopyToPartsVector - Create a series of nodes that contain the specified /// value split into legal parts. static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, const Value *V, Optional CallConv) { EVT ValueVT = Val.getValueType(); assert(ValueVT.isVector() && "Not a vector"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const bool IsABIRegCopy = CallConv.hasValue(); if (NumParts == 1) { EVT PartEVT = PartVT; if (PartEVT == ValueVT) { // Nothing to do. } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) { // Bitconvert vector->vector case. Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } else if (PartVT.isVector() && PartEVT.getVectorElementType() == ValueVT.getVectorElementType() && PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) { EVT ElementVT = PartVT.getVectorElementType(); // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in // undef elements. SmallVector Ops; for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, ElementVT, Val, DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())))); for (unsigned i = ValueVT.getVectorNumElements(), e = PartVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ElementVT)); Val = DAG.getBuildVector(PartVT, DL, Ops); // FIXME: Use CONCAT for 2x -> 4x. //SDValue UndefElts = DAG.getUNDEF(VectorTy); //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts); } else if (PartVT.isVector() && PartEVT.getVectorElementType().bitsGE( ValueVT.getVectorElementType()) && PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) { // Promoted vector extract Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } else { if (ValueVT.getVectorNumElements() == 1) { Val = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } else { assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() && "lossy conversion of vector to scalar type"); EVT IntermediateType = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); Val = DAG.getBitcast(IntermediateType, Val); Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } } assert(Val.getValueType() == PartVT && "Unexpected vector part value type"); Parts[0] = Val; return; } // Handle a multi-element vector. 
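#if 0
// Illustrative sketch (hypothetical helper): one bisection step of the
// EXTRACT_ELEMENT loop above. An i64 value splits into two i32 parts; index 1
// selects the high half and index 0 the low half (any big-endian reversal of
// the final part order happens afterwards).
static void bisectI64Sketch(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
                            SDValue &LoHalf, SDValue &HiHalf) {
  HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Val,
                       DAG.getIntPtrConstant(1, DL));   // bits [63:32]
  LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Val,
                       DAG.getIntPtrConstant(0, DL));   // bits [31:0]
}
#endif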
EVT IntermediateVT; MVT RegisterVT; unsigned NumIntermediates; unsigned NumRegs; if (IsABIRegCopy) { NumRegs = TLI.getVectorTypeBreakdownForCallingConv( *DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); } else { NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); } unsigned NumElements = ValueVT.getVectorNumElements(); assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); // Convert the vector to the appropiate type if necessary. unsigned DestVectorNoElts = NumIntermediates * (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1); EVT BuiltVectorTy = EVT::getVectorVT( *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts); if (Val.getValueType() != BuiltVectorTy) Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val); // Split the vector into intermediate operands. SmallVector Ops(NumIntermediates); for (unsigned i = 0; i != NumIntermediates; ++i) { if (IntermediateVT.isVector()) Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val, DAG.getConstant(i * (NumElements / NumIntermediates), DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); else Ops[i] = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val, DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } // Split the intermediate operands into legal parts. if (NumParts == NumIntermediates) { // If the register was not expanded, promote or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V, CallConv); } else if (NumParts > 0) { // If the intermediate type was expanded, split each the value into // legal parts. assert(NumIntermediates != 0 && "division by zero"); assert(NumParts % NumIntermediates == 0 && "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) getCopyToParts(DAG, DL, Ops[i], &Parts[i * Factor], Factor, PartVT, V, CallConv); } } RegsForValue::RegsForValue(const SmallVector ®s, MVT regvt, EVT valuevt, Optional CC) : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs), RegCount(1, regs.size()), CallConv(CC) {} RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, const DataLayout &DL, unsigned Reg, Type *Ty, Optional CC) { ComputeValueVTs(TLI, DL, Ty, ValueVTs); CallConv = CC; for (EVT ValueVT : ValueVTs) { unsigned NumRegs = isABIMangled() ? TLI.getNumRegistersForCallingConv(Context, CC.getValue(), ValueVT) : TLI.getNumRegisters(Context, ValueVT); MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv(Context, CC.getValue(), ValueVT) : TLI.getRegisterType(Context, ValueVT); for (unsigned i = 0; i != NumRegs; ++i) Regs.push_back(Reg + i); RegVTs.push_back(RegisterVT); RegCount.push_back(NumRegs); Reg += NumRegs; } } SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V) const { // A Value with type {} or [0 x %t] needs no registers. if (ValueVTs.empty()) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Assemble the legal parts into the final values. 
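#if 0
// Illustrative sketch (hypothetical helper): the target query RegsForValue
// relies on. For example, on a 32-bit target an i64 value typically reports
// two registers of type MVT::i32.
static unsigned regsForI64Sketch(const TargetLowering &TLI, LLVMContext &Ctx) {
  MVT RegisterVT = TLI.getRegisterType(Ctx, MVT::i64);  // e.g. MVT::i32
  (void)RegisterVT;
  return TLI.getNumRegisters(Ctx, MVT::i64);            // e.g. 2
}
#endif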
SmallVector Values(ValueVTs.size()); SmallVector Parts; for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { // Copy the legal parts from the registers. EVT ValueVT = ValueVTs[Value]; unsigned NumRegs = RegCount[Value]; MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv( *DAG.getContext(), CallConv.getValue(), RegVTs[Value]) : RegVTs[Value]; Parts.resize(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue P; if (!Flag) { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); } else { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); *Flag = P.getValue(2); } Chain = P.getValue(1); Parts[i] = P; // If the source register was virtual and if we know something about it, // add an assert node. if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) || !RegisterVT.isInteger() || RegisterVT.isVector()) continue; const FunctionLoweringInfo::LiveOutInfo *LOI = FuncInfo.GetLiveOutRegInfo(Regs[Part+i]); if (!LOI) continue; unsigned RegSize = RegisterVT.getSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; unsigned NumZeroBits = LOI->Known.countMinLeadingZeros(); if (NumZeroBits == RegSize) { // The current value is a zero. // Explicitly express that as it would be easier for // optimizations to kick in. Parts[i] = DAG.getConstant(0, dl, RegisterVT); continue; } // FIXME: We capture more information than the dag can represent. For // now, just use the tightest assertzext/assertsext possible. bool isSExt; EVT FromVT(MVT::Other); if (NumZeroBits) { FromVT = EVT::getIntegerVT(*DAG.getContext(), RegSize - NumZeroBits); isSExt = false; } else if (NumSignBits > 1) { FromVT = EVT::getIntegerVT(*DAG.getContext(), RegSize - NumSignBits + 1); isSExt = true; } else { continue; } // Add an assertion node. assert(FromVT != MVT::Other); Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl, RegisterVT, P, DAG.getValueType(FromVT)); } Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs, RegisterVT, ValueVT, V, CallConv); Part += NumRegs; Parts.clear(); } return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values); } void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V, ISD::NodeType PreferredExtendType) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); ISD::NodeType ExtendKind = PreferredExtendType; // Get the list of the values's legal parts. unsigned NumRegs = Regs.size(); SmallVector Parts(NumRegs); for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { unsigned NumParts = RegCount[Value]; MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv( *DAG.getContext(), CallConv.getValue(), RegVTs[Value]) : RegVTs[Value]; if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT)) ExtendKind = ISD::ZERO_EXTEND; getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part], NumParts, RegisterVT, V, CallConv, ExtendKind); Part += NumParts; } // Copy the parts into the registers. SmallVector Chains(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue Part; if (!Flag) { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); } else { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); *Flag = Part.getValue(1); } Chains[i] = Part.getValue(0); } if (NumRegs == 1 || Flag) // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is // flagged to it. That is the CopyToReg nodes and the user are considered // a single scheduling unit. 
If we create a TokenFactor and return it as // chain, then the TokenFactor is both a predecessor (operand) of the // user as well as a successor (the TF operands are flagged to the user). // c1, f1 = CopyToReg // c2, f2 = CopyToReg // c3 = TokenFactor c1, c2 // ... // = op c3, ..., f2 Chain = Chains[NumRegs-1]; else Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, std::vector &Ops) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size()); if (HasMatching) Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx); else if (!Regs.empty() && TargetRegisterInfo::isVirtualRegister(Regs.front())) { // Put the register class of the virtual registers in the flag word. That // way, later passes can recompute register class constraints for inline // assembly as well as normal instructions. // Don't do this for tied operands that can use the regclass information // from the def. const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Regs.front()); Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID()); } SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32); Ops.push_back(Res); if (Code == InlineAsm::Kind_Clobber) { // Clobbers should always have a 1:1 mapping with registers, and may // reference registers that have illegal (e.g. vector) types. Hence, we // shouldn't try to apply any sort of splitting logic to them. assert(Regs.size() == RegVTs.size() && Regs.size() == ValueVTs.size() && "No 1:1 mapping from clobbers to regs?"); unsigned SP = TLI.getStackPointerRegisterToSaveRestore(); (void)SP; for (unsigned I = 0, E = ValueVTs.size(); I != E; ++I) { Ops.push_back(DAG.getRegister(Regs[I], RegVTs[I])); assert( (Regs[I] != SP || DAG.getMachineFunction().getFrameInfo().hasOpaqueSPAdjustment()) && "If we clobbered the stack pointer, MFI should know about it."); } return; } for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) { unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]); MVT RegisterVT = RegVTs[Value]; for (unsigned i = 0; i != NumRegs; ++i) { assert(Reg < Regs.size() && "Mismatch in # registers expected"); unsigned TheReg = Regs[Reg++]; Ops.push_back(DAG.getRegister(TheReg, RegisterVT)); } } } SmallVector, 4> RegsForValue::getRegsAndSizes() const { SmallVector, 4> OutVec; unsigned I = 0; for (auto CountAndVT : zip_first(RegCount, RegVTs)) { unsigned RegCount = std::get<0>(CountAndVT); MVT RegisterVT = std::get<1>(CountAndVT); unsigned RegisterSize = RegisterVT.getSizeInBits(); for (unsigned E = I + RegCount; I != E; ++I) OutVec.push_back(std::make_pair(Regs[I], RegisterSize)); } return OutVec; } void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, const TargetLibraryInfo *li) { AA = aa; GFI = gfi; LibInfo = li; DL = &DAG.getDataLayout(); Context = DAG.getContext(); LPadToCallSiteMap.clear(); } void SelectionDAGBuilder::clear() { NodeMap.clear(); UnusedArgNodeMap.clear(); PendingLoads.clear(); PendingExports.clear(); CurInst = nullptr; HasTailCall = false; SDNodeOrder = LowestSDNodeOrder; StatepointLowering.clear(); } void SelectionDAGBuilder::clearDanglingDebugInfo() { DanglingDebugInfoMap.clear(); } SDValue SelectionDAGBuilder::getRoot() { if (PendingLoads.empty()) return DAG.getRoot(); if (PendingLoads.size() == 1) { SDValue Root = 
PendingLoads[0]; DAG.setRoot(Root); PendingLoads.clear(); return Root; } // Otherwise, we have to make a token factor node. SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, PendingLoads); PendingLoads.clear(); DAG.setRoot(Root); return Root; } SDValue SelectionDAGBuilder::getControlRoot() { SDValue Root = DAG.getRoot(); if (PendingExports.empty()) return Root; // Turn all of the CopyToReg chains into one factored node. if (Root.getOpcode() != ISD::EntryToken) { unsigned i = 0, e = PendingExports.size(); for (; i != e; ++i) { assert(PendingExports[i].getNode()->getNumOperands() > 1); if (PendingExports[i].getNode()->getOperand(0) == Root) break; // Don't add the root if we already indirectly depend on it. } if (i == e) PendingExports.push_back(Root); } Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, PendingExports); PendingExports.clear(); DAG.setRoot(Root); return Root; } void SelectionDAGBuilder::visit(const Instruction &I) { // Set up outgoing PHI node register values before emitting the terminator. if (isa(&I)) { HandlePHINodesInSuccessorBlocks(I.getParent()); } // Increase the SDNodeOrder if dealing with a non-debug instruction. if (!isa(I)) ++SDNodeOrder; CurInst = &I; visit(I.getOpcode(), I); if (auto *FPMO = dyn_cast(&I)) { // Propagate the fast-math-flags of this IR instruction to the DAG node that // maps to this instruction. // TODO: We could handle all flags (nsw, etc) here. // TODO: If an IR instruction maps to >1 node, only the final node will have // flags set. if (SDNode *Node = getNodeForIRValue(&I)) { SDNodeFlags IncomingFlags; IncomingFlags.copyFMF(*FPMO); if (!Node->getFlags().isDefined()) Node->setFlags(IncomingFlags); else Node->intersectFlagsWith(IncomingFlags); } } if (!isa(&I) && !HasTailCall && !isStatepoint(&I)) // statepoints handle their exports internally CopyToExportRegsIfNeeded(&I); CurInst = nullptr; } void SelectionDAGBuilder::visitPHI(const PHINode &) { llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!"); } void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { // Note: this doesn't use InstVisitor, because it has to work with // ConstantExpr's in addition to instructions. switch (Opcode) { default: llvm_unreachable("Unknown instruction type encountered!"); // Build the switch statement using the Instruction.def file. #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break; #include "llvm/IR/Instruction.def" } } void SelectionDAGBuilder::dropDanglingDebugInfo(const DILocalVariable *Variable, const DIExpression *Expr) { auto isMatchingDbgValue = [&](DanglingDebugInfo &DDI) { const DbgValueInst *DI = DDI.getDI(); DIVariable *DanglingVariable = DI->getVariable(); DIExpression *DanglingExpr = DI->getExpression(); if (DanglingVariable == Variable && Expr->fragmentsOverlap(DanglingExpr)) { LLVM_DEBUG(dbgs() << "Dropping dangling debug info for " << *DI << "\n"); return true; } return false; }; for (auto &DDIMI : DanglingDebugInfoMap) { DanglingDebugInfoVector &DDIV = DDIMI.second; DDIV.erase(remove_if(DDIV, isMatchingDbgValue), DDIV.end()); } } // resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V, // generate the debug data structures now that we've seen its definition. 
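#if 0
// Illustrative sketch (hypothetical helper): how getRoot() above folds the
// pending load chains into a single root. One TokenFactor makes every pending
// chain an operand of the new root so later nodes depend on all of them.
static SDValue mergePendingLoadsSketch(SelectionDAG &DAG,
                                       ArrayRef<SDValue> PendingLoads) {
  SDValue Root = DAG.getNode(ISD::TokenFactor, SDLoc(), MVT::Other,
                             PendingLoads);
  DAG.setRoot(Root);
  return Root;
}
#endif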
void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, SDValue Val) { auto DanglingDbgInfoIt = DanglingDebugInfoMap.find(V); if (DanglingDbgInfoIt == DanglingDebugInfoMap.end()) return; DanglingDebugInfoVector &DDIV = DanglingDbgInfoIt->second; for (auto &DDI : DDIV) { const DbgValueInst *DI = DDI.getDI(); assert(DI && "Ill-formed DanglingDebugInfo"); DebugLoc dl = DDI.getdl(); unsigned ValSDNodeOrder = Val.getNode()->getIROrder(); unsigned DbgSDNodeOrder = DDI.getSDNodeOrder(); DILocalVariable *Variable = DI->getVariable(); DIExpression *Expr = DI->getExpression(); assert(Variable->isValidLocationForIntrinsic(dl) && "Expected inlined-at fields to agree"); SDDbgValue *SDV; if (Val.getNode()) { if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, false, Val)) { LLVM_DEBUG(dbgs() << "Resolve dangling debug info [order=" << DbgSDNodeOrder << "] for:\n " << *DI << "\n"); LLVM_DEBUG(dbgs() << " By mapping to:\n "; Val.dump()); // Increase the SDNodeOrder for the DbgValue here to make sure it is // inserted after the definition of Val when emitting the instructions // after ISel. An alternative could be to teach // ScheduleDAGSDNodes::EmitSchedule to delay the insertion properly. LLVM_DEBUG(if (ValSDNodeOrder > DbgSDNodeOrder) dbgs() << "changing SDNodeOrder from " << DbgSDNodeOrder << " to " << ValSDNodeOrder << "\n"); SDV = getDbgValue(Val, Variable, Expr, dl, std::max(DbgSDNodeOrder, ValSDNodeOrder)); DAG.AddDbgValue(SDV, Val.getNode(), false); } else LLVM_DEBUG(dbgs() << "Resolved dangling debug info for " << *DI << "in EmitFuncArgumentDbgValue\n"); } else LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n"); } DDIV.clear(); } /// getCopyFromRegs - If there was virtual register allocated for the value V /// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise. SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) { DenseMap::iterator It = FuncInfo.ValueMap.find(V); SDValue Result; if (It != FuncInfo.ValueMap.end()) { unsigned InReg = It->second; RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), DAG.getDataLayout(), InReg, Ty, getABIRegCopyCC(V)); SDValue Chain = DAG.getEntryNode(); Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); resolveDanglingDebugInfo(V, Result); } return Result; } /// getValue - Return an SDValue for the given Value. SDValue SelectionDAGBuilder::getValue(const Value *V) { // If we already have an SDValue for this value, use it. It's important // to do this first, so that we don't create a CopyFromReg if we already // have a regular SDValue. SDValue &N = NodeMap[V]; if (N.getNode()) return N; // If there's a virtual register allocated and initialized for this // value, use it. if (SDValue copyFromReg = getCopyFromRegs(V, V->getType())) return copyFromReg; // Otherwise create a new SDValue and remember it. SDValue Val = getValueImpl(V); NodeMap[V] = Val; resolveDanglingDebugInfo(V, Val); return Val; } // Return true if SDValue exists for the given Value bool SelectionDAGBuilder::findValue(const Value *V) const { return (NodeMap.find(V) != NodeMap.end()) || (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end()); } /// getNonRegisterValue - Return an SDValue for the given Value, but /// don't look in FuncInfo.ValueMap for a virtual register. SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) { // If we already have an SDValue for this value, use it. 
  SDValue &N = NodeMap[V];
  if (N.getNode()) {
    if (isa<ConstantSDNode>(N) || isa<ConstantFPSDNode>(N)) {
      // Remove the debug location from the node as the node is about to be used
      // in a location which may differ from the original debug location.  This
      // is relevant to Constant and ConstantFP nodes because they can appear
      // as constant expressions inside PHI nodes.
      N->setDebugLoc(DebugLoc());
    }
    return N;
  }

  // Otherwise create a new SDValue and remember it.
  SDValue Val = getValueImpl(V);
  NodeMap[V] = Val;
  resolveDanglingDebugInfo(V, Val);
  return Val;
}

/// getValueImpl - Helper function for getValue and getNonRegisterValue.
/// Create an SDValue for the given value.
SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (const Constant *C = dyn_cast<Constant>(V)) {
    EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);

    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
      return DAG.getConstant(*CI, getCurSDLoc(), VT);

    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
      return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);

    if (isa<ConstantPointerNull>(C)) {
      unsigned AS = V->getType()->getPointerAddressSpace();
      return DAG.getConstant(0, getCurSDLoc(),
                             TLI.getPointerTy(DAG.getDataLayout(), AS));
    }

    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
      return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);

    if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
      return DAG.getUNDEF(VT);

    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
      visit(CE->getOpcode(), *CE);
      SDValue N1 = NodeMap[V];
      assert(N1.getNode() && "visit didn't populate the NodeMap!");
      return N1;
    }

    if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
      SmallVector<SDValue, 4> Constants;
      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
           OI != OE; ++OI) {
        SDNode *Val = getValue(*OI).getNode();
        // If the operand is an empty aggregate, there are no values.
        if (!Val) continue;
        // Add each leaf value from the operand to the Constants list
        // to form a flattened list of all the values.
        for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
          Constants.push_back(SDValue(Val, i));
      }

      return DAG.getMergeValues(Constants, getCurSDLoc());
    }

    if (const ConstantDataSequential *CDS =
          dyn_cast<ConstantDataSequential>(C)) {
      SmallVector<SDValue, 4> Ops;
      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
        SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
        // Add each leaf value from the operand to the Constants list
        // to form a flattened list of all the values.
for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i) Ops.push_back(SDValue(Val, i)); } if (isa(CDS->getType())) return DAG.getMergeValues(Ops, getCurSDLoc()); return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { assert((isa(C) || isa(C)) && "Unknown struct or array constant!"); SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs); unsigned NumElts = ValueVTs.size(); if (NumElts == 0) return SDValue(); // empty struct SmallVector Constants(NumElts); for (unsigned i = 0; i != NumElts; ++i) { EVT EltVT = ValueVTs[i]; if (isa(C)) Constants[i] = DAG.getUNDEF(EltVT); else if (EltVT.isFloatingPoint()) Constants[i] = DAG.getConstantFP(0, getCurSDLoc(), EltVT); else Constants[i] = DAG.getConstant(0, getCurSDLoc(), EltVT); } return DAG.getMergeValues(Constants, getCurSDLoc()); } if (const BlockAddress *BA = dyn_cast(C)) return DAG.getBlockAddress(BA, VT); VectorType *VecTy = cast(V->getType()); unsigned NumElements = VecTy->getNumElements(); // Now that we know the number and type of the elements, get that number of // elements into the Ops array based on what kind of constant it is. SmallVector Ops; if (const ConstantVector *CV = dyn_cast(C)) { for (unsigned i = 0; i != NumElements; ++i) Ops.push_back(getValue(CV->getOperand(i))); } else { assert(isa(C) && "Unknown vector constant!"); EVT EltVT = TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType()); SDValue Op; if (EltVT.isFloatingPoint()) Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT); else Op = DAG.getConstant(0, getCurSDLoc(), EltVT); Ops.assign(NumElements, Op); } // Create a BUILD_VECTOR node. return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } // If this is a static alloca, generate it as the frameindex instead of // computation. if (const AllocaInst *AI = dyn_cast(V)) { DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) return DAG.getFrameIndex(SI->second, TLI.getFrameIndexTy(DAG.getDataLayout())); } // If this is an instruction which fast-isel has deferred, select it now. if (const Instruction *Inst = dyn_cast(V)) { unsigned InReg = FuncInfo.InitializeRegForValue(Inst); RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg, Inst->getType(), getABIRegCopyCC(V)); SDValue Chain = DAG.getEntryNode(); return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } llvm_unreachable("Can't get register for value!"); } void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) { auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX; bool IsCoreCLR = Pers == EHPersonality::CoreCLR; bool IsSEH = isAsynchronousEHPersonality(Pers); bool IsWasmCXX = Pers == EHPersonality::Wasm_CXX; MachineBasicBlock *CatchPadMBB = FuncInfo.MBB; if (!IsSEH) CatchPadMBB->setIsEHScopeEntry(); // In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues. if (IsMSVCCXX || IsCoreCLR) CatchPadMBB->setIsEHFuncletEntry(); // Wasm does not need catchpads anymore if (!IsWasmCXX) DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other, getControlRoot())); } void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) { // Update machine-CFG edge. 
MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()]; FuncInfo.MBB->addSuccessor(TargetMBB); auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); bool IsSEH = isAsynchronousEHPersonality(Pers); if (IsSEH) { // If this is not a fall-through branch or optimizations are switched off, // emit the branch. if (TargetMBB != NextBlock(FuncInfo.MBB) || TM.getOptLevel() == CodeGenOpt::None) DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(TargetMBB))); return; } // Figure out the funclet membership for the catchret's successor. // This will be used by the FuncletLayout pass to determine how to order the // BB's. // A 'catchret' returns to the outer scope's color. Value *ParentPad = I.getCatchSwitchParentPad(); const BasicBlock *SuccessorColor; if (isa(ParentPad)) SuccessorColor = &FuncInfo.Fn->getEntryBlock(); else SuccessorColor = cast(ParentPad)->getParent(); assert(SuccessorColor && "No parent funclet for catchret!"); MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor]; assert(SuccessorColorMBB && "No MBB for SuccessorColor!"); // Create the terminator node. SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(TargetMBB), DAG.getBasicBlock(SuccessorColorMBB)); DAG.setRoot(Ret); } void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) { // Don't emit any special code for the cleanuppad instruction. It just marks // the start of an EH scope/funclet. FuncInfo.MBB->setIsEHScopeEntry(); FuncInfo.MBB->setIsEHFuncletEntry(); FuncInfo.MBB->setIsCleanupFuncletEntry(); } /// When an invoke or a cleanupret unwinds to the next EH pad, there are /// many places it could ultimately go. In the IR, we have a single unwind /// destination, but in the machine CFG, we enumerate all the possible blocks. /// This function skips over imaginary basic blocks that hold catchswitch /// instructions, and finds all the "real" machine /// basic block destinations. As those destinations may not be successors of /// EHPadBB, here we also calculate the edge probability to those destinations. /// The passed-in Prob is the edge probability to EHPadBB. static void findUnwindDestinations( FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB, BranchProbability Prob, SmallVectorImpl> &UnwindDests) { EHPersonality Personality = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX; bool IsCoreCLR = Personality == EHPersonality::CoreCLR; bool IsSEH = isAsynchronousEHPersonality(Personality); while (EHPadBB) { const Instruction *Pad = EHPadBB->getFirstNonPHI(); BasicBlock *NewEHPadBB = nullptr; if (isa(Pad)) { // Stop on landingpads. They are not funclets. UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob); break; } else if (isa(Pad)) { // Stop on cleanup pads. Cleanups are always funclet entries for all known // personalities. UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob); UnwindDests.back().first->setIsEHScopeEntry(); UnwindDests.back().first->setIsEHFuncletEntry(); break; } else if (auto *CatchSwitch = dyn_cast(Pad)) { // Add the catchpad handlers to the possible destinations. for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob); // For MSVC++ and the CLR, catchblocks are funclets and need prologues. 
if (IsMSVCCXX || IsCoreCLR) UnwindDests.back().first->setIsEHFuncletEntry(); if (!IsSEH) UnwindDests.back().first->setIsEHScopeEntry(); } NewEHPadBB = CatchSwitch->getUnwindDest(); } else { continue; } BranchProbabilityInfo *BPI = FuncInfo.BPI; if (BPI && NewEHPadBB) Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB); EHPadBB = NewEHPadBB; } } void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) { // Update successor info. SmallVector, 1> UnwindDests; auto UnwindDest = I.getUnwindDest(); BranchProbabilityInfo *BPI = FuncInfo.BPI; BranchProbability UnwindDestProb = (BPI && UnwindDest) ? BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest) : BranchProbability::getZero(); findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests); for (auto &UnwindDest : UnwindDests) { UnwindDest.first->setIsEHPad(); addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second); } FuncInfo.MBB->normalizeSuccProbs(); // Create the terminator node. SDValue Ret = DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot()); DAG.setRoot(Ret); } void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) { report_fatal_error("visitCatchSwitch not yet implemented!"); } void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); auto &DL = DAG.getDataLayout(); SDValue Chain = getControlRoot(); SmallVector Outs; SmallVector OutVals; // Calls to @llvm.experimental.deoptimize don't generate a return value, so // lower // // %val = call @llvm.experimental.deoptimize() // ret %val // // differently. if (I.getParent()->getTerminatingDeoptimizeCall()) { LowerDeoptimizingReturn(); return; } if (!FuncInfo.CanLowerReturn) { unsigned DemoteReg = FuncInfo.DemoteRegister; const Function *F = I.getParent()->getParent(); // Emit a store of the return value through the virtual register. // Leave Outs empty so that LowerReturn won't try to load return // registers the usual way. SmallVector PtrValueVTs; ComputeValueVTs(TLI, DL, F->getReturnType()->getPointerTo( DAG.getDataLayout().getAllocaAddrSpace()), PtrValueVTs); SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), DemoteReg, PtrValueVTs[0]); SDValue RetOp = getValue(I.getOperand(0)); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); SmallVector Chains(NumValues); for (unsigned i = 0; i != NumValues; ++i) { // An aggregate return value cannot wrap around the address space, so // offsets to its parts don't wrap either. SDValue Ptr = DAG.getObjectPtrOffset(getCurSDLoc(), RetPtr, Offsets[i]); Chains[i] = DAG.getStore( Chain, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), // FIXME: better loc info would be nice. 
Ptr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); } Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, Chains); } else if (I.getNumOperands() != 0) { SmallVector ValueVTs; ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues) { SDValue RetOp = getValue(I.getOperand(0)); const Function *F = I.getParent()->getParent(); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; LLVMContext &Context = F->getContext(); bool RetInReg = F->getAttributes().hasAttribute( AttributeList::ReturnIndex, Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { EVT VT = ValueVTs[j]; if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind); CallingConv::ID CC = F->getCallingConv(); unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, CC, VT); MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, CC, VT); SmallVector Parts(NumParts); getCopyToParts(DAG, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + j), &Parts[0], NumParts, PartVT, &I, CC, ExtendKind); // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); if (RetInReg) Flags.setInReg(); // Propagate extension type if any if (ExtendKind == ISD::SIGN_EXTEND) Flags.setSExt(); else if (ExtendKind == ISD::ZERO_EXTEND) Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) { Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(), VT, /*isfixed=*/true, 0, 0)); OutVals.push_back(Parts[i]); } } } } // Push in swifterror virtual register as the last element of Outs. This makes // sure swifterror virtual register will be returned in the swifterror // physical register. const Function *F = I.getParent()->getParent(); if (TLI.supportSwiftError() && F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) { assert(FuncInfo.SwiftErrorArg && "Need a swift error argument"); ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); Flags.setSwiftError(); Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/, EVT(TLI.getPointerTy(DL)) /*argvt*/, true /*isfixed*/, 1 /*origidx*/, 0 /*partOffs*/)); // Create SDNode for the swifterror virtual register. OutVals.push_back( DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVRegUseAt( &I, FuncInfo.MBB, FuncInfo.SwiftErrorArg).first, EVT(TLI.getPointerTy(DL)))); } bool isVarArg = DAG.getMachineFunction().getFunction().isVarArg(); CallingConv::ID CallConv = DAG.getMachineFunction().getFunction().getCallingConv(); Chain = DAG.getTargetLoweringInfo().LowerReturn( Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG); // Verify that the target's LowerReturn behaved as expected. assert(Chain.getNode() && Chain.getValueType() == MVT::Other && "LowerReturn didn't return a valid chain!"); // Update the DAG with the new chain value resulting from return lowering. DAG.setRoot(Chain); } /// CopyToExportRegsIfNeeded - If the given value has virtual registers /// created for it, emit nodes to copy the value into the virtual /// registers. 
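/// As an illustration only: a compare computed in this block but branched on
/// from a later block cannot be referenced directly once this block's DAG has
/// been emitted, so its value is copied into the virtual register recorded in
/// FuncInfo.ValueMap and the later block re-reads it via CopyFromReg.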
void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) { // Skip empty types if (V->getType()->isEmptyTy()) return; DenseMap::iterator VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) { assert(!V->use_empty() && "Unused value assigned virtual registers!"); CopyValueToVirtualRegister(V, VMI->second); } } /// ExportFromCurrentBlock - If this condition isn't known to be exported from /// the current basic block, add it to ValueMap now so that we'll get a /// CopyTo/FromReg. void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) { // No need to export constants. if (!isa(V) && !isa(V)) return; // Already exported? if (FuncInfo.isExportedInst(V)) return; unsigned Reg = FuncInfo.InitializeRegForValue(V); CopyValueToVirtualRegister(V, Reg); } bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB) { // The operands of the setcc have to be in this block. We don't know // how to export them from some other block. if (const Instruction *VI = dyn_cast(V)) { // Can export from current BB. if (VI->getParent() == FromBB) return true; // Is already exported, noop. return FuncInfo.isExportedInst(V); } // If this is an argument, we can export it if the BB is the entry block or // if it is already exported. if (isa(V)) { if (FromBB == &FromBB->getParent()->getEntryBlock()) return true; // Otherwise, can only export this if it is already exported. return FuncInfo.isExportedInst(V); } // Otherwise, constants can always be exported. return true; } /// Return branch probability calculated by BranchProbabilityInfo for IR blocks. BranchProbability SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { BranchProbabilityInfo *BPI = FuncInfo.BPI; const BasicBlock *SrcBB = Src->getBasicBlock(); const BasicBlock *DstBB = Dst->getBasicBlock(); if (!BPI) { // If BPI is not available, set the default probability as 1 / N, where N is // the number of successors. auto SuccSize = std::max(succ_size(SrcBB), 1); return BranchProbability(1, SuccSize); } return BPI->getEdgeProbability(SrcBB, DstBB); } void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src, MachineBasicBlock *Dst, BranchProbability Prob) { if (!FuncInfo.BPI) Src->addSuccessorWithoutProb(Dst); else { if (Prob.isUnknown()) Prob = getEdgeProbability(Src, Dst); Src->addSuccessor(Dst, Prob); } } static bool InBlock(const Value *V, const BasicBlock *BB) { if (const Instruction *I = dyn_cast(V)) return I->getParent() == BB; return true; } /// EmitBranchForMergedCondition - Helper method for FindMergedConditions. /// This function emits a branch and is used at the leaves of an OR or an /// AND operator tree. void SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, BranchProbability TProb, BranchProbability FProb, bool InvertCond) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into // the caseblock. if (const CmpInst *BOp = dyn_cast(Cond)) { // The operands of the cmp have to be in this block. We don't know // how to export them from some other block. If this is the first block // of the sequence, no exporting is needed. 
if (CurBB == SwitchBB || (isExportableFromCurrentBlock(BOp->getOperand(0), BB) && isExportableFromCurrentBlock(BOp->getOperand(1), BB))) { ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast(Cond)) { ICmpInst::Predicate Pred = InvertCond ? IC->getInversePredicate() : IC->getPredicate(); Condition = getICmpCondCode(Pred); } else { const FCmpInst *FC = cast(Cond); FCmpInst::Predicate Pred = InvertCond ? FC->getInversePredicate() : FC->getPredicate(); Condition = getFCmpCondCode(Pred); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); } CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr, TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb); SwitchCases.push_back(CB); return; } } // Create a CaseBlock record representing this branch. ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ; CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()), nullptr, TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb); SwitchCases.push_back(CB); } /// FindMergedConditions - If Cond is an expression like void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TProb, BranchProbability FProb, bool InvertCond) { // Skip over not part of the tree and remember to invert op and operands at // next level. if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) { const Value *CondOp = BinaryOperator::getNotArgument(Cond); if (InBlock(CondOp, CurBB->getBasicBlock())) { FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb, !InvertCond); return; } } const Instruction *BOp = dyn_cast(Cond); // Compute the effective opcode for Cond, taking into account whether it needs // to be inverted, e.g. // and (not (or A, B)), C // gets lowered as // and (and (not A, not B), C) unsigned BOpc = 0; if (BOp) { BOpc = BOp->getOpcode(); if (InvertCond) { if (BOpc == Instruction::And) BOpc = Instruction::Or; else if (BOpc == Instruction::Or) BOpc = Instruction::And; } } // If this node is not part of the or/and tree, emit it as a branch. if (!BOp || !(isa(BOp) || isa(BOp)) || BOpc != unsigned(Opc) || !BOp->hasOneUse() || BOp->getParent() != CurBB->getBasicBlock() || !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, TProb, FProb, InvertCond); return; } // Create TmpBB after CurBB. MachineFunction::iterator BBI(CurBB); MachineFunction &MF = DAG.getMachineFunction(); MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock()); CurBB->getParent()->insert(++BBI, TmpBB); if (Opc == Instruction::Or) { // Codegen X | Y as: // BB1: // jmp_if_X TBB // jmp TmpBB // TmpBB: // jmp_if_Y TBB // jmp FBB // // We have flexibility in setting Prob for BB1 and Prob for TmpBB. // The requirement is that // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) // = TrueProb for original BB. // Assuming the original probabilities are A and B, one choice is to set // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to // A/(1+B) and 2B/(1+B). This choice assumes that // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. // Another choice is to assume TrueProb for BB1 equals to TrueProb for // TmpBB, but the math is more complicated. auto NewTrueProb = TProb / 2; auto NewFalseProb = TProb / 2 + FProb; // Emit the LHS condition. 
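    // Worked example with concrete numbers (illustrative, not part of the
    // original comment): if the original block had A = 1/2 and B = 1/2, then
    // BB1 gets TrueProb = A/2 = 1/4 and FalseProb = A/2 + B = 3/4, and TmpBB
    // gets the normalized pair {1/4, 1/2} -> {1/3, 2/3}. The invariant holds:
    //   TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB)
    //     = 1/4 + 3/4 * 1/3 = 1/2 = TrueProb of the original block.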
FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, NewTrueProb, NewFalseProb, InvertCond); // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). SmallVector Probs{TProb / 2, FProb}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, Probs[0], Probs[1], InvertCond); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: // BB1: // jmp_if_X TmpBB // jmp FBB // TmpBB: // jmp_if_Y TBB // jmp FBB // // This requires creation of TmpBB after CurBB. // We have flexibility in setting Prob for BB1 and Prob for TmpBB. // The requirement is that // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) // = FalseProb for original BB. // Assuming the original probabilities are A and B, one choice is to set // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 == // TrueProb for BB1 * FalseProb for TmpBB. auto NewTrueProb = TProb + FProb / 2; auto NewFalseProb = FProb / 2; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, NewTrueProb, NewFalseProb, InvertCond); // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). SmallVector Probs{TProb, FProb / 2}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, Probs[0], Probs[1], InvertCond); } } /// If the set of cases should be emitted as a series of branches, return true. /// If we should emit this as a bunch of and/or'd together conditions, return /// false. bool SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector &Cases) { if (Cases.size() != 2) return true; // If this is two comparisons of the same values or'd or and'd together, they // will get folded into a single comparison, so don't emit two blocks. if ((Cases[0].CmpLHS == Cases[1].CmpLHS && Cases[0].CmpRHS == Cases[1].CmpRHS) || (Cases[0].CmpRHS == Cases[1].CmpLHS && Cases[0].CmpLHS == Cases[1].CmpRHS)) { return false; } // Handle: (X != null) | (Y != null) --> (X|Y) != 0 // Handle: (X == null) & (Y == null) --> (X|Y) == 0 if (Cases[0].CmpRHS == Cases[1].CmpRHS && Cases[0].CC == Cases[1].CC && isa(Cases[0].CmpRHS) && cast(Cases[0].CmpRHS)->isNullValue()) { if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB) return false; if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB) return false; } return true; } void SelectionDAGBuilder::visitBr(const BranchInst &I) { MachineBasicBlock *BrMBB = FuncInfo.MBB; // Update machine-CFG edges. MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)]; if (I.isUnconditional()) { // Update machine-CFG edges. BrMBB->addSuccessor(Succ0MBB); // If this is not a fall-through branch or optimizations are switched off, // emit the branch. if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None) DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(Succ0MBB))); return; } // If this condition is one of the special cases we handle, do special stuff // now. const Value *CondVal = I.getCondition(); MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)]; // If this is a series of conditions that are or'd or and'd together, emit // this as a sequence of branches instead of setcc's with and/or operations. 
// As long as jumps are not expensive, this should improve performance. // For example, instead of something like: // cmp A, B // C = seteq // cmp D, E // F = setle // or C, F // jnz foo // Emit: // cmp A, B // je foo // cmp D, E // jle foo if (const BinaryOperator *BOp = dyn_cast(CondVal)) { Instruction::BinaryOps Opcode = BOp->getOpcode(); if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() && !I.getMetadata(LLVMContext::MD_unpredictable) && (Opcode == Instruction::And || Opcode == Instruction::Or)) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode, getEdgeProbability(BrMBB, Succ0MBB), getEdgeProbability(BrMBB, Succ1MBB), /*InvertCond=*/false); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. assert(SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!"); // Allow some cases to be rejected. if (ShouldEmitAsBranches(SwitchCases)) { for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) { ExportFromCurrentBlock(SwitchCases[i].CmpLHS); ExportFromCurrentBlock(SwitchCases[i].CmpRHS); } // Emit the branch for this block. visitSwitchCase(SwitchCases[0], BrMBB); SwitchCases.erase(SwitchCases.begin()); return; } // Okay, we decided not to do this, remove any inserted MBB's and clear // SwitchCases. for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) FuncInfo.MF->erase(SwitchCases[i].ThisBB); SwitchCases.clear(); } } // Create a CaseBlock record representing this branch. CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()), nullptr, Succ0MBB, Succ1MBB, BrMBB, getCurSDLoc()); // Use visitSwitchCase to actually insert the fast branch sequence for this // cond branch. visitSwitchCase(CB, BrMBB); } /// visitSwitchCase - Emits the necessary code to represent a single node in /// the binary search tree resulting from lowering a switch instruction. void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, MachineBasicBlock *SwitchBB) { SDValue Cond; SDValue CondLHS = getValue(CB.CmpLHS); SDLoc dl = CB.DL; // Build the setcc now. if (!CB.CmpMHS) { // Fold "(X == true)" to X and "(X == false)" to !X to // handle common cases produced by branch lowering. if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) && CB.CC == ISD::SETEQ) Cond = CondLHS; else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) && CB.CC == ISD::SETEQ) { SDValue True = DAG.getConstant(1, dl, CondLHS.getValueType()); Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True); } else Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC); } else { assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now"); const APInt& Low = cast(CB.CmpLHS)->getValue(); const APInt& High = cast(CB.CmpRHS)->getValue(); SDValue CmpOp = getValue(CB.CmpMHS); EVT VT = CmpOp.getValueType(); if (cast(CB.CmpLHS)->isMinValue(true)) { Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, dl, VT), ISD::SETLE); } else { SDValue SUB = DAG.getNode(ISD::SUB, dl, VT, CmpOp, DAG.getConstant(Low, dl, VT)); Cond = DAG.getSetCC(dl, MVT::i1, SUB, DAG.getConstant(High-Low, dl, VT), ISD::SETULE); } } // Update successor info addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb); // TrueBB and FalseBB are always different unless the incoming IR is // degenerate. This only happens when running llc on weird IR. 
if (CB.TrueBB != CB.FalseBB) addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb); SwitchBB->normalizeSuccProbs(); // If the lhs block is the next block, invert the condition so that we can // fall through to the lhs instead of the rhs block. if (CB.TrueBB == NextBlock(SwitchBB)) { std::swap(CB.TrueBB, CB.FalseBB); SDValue True = DAG.getConstant(1, dl, Cond.getValueType()); Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True); } SDValue BrCond = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(), Cond, DAG.getBasicBlock(CB.TrueBB)); // Insert the false branch. Do this even if it's a fall through branch, // this makes it easier to do DAG optimizations which require inverting // the branch condition. BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, DAG.getBasicBlock(CB.FalseBB)); DAG.setRoot(BrCond); } /// visitJumpTable - Emit JumpTable node in the current MBB void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) { // Emit the code for the jump table assert(JT.Reg != -1U && "Should lower JT Header first!"); EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(), JT.Reg, PTy); SDValue Table = DAG.getJumpTable(JT.JTI, PTy); SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurSDLoc(), MVT::Other, Index.getValue(1), Table, Index); DAG.setRoot(BrJumpTable); } /// visitJumpTableHeader - This function emits necessary code to produce index /// in the JumpTable from switch case. void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH, MachineBasicBlock *SwitchBB) { SDLoc dl = getCurSDLoc(); // Subtract the lowest switch case value from the value being switched on and // conditional branch to default mbb if the result is greater than the // difference between smallest and largest cases. SDValue SwitchOp = getValue(JTH.SValue); EVT VT = SwitchOp.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp, DAG.getConstant(JTH.First, dl, VT)); // The SDNode we just created, which holds the value being switched on minus // the smallest case value, needs to be copied to a virtual register so it // can be used as an index into the jump table in a subsequent basic block. // This value may be smaller or larger than the target's pointer type, and // therefore require extension or truncating. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout())); unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout())); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; // Emit the range check for the jump table, and branch to the default block // for the switch statement if the value being switched on exceeds the largest // case in the switch. SDValue CMP = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), Sub.getValueType()), Sub, DAG.getConstant(JTH.Last - JTH.First, dl, VT), ISD::SETUGT); SDValue BrCond = DAG.getNode(ISD::BRCOND, dl, MVT::Other, CopyTo, CMP, DAG.getBasicBlock(JT.Default)); // Avoid emitting unnecessary branches to the next block. if (JT.MBB != NextBlock(SwitchBB)) BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, DAG.getBasicBlock(JT.MBB)); DAG.setRoot(BrCond); } /// Create a LOAD_STACK_GUARD node, and let it carry the target specific global /// variable if there exists one. 
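/// The machine memory operand attached below is marked
/// MOLoad | MOInvariant | MODereferenceable, so later machine passes may treat
/// the stack-guard read as an invariant, always-dereferenceable load (for
/// example, it can safely be CSE'd or hoisted).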
static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); MachineFunction &MF = DAG.getMachineFunction(); Value *Global = TLI.getSDagStackGuard(*MF.getFunction().getParent()); MachineSDNode *Node = DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain); if (Global) { MachinePointerInfo MPInfo(Global); MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable; *MemRefs = MF.getMachineMemOperand(MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlignment(PtrTy)); Node->setMemRefs(MemRefs, MemRefs + 1); } return SDValue(Node, 0); } /// Codegen a new tail for a stack protector check ParentMBB which has had its /// tail spliced into a stack protector check success bb. /// /// For a high level explanation of how this fits into the stack protector /// generation see the comment on the declaration of class /// StackProtectorDescriptor. void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineBasicBlock *ParentBB) { // First create the loads to the guard/stack slot for the comparison. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo(); int FI = MFI.getStackProtectorIndex(); SDValue Guard; SDLoc dl = getCurSDLoc(); SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy); const Module &M = *ParentBB->getParent()->getFunction().getParent(); unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext())); // Generate code to load the content of the guard slot. SDValue GuardVal = DAG.getLoad( PtrTy, dl, DAG.getEntryNode(), StackSlotPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align, MachineMemOperand::MOVolatile); if (TLI.useStackGuardXorFP()) GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl); // Retrieve guard check function, nullptr if instrumentation is inlined. if (const Value *GuardCheck = TLI.getSSPStackGuardCheck(M)) { // The target provides a guard check function to validate the guard value. // Generate a call to that function with the content of the guard slot as // argument. auto *Fn = cast(GuardCheck); FunctionType *FnTy = Fn->getFunctionType(); assert(FnTy->getNumParams() == 1 && "Invalid function signature"); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = GuardVal; Entry.Ty = FnTy->getParamType(0); if (Fn->hasAttribute(1, Attribute::AttrKind::InReg)) Entry.IsInReg = true; Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()) .setChain(DAG.getEntryNode()) .setCallee(Fn->getCallingConv(), FnTy->getReturnType(), getValue(GuardCheck), std::move(Args)); std::pair Result = TLI.LowerCallTo(CLI); DAG.setRoot(Result.second); return; } // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. SDValue Chain = DAG.getEntryNode(); if (TLI.useLoadStackGuardNode()) { Guard = getLoadStackGuard(DAG, dl, Chain); } else { const Value *IRGuard = TLI.getSDagStackGuard(M); SDValue GuardPtr = getValue(IRGuard); Guard = DAG.getLoad(PtrTy, dl, Chain, GuardPtr, MachinePointerInfo(IRGuard, 0), Align, MachineMemOperand::MOVolatile); } // Perform the comparison via a subtract/getsetcc. 
EVT VT = Guard.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, GuardVal); SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), Sub.getValueType()), Sub, DAG.getConstant(0, dl, VT), ISD::SETNE); // If the sub is not 0, then we know the guard/stackslot do not equal, so // branch to failure MBB. SDValue BrCond = DAG.getNode(ISD::BRCOND, dl, MVT::Other, GuardVal.getOperand(0), Cmp, DAG.getBasicBlock(SPD.getFailureMBB())); // Otherwise branch to success MBB. SDValue Br = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, DAG.getBasicBlock(SPD.getSuccessMBB())); DAG.setRoot(Br); } /// Codegen the failure basic block for a stack protector check. /// /// A failure stack protector machine basic block consists simply of a call to /// __stack_chk_fail(). /// /// For a high level explanation of how this fits into the stack protector /// generation see the comment on the declaration of class /// StackProtectorDescriptor. void SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, None, false, getCurSDLoc(), false, false).second; DAG.setRoot(Chain); } /// visitBitTestHeader - This function emits necessary code to produce value /// suitable for "bit tests" void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB) { SDLoc dl = getCurSDLoc(); // Subtract the minimum value SDValue SwitchOp = getValue(B.SValue); EVT VT = SwitchOp.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp, DAG.getConstant(B.First, dl, VT)); // Check range const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue RangeCmp = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), Sub.getValueType()), Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT); // Determine the type of the test operands. bool UsePtrType = false; if (!TLI.isTypeLegal(VT)) UsePtrType = true; else { for (unsigned i = 0, e = B.Cases.size(); i != e; ++i) if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) { // Switch table case range are encoded into series of masks. // Just use pointer type, it's guaranteed to fit. UsePtrType = true; break; } } if (UsePtrType) { VT = TLI.getPointerTy(DAG.getDataLayout()); Sub = DAG.getZExtOrTrunc(Sub, dl, VT); } B.RegVT = VT.getSimpleVT(); B.Reg = FuncInfo.CreateReg(B.RegVT); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, B.Reg, Sub); MachineBasicBlock* MBB = B.Cases[0].ThisBB; addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); addSuccessorWithProb(SwitchBB, MBB, B.Prob); SwitchBB->normalizeSuccProbs(); SDValue BrRange = DAG.getNode(ISD::BRCOND, dl, MVT::Other, CopyTo, RangeCmp, DAG.getBasicBlock(B.Default)); // Avoid emitting unnecessary branches to the next block. 
if (MBB != NextBlock(SwitchBB)) BrRange = DAG.getNode(ISD::BR, dl, MVT::Other, BrRange, DAG.getBasicBlock(MBB)); DAG.setRoot(BrRange); } /// visitBitTestCase - this function produces one "bit test" void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, MachineBasicBlock* NextMBB, BranchProbability BranchProbToNext, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { SDLoc dl = getCurSDLoc(); MVT VT = BB.RegVT; SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), dl, Reg, VT); SDValue Cmp; unsigned PopCount = countPopulation(B.Mask); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (PopCount == 1) { // Testing for a single bit; just compare the shift count with what it // would need to be to shift a 1 bit in that position. Cmp = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), ShiftOp, DAG.getConstant(countTrailingZeros(B.Mask), dl, VT), ISD::SETEQ); } else if (PopCount == BB.Range) { // There is only one zero bit in the range, test for it directly. Cmp = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), ShiftOp, DAG.getConstant(countTrailingOnes(B.Mask), dl, VT), ISD::SETNE); } else { // Make desired shift SDValue SwitchVal = DAG.getNode(ISD::SHL, dl, VT, DAG.getConstant(1, dl, VT), ShiftOp); // Emit bit tests and jumps SDValue AndOp = DAG.getNode(ISD::AND, dl, VT, SwitchVal, DAG.getConstant(B.Mask, dl, VT)); Cmp = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), AndOp, DAG.getConstant(0, dl, VT), ISD::SETNE); } // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb. addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb); // The branch probability from SwitchBB to NextMBB is BranchProbToNext. addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext); // It is not guaranteed that the sum of B.ExtraProb and BranchProbToNext is // one as they are relative probabilities (and thus work more like weights), // and hence we need to normalize them to let the sum of them become one. SwitchBB->normalizeSuccProbs(); SDValue BrAnd = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(), Cmp, DAG.getBasicBlock(B.TargetBB)); // Avoid emitting unnecessary branches to the next block. if (NextMBB != NextBlock(SwitchBB)) BrAnd = DAG.getNode(ISD::BR, dl, MVT::Other, BrAnd, DAG.getBasicBlock(NextMBB)); DAG.setRoot(BrAnd); } void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { MachineBasicBlock *InvokeMBB = FuncInfo.MBB; // Retrieve successors. Look through artificial IR level blocks like // catchswitch for successors. MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; const BasicBlock *EHPadBB = I.getSuccessor(1); // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't // have to do anything here to lower funclet bundles. assert(!I.hasOperandBundlesOtherThan( {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) && "Cannot lower invokes with arbitrary operand bundles yet!"); const Value *Callee(I.getCalledValue()); const Function *Fn = dyn_cast(Callee); if (isa(Callee)) visitInlineAsm(&I); else if (Fn && Fn->isIntrinsic()) { switch (Fn->getIntrinsicID()) { default: llvm_unreachable("Cannot invoke this intrinsic"); case Intrinsic::donothing: // Ignore invokes to @llvm.donothing: jump directly to the next BB. 
break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: visitPatchpoint(&I, EHPadBB); break; case Intrinsic::experimental_gc_statepoint: LowerStatepoint(ImmutableStatepoint(&I), EHPadBB); break; } } else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) { // Currently we do not lower any intrinsic calls with deopt operand bundles. // Eventually we will support lowering the @llvm.experimental.deoptimize // intrinsic, and right now there are no plans to support other intrinsics // with deopt state. LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB); } else { LowerCallTo(&I, getValue(Callee), false, EHPadBB); } // If the value of the invoke is used outside of its defining block, make it // available as a virtual register. // We already took care of the exported value for the statepoint instruction // during call to the LowerStatepoint. if (!isStatepoint(I)) { CopyToExportRegsIfNeeded(&I); } SmallVector, 1> UnwindDests; BranchProbabilityInfo *BPI = FuncInfo.BPI; BranchProbability EHPadBBProb = BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB) : BranchProbability::getZero(); findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests); // Update successor info. addSuccessorWithProb(InvokeMBB, Return); for (auto &UnwindDest : UnwindDests) { UnwindDest.first->setIsEHPad(); addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second); } InvokeMBB->normalizeSuccProbs(); // Drop into normal successor. DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(Return))); } void SelectionDAGBuilder::visitResume(const ResumeInst &RI) { llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!"); } void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { assert(FuncInfo.MBB->isEHPad() && "Call to landingpad not in landing pad!"); MachineBasicBlock *MBB = FuncInfo.MBB; addLandingPadInfo(LP, *MBB); // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother to create these DAG nodes. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn(); if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 && TLI.getExceptionSelectorRegister(PersonalityFn) == 0) return; // If landingpad's return type is token type, we don't create DAG nodes // for its exception pointer and selector value. The extraction of exception // pointer or selector value from token type landingpads is not currently // supported. if (LP.getType()->isTokenTy()) return; SmallVector ValueVTs; SDLoc dl = getCurSDLoc(); ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs); assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported"); // Get the two live-in registers as SDValues. The physregs have already been // copied into virtual registers. SDValue Ops[2]; if (FuncInfo.ExceptionPointerVirtReg) { Ops[0] = DAG.getZExtOrTrunc( DAG.getCopyFromReg(DAG.getEntryNode(), dl, FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy(DAG.getDataLayout())), dl, ValueVTs[0]); } else { Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout())); } Ops[1] = DAG.getZExtOrTrunc( DAG.getCopyFromReg(DAG.getEntryNode(), dl, FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy(DAG.getDataLayout())), dl, ValueVTs[1]); // Merge into one. 
SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Ops); setValue(&LP, Res); } void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) { #ifndef NDEBUG for (const CaseCluster &CC : Clusters) assert(CC.Low == CC.High && "Input clusters must be single-case"); #endif llvm::sort(Clusters.begin(), Clusters.end(), [](const CaseCluster &a, const CaseCluster &b) { return a.Low->getValue().slt(b.Low->getValue()); }); // Merge adjacent clusters with the same destination. const unsigned N = Clusters.size(); unsigned DstIndex = 0; for (unsigned SrcIndex = 0; SrcIndex < N; ++SrcIndex) { CaseCluster &CC = Clusters[SrcIndex]; const ConstantInt *CaseVal = CC.Low; MachineBasicBlock *Succ = CC.MBB; if (DstIndex != 0 && Clusters[DstIndex - 1].MBB == Succ && (CaseVal->getValue() - Clusters[DstIndex - 1].High->getValue()) == 1) { // If this case has the same successor and is a neighbour, merge it into // the previous cluster. Clusters[DstIndex - 1].High = CaseVal; Clusters[DstIndex - 1].Prob += CC.Prob; } else { std::memmove(&Clusters[DstIndex++], &Clusters[SrcIndex], sizeof(Clusters[SrcIndex])); } } Clusters.resize(DstIndex); } void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last) { // Update JTCases. for (unsigned i = 0, e = JTCases.size(); i != e; ++i) if (JTCases[i].first.HeaderBB == First) JTCases[i].first.HeaderBB = Last; // Update BitTestCases. for (unsigned i = 0, e = BitTestCases.size(); i != e; ++i) if (BitTestCases[i].Parent == First) BitTestCases[i].Parent = Last; } void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. SmallSet Done; for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { BasicBlock *BB = I.getSuccessor(i); bool Inserted = Done.insert(BB).second; if (!Inserted) continue; MachineBasicBlock *Succ = FuncInfo.MBBMap[BB]; addSuccessorWithProb(IndirectBrMBB, Succ); } IndirectBrMBB->normalizeSuccProbs(); DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(), MVT::Other, getControlRoot(), getValue(I.getAddress()))); } void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { if (!DAG.getTarget().Options.TrapUnreachable) return; // We may be able to ignore unreachable behind a noreturn call. if (DAG.getTarget().Options.NoTrapAfterNoreturn) { const BasicBlock &BB = *I.getParent(); if (&I != &BB.front()) { BasicBlock::const_iterator PredI = std::prev(BasicBlock::const_iterator(&I)); if (const CallInst *Call = dyn_cast(&*PredI)) { if (Call->doesNotReturn()) return; } } } DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } void SelectionDAGBuilder::visitFSub(const User &I) { // -0.0 - X --> fneg Type *Ty = I.getType(); if (isa(I.getOperand(0)) && I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) { SDValue Op2 = getValue(I.getOperand(1)); setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(), Op2.getValueType(), Op2)); return; } visitBinary(I, ISD::FSUB); } /// Checks if the given instruction performs a vector reduction, in which case /// we have the freedom to alter the elements in the result as long as the /// reduction of them stays unchanged. 
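/// A typical shape this recognizes (assumed example, a <4 x i32> add
/// reduction; not taken from this change):
///
///   %rdx1 = shufflevector <4 x i32> %v, <4 x i32> undef,
///                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
///   %sum1 = add <4 x i32> %v, %rdx1
///   %rdx2 = shufflevector <4 x i32> %sum1, <4 x i32> undef,
///                         <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
///   %sum2 = add <4 x i32> %sum1, %rdx2
///   %res  = extractelement <4 x i32> %sum2, i32 0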
static bool isVectorReductionOp(const User *I) { const Instruction *Inst = dyn_cast(I); if (!Inst || !Inst->getType()->isVectorTy()) return false; auto OpCode = Inst->getOpcode(); switch (OpCode) { case Instruction::Add: case Instruction::Mul: case Instruction::And: case Instruction::Or: case Instruction::Xor: break; case Instruction::FAdd: case Instruction::FMul: if (const FPMathOperator *FPOp = dyn_cast(Inst)) if (FPOp->getFastMathFlags().isFast()) break; LLVM_FALLTHROUGH; default: return false; } unsigned ElemNum = Inst->getType()->getVectorNumElements(); // Ensure the reduction size is a power of 2. if (!isPowerOf2_32(ElemNum)) return false; unsigned ElemNumToReduce = ElemNum; // Do DFS search on the def-use chain from the given instruction. We only // allow four kinds of operations during the search until we reach the // instruction that extracts the first element from the vector: // // 1. The reduction operation of the same opcode as the given instruction. // // 2. PHI node. // // 3. ShuffleVector instruction together with a reduction operation that // does a partial reduction. // // 4. ExtractElement that extracts the first element from the vector, and we // stop searching the def-use chain here. // // 3 & 4 above perform a reduction on all elements of the vector. We push defs // from 1-3 to the stack to continue the DFS. The given instruction is not // a reduction operation if we meet any other instructions other than those // listed above. SmallVector UsersToVisit{Inst}; SmallPtrSet Visited; bool ReduxExtracted = false; while (!UsersToVisit.empty()) { auto User = UsersToVisit.back(); UsersToVisit.pop_back(); if (!Visited.insert(User).second) continue; for (const auto &U : User->users()) { auto Inst = dyn_cast(U); if (!Inst) return false; if (Inst->getOpcode() == OpCode || isa(U)) { if (const FPMathOperator *FPOp = dyn_cast(Inst)) if (!isa(FPOp) && !FPOp->getFastMathFlags().isFast()) return false; UsersToVisit.push_back(U); } else if (const ShuffleVectorInst *ShufInst = dyn_cast(U)) { // Detect the following pattern: A ShuffleVector instruction together // with a reduction that do partial reduction on the first and second // ElemNumToReduce / 2 elements, and store the result in // ElemNumToReduce / 2 elements in another vector. unsigned ResultElements = ShufInst->getType()->getVectorNumElements(); if (ResultElements < ElemNum) return false; if (ElemNumToReduce == 1) return false; if (!isa(U->getOperand(1))) return false; for (unsigned i = 0; i < ElemNumToReduce / 2; ++i) if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2)) return false; for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i) if (ShufInst->getMaskValue(i) != -1) return false; // There is only one user of this ShuffleVector instruction, which // must be a reduction operation. if (!U->hasOneUse()) return false; auto U2 = dyn_cast(*U->user_begin()); if (!U2 || U2->getOpcode() != OpCode) return false; // Check operands of the reduction operation. if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) || (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) { UsersToVisit.push_back(U2); ElemNumToReduce /= 2; } else return false; } else if (isa(U)) { // At this moment we should have reduced all elements in the vector. 
        if (ElemNumToReduce != 1)
          return false;

        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
        if (!Val || !Val->isZero())
          return false;

        ReduxExtracted = true;
      } else
        return false;
    }
  }
  return ReduxExtracted;
}

void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
  SDNodeFlags Flags;
  if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
    Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
    Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
  }
  if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
    Flags.setExact(ExactOp->isExact());
  }
  if (isVectorReductionOp(&I)) {
    Flags.setVectorReduction(true);
    LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
  }

  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));
  SDValue BinNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(),
                                     Op1, Op2, Flags);
  setValue(&I, BinNodeValue);
}

void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));

  EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
      Op2.getValueType(), DAG.getDataLayout());

  // Coerce the shift amount to the right type if we can.
  if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
    unsigned ShiftSize = ShiftTy.getSizeInBits();
    unsigned Op2Size = Op2.getValueSizeInBits();
    SDLoc DL = getCurSDLoc();

    // If the operand is smaller than the shift count type, promote it.
    if (ShiftSize > Op2Size)
      Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2);

    // If the operand is larger than the shift count type but the shift
    // count type has enough bits to represent any shift value, truncate
    // it now. This is a common case and it exposes the truncate to
    // optimization early.
    else if (ShiftSize >= Log2_32_Ceil(Op2.getValueSizeInBits()))
      Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
    // Otherwise we'll need to temporarily settle for some other convenient
    // type.  Type legalization will make adjustments once the shiftee is split.
    else
      Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
  }

  bool nuw = false;
  bool nsw = false;
  bool exact = false;

  if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {

    if (const OverflowingBinaryOperator *OFBinOp =
            dyn_cast<OverflowingBinaryOperator>(&I)) {
      nuw = OFBinOp->hasNoUnsignedWrap();
      nsw = OFBinOp->hasNoSignedWrap();
    }
    if (const PossiblyExactOperator *ExactOp =
            dyn_cast<PossiblyExactOperator>(&I))
      exact = ExactOp->isExact();
  }
  SDNodeFlags Flags;
  Flags.setExact(exact);
  Flags.setNoSignedWrap(nsw);
  Flags.setNoUnsignedWrap(nuw);
  SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
                            Flags);
  setValue(&I, Res);
}

void SelectionDAGBuilder::visitSDiv(const User &I) {
  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));

  SDNodeFlags Flags;
  Flags.setExact(isa<PossiblyExactOperator>(&I) &&
                 cast<PossiblyExactOperator>(&I)->isExact());
  setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
                           Op2, Flags));
}

void SelectionDAGBuilder::visitICmp(const User &I) {
  ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
  if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
    predicate = IC->getPredicate();
  else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
    predicate = ICmpInst::Predicate(IC->getPredicate());
  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));
  ISD::CondCode Opcode = getICmpCondCode(predicate);

  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
                                                        I.getType());
  setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
}

void SelectionDAGBuilder::visitFCmp(const User &I) {
  FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
  if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
    predicate = FC->getPredicate();
  else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
    predicate = FCmpInst::Predicate(FC->getPredicate());

  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));

  ISD::CondCode Condition = getFCmpCondCode(predicate);
  auto *FPMO = dyn_cast<FPMathOperator>(&I);
  if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
    Condition = getFCmpCodeWithoutNaN(Condition);

  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
                                                        I.getType());
  setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
}

// Check if the condition of the select has one use or two users that are both
// selects with the same condition.
static bool hasOnlySelectUsers(const Value *Cond) {
  return llvm::all_of(Cond->users(), [](const Value *V) {
    return isa<SelectInst>(V);
  });
}

void SelectionDAGBuilder::visitSelect(const User &I) {
  SmallVector<EVT, 4> ValueVTs;
  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
                  ValueVTs);
  unsigned NumValues = ValueVTs.size();
  if (NumValues == 0) return;

  SmallVector<SDValue, 4> Values(NumValues);
  SDValue Cond     = getValue(I.getOperand(0));
  SDValue LHSVal   = getValue(I.getOperand(1));
  SDValue RHSVal   = getValue(I.getOperand(2));
  auto BaseOps = {Cond};
  ISD::NodeType OpCode = Cond.getValueType().isVector() ?
    ISD::VSELECT : ISD::SELECT;

  // Min/max matching is only viable if all output VTs are the same.
  if (std::equal(ValueVTs.begin(), ValueVTs.end(), ValueVTs.begin())) {
    EVT VT = ValueVTs[0];
    LLVMContext &Ctx = *DAG.getContext();
    auto &TLI = DAG.getTargetLoweringInfo();

    // We care about the legality of the operation after it has been type
    // legalized.
    while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal &&
           VT != TLI.getTypeToTransformTo(Ctx, VT))
      VT = TLI.getTypeToTransformTo(Ctx, VT);

    // If the vselect is legal, assume we want to leave this as a vector setcc +
    // vselect.
Otherwise, if this is going to be scalarized, we want to see if // min/max is legal on the scalar type. bool UseScalarMinMax = VT.isVector() && !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT); Value *LHS, *RHS; auto SPR = matchSelectPattern(const_cast(&I), LHS, RHS); ISD::NodeType Opc = ISD::DELETED_NODE; switch (SPR.Flavor) { case SPF_UMAX: Opc = ISD::UMAX; break; case SPF_UMIN: Opc = ISD::UMIN; break; case SPF_SMAX: Opc = ISD::SMAX; break; case SPF_SMIN: Opc = ISD::SMIN; break; case SPF_FMINNUM: switch (SPR.NaNBehavior) { case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); case SPNB_RETURNS_NAN: Opc = ISD::FMINNAN; break; case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break; case SPNB_RETURNS_ANY: { if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT)) Opc = ISD::FMINNUM; else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)) Opc = ISD::FMINNAN; else if (UseScalarMinMax) Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ? ISD::FMINNUM : ISD::FMINNAN; break; } } break; case SPF_FMAXNUM: switch (SPR.NaNBehavior) { case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); case SPNB_RETURNS_NAN: Opc = ISD::FMAXNAN; break; case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break; case SPNB_RETURNS_ANY: if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT)) Opc = ISD::FMAXNUM; else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)) Opc = ISD::FMAXNAN; else if (UseScalarMinMax) Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ? ISD::FMAXNUM : ISD::FMAXNAN; break; } break; default: break; } if (Opc != ISD::DELETED_NODE && (TLI.isOperationLegalOrCustom(Opc, VT) || (UseScalarMinMax && TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) && // If the underlying comparison instruction is used by any other // instruction, the consumed instructions won't be destroyed, so it is // not profitable to convert to a min/max. hasOnlySelectUsers(cast(I).getCondition())) { OpCode = Opc; LHSVal = getValue(LHS); RHSVal = getValue(RHS); BaseOps = {}; } } for (unsigned i = 0; i != NumValues; ++i) { SmallVector Ops(BaseOps.begin(), BaseOps.end()); Ops.push_back(SDValue(LHSVal.getNode(), LHSVal.getResNo() + i)); Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i)); Values[i] = DAG.getNode(OpCode, getCurSDLoc(), LHSVal.getNode()->getValueType(LHSVal.getResNo()+i), Ops); } setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(ValueVTs), Values)); } void SelectionDAGBuilder::visitTrunc(const User &I) { // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest). SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitZExt(const User &I) { // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest). // ZExt also can't be a cast to bool for same reason. So, nothing much to do SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitSExt(const User &I) { // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest). // SExt also can't be a cast to bool for same reason. 
So, nothing much to do SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); SDLoc dl = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N, DAG.getTargetConstant( 0, dl, TLI.getPointerTy(DAG.getDataLayout())))); } void SelectionDAGBuilder::visitFPExt(const User &I) { // FPExt is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPToUI(const User &I) { // FPToUI is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPToSI(const User &I) { // FPToSI is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitUIToFP(const User &I) { // UIToFP is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitSIToFP(const User &I) { // SIToFP is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT)); } void SelectionDAGBuilder::visitIntToPtr(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT)); } void SelectionDAGBuilder::visitBitCast(const User &I) { SDValue N = getValue(I.getOperand(0)); SDLoc dl = getCurSDLoc(); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); // BitCast assures us that source and destination are the same size so this is // either a BITCAST or a no-op. if (DestVT != N.getValueType()) setValue(&I, DAG.getNode(ISD::BITCAST, dl, DestVT, N)); // convert types. 
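// [Illustrative summary; not part of the original source. The helper below is
//  hypothetical and only condenses the cast visitors in this region.]
// Each simple cast visitor maps its IR instruction onto exactly one ISD
// opcode (FP_ROUND additionally carries a trailing "lossiness" operand, which
// visitFPTrunc above supplies as a target constant 0):
static ISD::NodeType getSimpleCastOpcodeSketch(unsigned IROpcode) {
  switch (IROpcode) {
  case Instruction::Trunc:   return ISD::TRUNCATE;
  case Instruction::ZExt:    return ISD::ZERO_EXTEND;
  case Instruction::SExt:    return ISD::SIGN_EXTEND;
  case Instruction::FPTrunc: return ISD::FP_ROUND;
  case Instruction::FPExt:   return ISD::FP_EXTEND;
  case Instruction::FPToUI:  return ISD::FP_TO_UINT;
  case Instruction::FPToSI:  return ISD::FP_TO_SINT;
  case Instruction::UIToFP:  return ISD::UINT_TO_FP;
  case Instruction::SIToFP:  return ISD::SINT_TO_FP;
  default:                   return ISD::DELETED_NODE; // ptr/bit casts differ
  }
}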
// Check if the original LLVM IR Operand was a ConstantInt, because getValue() // might fold any kind of constant expression to an integer constant and that // is not what we are looking for. Only recognize a bitcast of a genuine // constant integer as an opaque constant. else if(ConstantInt *C = dyn_cast(I.getOperand(0))) setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false, /*isOpaque*/true)); else setValue(&I, N); // noop cast. } void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const Value *SV = I.getOperand(0); SDValue N = getValue(SV); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); unsigned SrcAS = SV->getType()->getPointerAddressSpace(); unsigned DestAS = I.getType()->getPointerAddressSpace(); if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS); setValue(&I, N); } void SelectionDAGBuilder::visitInsertElement(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue InVec = getValue(I.getOperand(0)); SDValue InVal = getValue(I.getOperand(1)); SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(), TLI.getVectorIdxTy(DAG.getDataLayout())); setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(), TLI.getValueType(DAG.getDataLayout(), I.getType()), InVec, InVal, InIdx)); } void SelectionDAGBuilder::visitExtractElement(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue InVec = getValue(I.getOperand(0)); SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(), TLI.getVectorIdxTy(DAG.getDataLayout())); setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(), TLI.getValueType(DAG.getDataLayout(), I.getType()), InVec, InIdx)); } void SelectionDAGBuilder::visitShuffleVector(const User &I) { SDValue Src1 = getValue(I.getOperand(0)); SDValue Src2 = getValue(I.getOperand(1)); SDLoc DL = getCurSDLoc(); SmallVector Mask; ShuffleVectorInst::getShuffleMask(cast(I.getOperand(2)), Mask); unsigned MaskNumElts = Mask.size(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); EVT SrcVT = Src1.getValueType(); unsigned SrcNumElts = SrcVT.getVectorNumElements(); if (SrcNumElts == MaskNumElts) { setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask)); return; } // Normalize the shuffle vector since mask and vector length don't match. if (SrcNumElts < MaskNumElts) { // Mask is longer than the source vectors. We can use concatenate vector to // make the mask and vectors lengths match. if (MaskNumElts % SrcNumElts == 0) { // Mask length is a multiple of the source vector length. // Check if the shuffle is some kind of concatenation of the input // vectors. unsigned NumConcat = MaskNumElts / SrcNumElts; bool IsConcat = true; SmallVector ConcatSrcs(NumConcat, -1); for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; if (Idx < 0) continue; // Ensure the indices in each SrcVT sized piece are sequential and that // the same source is used for the whole piece. if ((Idx % SrcNumElts != (i % SrcNumElts)) || (ConcatSrcs[i / SrcNumElts] >= 0 && ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) { IsConcat = false; break; } // Remember which source this index came from. ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts; } // The shuffle is concatenating multiple vectors together. Just emit // a CONCAT_VECTORS operation. 
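// [Illustrative sketch; not part of the original source. Standalone condensed
//  form of the concat check above, with a worked example.]
// For <4 x i32> sources, the 8-wide mask <0,1,2,3,4,5,6,7> passes: each
// 4-element piece of the mask selects one whole source in order, so the
// shuffle is exactly CONCAT_VECTORS(Src1, Src2). A piece that permutes
// within itself or mixes sources, e.g. <0,1,2,3,7,6,5,4>, fails.
static bool isConcatMaskSketch(ArrayRef<int> Mask, unsigned SrcNumElts) {
  // Caller guarantees Mask.size() is a multiple of SrcNumElts.
  SmallVector<int, 8> PieceSrc(Mask.size() / SrcNumElts, -1);
  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
    int Idx = Mask[i];
    if (Idx < 0)
      continue; // undef lane, matches anything
    if (unsigned(Idx) % SrcNumElts != i % SrcNumElts)
      return false; // not sequential within this piece
    int &Slot = PieceSrc[i / SrcNumElts];
    if (Slot >= 0 && Slot != int(Idx / SrcNumElts))
      return false; // piece draws from two different sources
    Slot = Idx / SrcNumElts;
  }
  return true;
}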
if (IsConcat) { SmallVector ConcatOps; for (auto Src : ConcatSrcs) { if (Src < 0) ConcatOps.push_back(DAG.getUNDEF(SrcVT)); else if (Src == 0) ConcatOps.push_back(Src1); else ConcatOps.push_back(Src2); } setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps)); return; } } unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts); unsigned NumConcat = PaddedMaskNumElts / SrcNumElts; EVT PaddedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), PaddedMaskNumElts); // Pad both vectors with undefs to make them the same length as the mask. SDValue UndefVal = DAG.getUNDEF(SrcVT); SmallVector MOps1(NumConcat, UndefVal); SmallVector MOps2(NumConcat, UndefVal); MOps1[0] = Src1; MOps2[0] = Src2; Src1 = Src1.isUndef() ? DAG.getUNDEF(PaddedVT) : DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps1); Src2 = Src2.isUndef() ? DAG.getUNDEF(PaddedVT) : DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps2); // Readjust mask for new input vector length. SmallVector MappedOps(PaddedMaskNumElts, -1); for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts - PaddedMaskNumElts; MappedOps[i] = Idx; } SDValue Result = DAG.getVectorShuffle(PaddedVT, DL, Src1, Src2, MappedOps); // If the concatenated vector was padded, extract a subvector with the // correct number of elements. if (MaskNumElts != PaddedMaskNumElts) Result = DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, VT, Result, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); setValue(&I, Result); return; } if (SrcNumElts > MaskNumElts) { // Analyze the access pattern of the vector to see if we can extract // two subvectors and do the shuffle. int StartIdx[2] = { -1, -1 }; // StartIdx to extract from bool CanExtract = true; for (int Idx : Mask) { unsigned Input = 0; if (Idx < 0) continue; if (Idx >= (int)SrcNumElts) { Input = 1; Idx -= SrcNumElts; } // If all the indices come from the same MaskNumElts sized portion of // the sources we can use extract. Also make sure the extract wouldn't // extract past the end of the source. int NewStartIdx = alignDown(Idx, MaskNumElts); if (NewStartIdx + MaskNumElts > SrcNumElts || (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx)) CanExtract = false; // Make sure we always update StartIdx as we use it to track if all // elements are undef. StartIdx[Input] = NewStartIdx; } if (StartIdx[0] < 0 && StartIdx[1] < 0) { setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used. return; } if (CanExtract) { // Extract appropriate subvector and generate a vector shuffle for (unsigned Input = 0; Input < 2; ++Input) { SDValue &Src = Input == 0 ? Src1 : Src2; if (StartIdx[Input] < 0) Src = DAG.getUNDEF(VT); else { Src = DAG.getNode( ISD::EXTRACT_SUBVECTOR, DL, VT, Src, DAG.getConstant(StartIdx[Input], DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); } } // Calculate new mask. SmallVector MappedOps(Mask.begin(), Mask.end()); for (int &Idx : MappedOps) { if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; else if (Idx >= 0) Idx -= StartIdx[0]; } setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps)); return; } } // We can't use either concat vectors or extract subvectors so fall back to // replacing the shuffle with extract and build vector. // to insert and build vector. EVT EltVT = VT.getVectorElementType(); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); SmallVector Ops; for (int Idx : Mask) { SDValue Res; if (Idx < 0) { Res = DAG.getUNDEF(EltVT); } else { SDValue &Src = Idx < (int)SrcNumElts ? 
Src1 : Src2; if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts; Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src, DAG.getConstant(Idx, DL, IdxVT)); } Ops.push_back(Res); } setValue(&I, DAG.getBuildVector(VT, DL, Ops)); } void SelectionDAGBuilder::visitInsertValue(const User &I) { ArrayRef Indices; if (const InsertValueInst *IV = dyn_cast(&I)) Indices = IV->getIndices(); else Indices = cast(&I)->getIndices(); const Value *Op0 = I.getOperand(0); const Value *Op1 = I.getOperand(1); Type *AggTy = I.getType(); Type *ValTy = Op1->getType(); bool IntoUndef = isa(Op0); bool FromUndef = isa(Op1); unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector AggValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs); SmallVector ValValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs); unsigned NumAggValues = AggValueVTs.size(); unsigned NumValValues = ValValueVTs.size(); SmallVector Values(NumAggValues); // Ignore an insertvalue that produces an empty object if (!NumAggValues) { setValue(&I, DAG.getUNDEF(MVT(MVT::Other))); return; } SDValue Agg = getValue(Op0); unsigned i = 0; // Copy the beginning value(s) from the original aggregate. for (; i != LinearIndex; ++i) Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) : SDValue(Agg.getNode(), Agg.getResNo() + i); // Copy values from the inserted value(s). if (NumValValues) { SDValue Val = getValue(Op1); for (; i != LinearIndex + NumValValues; ++i) Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) : SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex); } // Copy remaining value(s) from the original aggregate. for (; i != NumAggValues; ++i) Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) : SDValue(Agg.getNode(), Agg.getResNo() + i); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(AggValueVTs), Values)); } void SelectionDAGBuilder::visitExtractValue(const User &I) { ArrayRef Indices; if (const ExtractValueInst *EV = dyn_cast(&I)) Indices = EV->getIndices(); else Indices = cast(&I)->getIndices(); const Value *Op0 = I.getOperand(0); Type *AggTy = Op0->getType(); Type *ValTy = I.getType(); bool OutOfUndef = isa(Op0); unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector ValValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs); unsigned NumValValues = ValValueVTs.size(); // Ignore a extractvalue that produces an empty object if (!NumValValues) { setValue(&I, DAG.getUNDEF(MVT(MVT::Other))); return; } SmallVector Values(NumValValues); SDValue Agg = getValue(Op0); // Copy out the selected value(s). for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i) Values[i - LinearIndex] = OutOfUndef ? DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) : SDValue(Agg.getNode(), Agg.getResNo() + i); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), DAG.getVTList(ValValueVTs), Values)); } void SelectionDAGBuilder::visitGetElementPtr(const User &I) { Value *Op0 = I.getOperand(0); // Note that the pointer operand may be a vector of pointers. Take the scalar // element which holds a pointer. unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace(); SDValue N = getValue(Op0); SDLoc dl = getCurSDLoc(); // Normalize Vector GEP - all scalar operands should be converted to the // splat vector. unsigned VectorWidth = I.getType()->isVectorTy() ? 
cast(I.getType())->getVectorNumElements() : 0; if (VectorWidth && !N.getValueType().isVector()) { LLVMContext &Context = *DAG.getContext(); EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth); N = DAG.getSplatBuildVector(VT, dl, N); } for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I); GTI != E; ++GTI) { const Value *Idx = GTI.getOperand(); if (StructType *StTy = GTI.getStructTypeOrNull()) { unsigned Field = cast(Idx)->getUniqueInteger().getZExtValue(); if (Field) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. SDNodeFlags Flags; if (int64_t(Offset) >= 0 && cast(I).isInBounds()) Flags.setNoUnsignedWrap(true); N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, DAG.getConstant(Offset, dl, N.getValueType()), Flags); } } else { unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS); MVT IdxTy = MVT::getIntegerVT(IdxSize); APInt ElementSize(IdxSize, DL->getTypeAllocSize(GTI.getIndexedType())); // If this is a scalar constant or a splat vector of constants, // handle it quickly. const auto *CI = dyn_cast(Idx); if (!CI && isa(Idx) && cast(Idx)->getSplatValue()) CI = cast(cast(Idx)->getSplatValue()); if (CI) { if (CI->isZero()) continue; APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize); LLVMContext &Context = *DAG.getContext(); SDValue OffsVal = VectorWidth ? DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) : DAG.getConstant(Offs, dl, IdxTy); // In an inbouds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. SDNodeFlags Flags; if (Offs.isNonNegative() && cast(I).isInBounds()) Flags.setNoUnsignedWrap(true); N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags); continue; } // N = N + Idx * ElementSize; SDValue IdxN = getValue(Idx); if (!IdxN.getValueType().isVector() && VectorWidth) { EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth); IdxN = DAG.getSplatBuildVector(VT, dl, IdxN); } // If the index is smaller or larger than intptr_t, truncate or extend // it. IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType()); // If this is a multiply by a power of two, turn it into a shl // immediately. This is a very common case. if (ElementSize != 1) { if (ElementSize.isPowerOf2()) { unsigned Amt = ElementSize.logBase2(); IdxN = DAG.getNode(ISD::SHL, dl, N.getValueType(), IdxN, DAG.getConstant(Amt, dl, IdxN.getValueType())); } else { SDValue Scale = DAG.getConstant(ElementSize, dl, IdxN.getValueType()); IdxN = DAG.getNode(ISD::MUL, dl, N.getValueType(), IdxN, Scale); } } N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, IdxN); } } setValue(&I, N); } void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { // If this is a fixed sized alloca in the entry block of the function, // allocate it statically on the stack. if (FuncInfo.StaticAllocaMap.count(&I)) return; // getValue will auto-populate this. 
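// [Illustrative example; not part of the original source. The helper is a
//  hypothetical plain-C++ stand-in for the APInt arithmetic above, and the
//  struct offset assumes the usual data layout.]
// The GEP lowering above turns, for example,
//   %a = getelementptr inbounds i32, i32* %p, i64 %i
// into N = ADD(%p, SHL(%i, 2)) because the element size 4 is a power of two,
// and a constant struct index such as
//   %f = getelementptr { i32, i64 }, { i32, i64 }* %s, i64 0, i32 1
// into N = ADD(%s, 8) using the StructLayout field offset.
static uint64_t scaledIndexOffsetSketch(uint64_t ElementSize, uint64_t Index) {
  if (ElementSize != 0 && (ElementSize & (ElementSize - 1)) == 0)
    return Index << Log2_64(ElementSize); // shift instead of multiply
  return Index * ElementSize;
}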
SDLoc dl = getCurSDLoc(); Type *Ty = I.getAllocatedType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); auto &DL = DAG.getDataLayout(); uint64_t TySize = DL.getTypeAllocSize(Ty); unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment()); SDValue AllocSize = getValue(I.getArraySize()); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout(), DL.getAllocaAddrSpace()); if (AllocSize.getValueType() != IntPtr) AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr); AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, DAG.getConstant(TySize, dl, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to // the stack alignment, we note this in the DYNAMIC_STACKALLOC node. unsigned StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlignment(); if (Align <= StackAlign) Align = 0; // Round the size of the allocation up to the stack alignment size // by add SA-1 to the size. This doesn't overflow because we're computing // an address inside an alloca. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, DAG.getConstant(StackAlign - 1, dl, IntPtr), Flags); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize, DAG.getConstant(~(uint64_t)(StackAlign - 1), dl, IntPtr)); SDValue Ops[] = {getRoot(), AllocSize, DAG.getConstant(Align, dl, IntPtr)}; SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other); SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops); setValue(&I, DSA); DAG.setRoot(DSA.getValue(1)); assert(FuncInfo.MF->getFrameInfo().hasVarSizedObjects()); } void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (I.isAtomic()) return visitAtomicLoad(I); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const Value *SV = I.getOperand(0); if (TLI.supportSwiftError()) { // Swifterror values can come from either a function parameter with // swifterror attribute or an alloca with swifterror attribute. if (const Argument *Arg = dyn_cast(SV)) { if (Arg->hasSwiftErrorAttr()) return visitLoadFromSwiftError(I); } if (const AllocaInst *Alloca = dyn_cast(SV)) { if (Alloca->isSwiftError()) return visitLoadFromSwiftError(I); } } SDValue Ptr = getValue(SV); Type *Ty = I.getType(); bool isVolatile = I.isVolatile(); bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr; bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr; bool isDereferenceable = isDereferenceablePointer(SV, DAG.getDataLayout()); unsigned Alignment = I.getAlignment(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; SDValue Root; bool ConstantMemory = false; if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); else if (AA && AA->pointsToConstantMemory(MemoryLocation( SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; } else { // Do not serialize non-volatile loads against each other. 
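// [Illustrative example; not part of the original source; the helper below is
//  a plain-C++ restatement of the rounding done in visitAlloca above.]
// With StackAlign == 16, a dynamic allocation of 40 bytes becomes
//   (40 + 15) & ~15 == 48,
// i.e. add StackAlign - 1, then mask off the low bits:
static uint64_t roundUpToStackAlignSketch(uint64_t Size, uint64_t StackAlign) {
  // StackAlign is a power of two, so the mask clears exactly the low bits.
  return (Size + StackAlign - 1) & ~(StackAlign - 1);
}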
Root = DAG.getRoot(); } SDLoc dl = getCurSDLoc(); if (isVolatile) Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG); // An aggregate load cannot wrap around the address space, so offsets to its // parts don't wrap either. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); SmallVector Values(NumValues); SmallVector Chains(std::min(MaxParallelChains, NumValues)); EVT PtrVT = Ptr.getValueType(); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // Serializing loads here may result in excessive register pressure, and // TokenFactor places arbitrary choke points on the scheduler. SD scheduling // could recover a bit by hoisting nodes upward in the chain by recognizing // they are side-effect free or do not alias. The optimizer should really // avoid this case by converting large object/array copies to llvm.memcpy // (MaxParallelChains should always remain as failsafe). if (ChainI == MaxParallelChains) { assert(PendingLoads.empty() && "PendingLoads must be serialized first"); SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, makeArrayRef(Chains.data(), ChainI)); Root = Chain; ChainI = 0; } SDValue A = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, DAG.getConstant(Offsets[i], dl, PtrVT), Flags); auto MMOFlags = MachineMemOperand::MONone; if (isVolatile) MMOFlags |= MachineMemOperand::MOVolatile; if (isNonTemporal) MMOFlags |= MachineMemOperand::MONonTemporal; if (isInvariant) MMOFlags |= MachineMemOperand::MOInvariant; if (isDereferenceable) MMOFlags |= MachineMemOperand::MODereferenceable; MMOFlags |= TLI.getMMOFlags(I); SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A, MachinePointerInfo(SV, Offsets[i]), Alignment, MMOFlags, AAInfo, Ranges); Values[i] = L; Chains[ChainI] = L.getValue(1); } if (!ConstantMemory) { SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, makeArrayRef(Chains.data(), ChainI)); if (isVolatile) DAG.setRoot(Chain); else PendingLoads.push_back(Chain); } setValue(&I, DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values)); } void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { assert(DAG.getTargetLoweringInfo().supportSwiftError() && "call visitStoreToSwiftError when backend supports swifterror"); SmallVector ValueVTs; SmallVector Offsets; const Value *SrcV = I.getOperand(0); ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), SrcV->getType(), ValueVTs, &Offsets); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); SDValue Src = getValue(SrcV); // Create a virtual register, then update the virtual register. unsigned VReg; bool CreatedVReg; std::tie(VReg, CreatedVReg) = FuncInfo.getOrCreateSwiftErrorVRegDefAt(&I); // Chain, DL, Reg, N or Chain, DL, Reg, N, Glue // Chain can be getRoot or getControlRoot. 
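// [Illustrative note; not part of the original source.]
// swifterror slots never become real memory operations: the CopyToReg below
// writes the stored value into a dedicated virtual register, and the matching
// visitLoadFromSwiftError reads it back with CopyFromReg, so IR such as
//   store %swift.error* %e, %swift.error** %err    ; %err is swifterror
// produces no store node in the DAG at all.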
SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg, SDValue(Src.getNode(), Src.getResNo())); DAG.setRoot(CopyNode); if (CreatedVReg) FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg); } void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { assert(DAG.getTargetLoweringInfo().supportSwiftError() && "call visitLoadFromSwiftError when backend supports swifterror"); assert(!I.isVolatile() && I.getMetadata(LLVMContext::MD_nontemporal) == nullptr && I.getMetadata(LLVMContext::MD_invariant_load) == nullptr && "Support volatile, non temporal, invariant for load_from_swift_error"); const Value *SV = I.getOperand(0); Type *Ty = I.getType(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); assert((!AA || !AA->pointsToConstantMemory(MemoryLocation( SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) && "load_from_swift_error should not be constant memory"); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty, ValueVTs, &Offsets); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); // Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT SDValue L = DAG.getCopyFromReg( getRoot(), getCurSDLoc(), FuncInfo.getOrCreateSwiftErrorVRegUseAt(&I, FuncInfo.MBB, SV).first, ValueVTs[0]); setValue(&I, L); } void SelectionDAGBuilder::visitStore(const StoreInst &I) { if (I.isAtomic()) return visitAtomicStore(I); const Value *SrcV = I.getOperand(0); const Value *PtrV = I.getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.supportSwiftError()) { // Swifterror values can come from either a function parameter with // swifterror attribute or an alloca with swifterror attribute. if (const Argument *Arg = dyn_cast(PtrV)) { if (Arg->hasSwiftErrorAttr()) return visitStoreToSwiftError(I); } if (const AllocaInst *Alloca = dyn_cast(PtrV)) { if (Alloca->isSwiftError()) return visitStoreToSwiftError(I); } } SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), SrcV->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; // Get the lowered operands. Note that we do this after // checking if NumResults is zero, because with zero results // the operands won't have values in the map. SDValue Src = getValue(SrcV); SDValue Ptr = getValue(PtrV); SDValue Root = getRoot(); SmallVector Chains(std::min(MaxParallelChains, NumValues)); SDLoc dl = getCurSDLoc(); EVT PtrVT = Ptr.getValueType(); unsigned Alignment = I.getAlignment(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); auto MMOFlags = MachineMemOperand::MONone; if (I.isVolatile()) MMOFlags |= MachineMemOperand::MOVolatile; if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr) MMOFlags |= MachineMemOperand::MONonTemporal; MMOFlags |= TLI.getMMOFlags(I); // An aggregate load cannot wrap around the address space, so offsets to its // parts don't wrap either. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // See visitLoad comments. 
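// [Illustrative example; not part of the original source; offsets assume the
//  usual data layout.]
// ComputeValueVTs above splits a first-class aggregate into its leaf value
// types plus byte offsets; storing a { i32, double } value, for instance,
// yields ValueVTs = { i32, f64 } and Offsets = { 0, 8 }, and one store per
// member is then emitted at Ptr + Offsets[i].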
if (ChainI == MaxParallelChains) { SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, makeArrayRef(Chains.data(), ChainI)); Root = Chain; ChainI = 0; } SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, DAG.getConstant(Offsets[i], dl, PtrVT), Flags); SDValue St = DAG.getStore( Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo); Chains[ChainI] = St; } SDValue StoreNode = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, makeArrayRef(Chains.data(), ChainI)); DAG.setRoot(StoreNode); } void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, bool IsCompressing) { SDLoc sdl = getCurSDLoc(); auto getMaskedStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0, unsigned& Alignment) { // llvm.masked.store.*(Src0, Ptr, alignment, Mask) Src0 = I.getArgOperand(0); Ptr = I.getArgOperand(1); Alignment = cast(I.getArgOperand(2))->getZExtValue(); Mask = I.getArgOperand(3); }; auto getCompressingStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0, unsigned& Alignment) { // llvm.masked.compressstore.*(Src0, Ptr, Mask) Src0 = I.getArgOperand(0); Ptr = I.getArgOperand(1); Mask = I.getArgOperand(2); Alignment = 0; }; Value *PtrOperand, *MaskOperand, *Src0Operand; unsigned Alignment; if (IsCompressing) getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment); else getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); SDValue Mask = getValue(MaskOperand); EVT VT = Src0.getValueType(); if (!Alignment) Alignment = DAG.getEVTAlignment(VT); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, VT.getStoreSize(), Alignment, AAInfo); SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT, MMO, false /* Truncating */, IsCompressing); DAG.setRoot(StoreNode); setValue(&I, StoreNode); } // Get a uniform base for the Gather/Scatter intrinsic. // The first argument of the Gather/Scatter intrinsic is a vector of pointers. // We try to represent it as a base pointer + vector of indices. // Usually, the vector of pointers comes from a 'getelementptr' instruction. // The first operand of the GEP may be a single pointer or a vector of pointers // Example: // %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind // or // %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind // %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, .. // // When the first GEP operand is a single pointer - it is the uniform base we // are looking for. If first operand of the GEP is a splat vector - we // extract the splat value and use it as a uniform base. // In all other cases the function returns 'false'. static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index, SDValue &Scale, SelectionDAGBuilder* SDB) { SelectionDAG& DAG = SDB->DAG; LLVMContext &Context = *DAG.getContext(); assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type"); const GetElementPtrInst *GEP = dyn_cast(Ptr); if (!GEP) return false; const Value *GEPPtr = GEP->getPointerOperand(); if (!GEPPtr->getType()->isVectorTy()) Ptr = GEPPtr; else if (!(Ptr = getSplatValue(GEPPtr))) return false; unsigned FinalIndex = GEP->getNumOperands() - 1; Value *IndexVal = GEP->getOperand(FinalIndex); // Ensure all the other indices are 0. 
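// [Illustrative reference semantics; not part of the original source. The
//  helpers are hypothetical plain-C++ restatements of the intrinsics that
//  visitMaskedStore above lowers, shown for an int element type.]
static void maskedStoreReferenceSketch(int *Ptr, const int *Val,
                                       const bool *Mask, unsigned NumElts) {
  for (unsigned i = 0; i != NumElts; ++i)
    if (Mask[i])
      Ptr[i] = Val[i]; // disabled lanes leave memory untouched
}
static void compressStoreReferenceSketch(int *Ptr, const int *Val,
                                         const bool *Mask, unsigned NumElts) {
  unsigned Out = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Mask[i])
      Ptr[Out++] = Val[i]; // enabled lanes are packed contiguously
}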
for (unsigned i = 1; i < FinalIndex; ++i) { auto *C = dyn_cast(GEP->getOperand(i)); if (!C || !C->isZero()) return false; } // The operands of the GEP may be defined in another basic block. // In this case we'll not find nodes for the operands. if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal)) return false; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); Scale = DAG.getTargetConstant(DL.getTypeAllocSize(GEP->getResultElementType()), SDB->getCurSDLoc(), TLI.getPointerTy(DL)); Base = SDB->getValue(Ptr); Index = SDB->getValue(IndexVal); if (!Index.getValueType().isVector()) { unsigned GEPWidth = GEP->getType()->getVectorNumElements(); EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth); Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index); } return true; } void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask) const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); unsigned Alignment = (cast(I.getArgOperand(2)))->getZExtValue(); if (!Alignment) Alignment = DAG.getEVTAlignment(VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); SDValue Base; SDValue Index; SDValue Scale; const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this); const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(MemOpBasePtr), MachineMemOperand::MOStore, VT.getStoreSize(), Alignment, AAInfo); if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale }; SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl, Ops, MMO); DAG.setRoot(Scatter); setValue(&I, Scatter); } void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { SDLoc sdl = getCurSDLoc(); auto getMaskedLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0, unsigned& Alignment) { // @llvm.masked.load.*(Ptr, alignment, Mask, Src0) Ptr = I.getArgOperand(0); Alignment = cast(I.getArgOperand(1))->getZExtValue(); Mask = I.getArgOperand(2); Src0 = I.getArgOperand(3); }; auto getExpandingLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0, unsigned& Alignment) { // @llvm.masked.expandload.*(Ptr, Mask, Src0) Ptr = I.getArgOperand(0); Alignment = 0; Mask = I.getArgOperand(1); Src0 = I.getArgOperand(2); }; Value *PtrOperand, *MaskOperand, *Src0Operand; unsigned Alignment; if (IsExpanding) getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment); else getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); SDValue Mask = getValue(MaskOperand); EVT VT = Src0.getValueType(); if (!Alignment) Alignment = DAG.getEVTAlignment(VT); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation( PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo)); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, VT.getStoreSize(), Alignment, AAInfo, Ranges); SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO, ISD::NON_EXTLOAD, IsExpanding); if (AddToChain) PendingLoads.push_back(Load.getValue(1)); setValue(&I, Load); } void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) const Value *Ptr = I.getArgOperand(0); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); unsigned Alignment = (cast(I.getArgOperand(1)))->getZExtValue(); if (!Alignment) Alignment = DAG.getEVTAlignment(VT); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; SDValue Scale; const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this); bool ConstantMemory = false; if (UniformBase && AA && AA->pointsToConstantMemory(MemoryLocation( BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; } MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr), MachineMemOperand::MOLoad, VT.getStoreSize(), Alignment, AAInfo, Ranges); if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale }; SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, Ops, MMO); SDValue OutChain = Gather.getValue(1); if (!ConstantMemory) PendingLoads.push_back(OutChain); setValue(&I, Gather); } void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering SuccessOrder = I.getSuccessOrdering(); AtomicOrdering FailureOrder = I.getFailureOrdering(); SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType(); SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other); SDValue L = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain, getValue(I.getPointerOperand()), getValue(I.getCompareOperand()), getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()), /*Alignment=*/ 0, SuccessOrder, FailureOrder, SSID); SDValue OutChain = L.getValue(2); setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) { SDLoc dl = getCurSDLoc(); ISD::NodeType NT; switch (I.getOperation()) { default: llvm_unreachable("Unknown atomicrmw operation"); case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break; case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break; case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break; case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break; case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break; case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break; case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break; case AtomicRMWInst::Max: NT = 
ISD::ATOMIC_LOAD_MAX; break; case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break; case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break; case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break; } AtomicOrdering Order = I.getOrdering(); SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); SDValue L = DAG.getAtomic(NT, dl, getValue(I.getValOperand()).getSimpleValueType(), InChain, getValue(I.getPointerOperand()), getValue(I.getValOperand()), I.getPointerOperand(), /* Alignment=*/ 0, Order, SSID); SDValue OutChain = L.getValue(1); setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitFence(const FenceInst &I) { SDLoc dl = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Ops[3]; Ops[0] = getRoot(); Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl, TLI.getFenceOperandTy(DAG.getDataLayout())); Ops[2] = DAG.getConstant(I.getSyncScopeID(), dl, TLI.getFenceOperandTy(DAG.getDataLayout())); DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering Order = I.getOrdering(); SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); if (!TLI.supportsUnalignedAtomics() && I.getAlignment() < VT.getStoreSize()) report_fatal_error("Cannot generate unaligned atomic load"); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, VT.getStoreSize(), I.getAlignment() ? I.getAlignment() : DAG.getEVTAlignment(VT), AAMDNodes(), nullptr, SSID, Order); InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG); SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain, getValue(I.getPointerOperand()), MMO); SDValue OutChain = L.getValue(1); setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering Order = I.getOrdering(); SyncScope::ID SSID = I.getSyncScopeID(); SDValue InChain = getRoot(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getValueOperand()->getType()); if (I.getAlignment() < VT.getStoreSize()) report_fatal_error("Cannot generate unaligned atomic store"); SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT, InChain, getValue(I.getPointerOperand()), getValue(I.getValueOperand()), I.getPointerOperand(), I.getAlignment(), Order, SSID); DAG.setRoot(OutChain); } /// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC /// node. void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic) { // Ignore the callsite's attributes. A specific call site may be marked with // readnone, but the lowering code will expect the chain based on the // definition. const Function *F = I.getCalledFunction(); bool HasChain = !F->doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory(); // Build the operand list. SmallVector Ops; if (HasChain) { // If this intrinsic has side-effects, chainify it. if (OnlyLoad) { // We don't need to serialize loads against other loads. 
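// [Illustrative reference semantics; not part of the original source. The
//  helper is a hypothetical, non-atomic restatement of the cmpxchg lowered
//  above; the real ATOMIC_CMP_SWAP_WITH_SUCCESS performs this as one atomic
//  step and yields both the loaded value and an i1 success flag.]
static std::pair<int, bool> cmpxchgReferenceSketch(int *Ptr, int Cmp, int New) {
  int Loaded = *Ptr;
  bool Success = (Loaded == Cmp);
  if (Success)
    *Ptr = New;
  return {Loaded, Success};
}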
Ops.push_back(DAG.getRoot()); } else { Ops.push_back(getRoot()); } } // Info is set by getTgtMemInstrinsic TargetLowering::IntrinsicInfo Info; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic); // Add the intrinsic ID as an integer operand if it's not a target intrinsic. if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || Info.opc == ISD::INTRINSIC_W_CHAIN) Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); // Add all operands of the call to the operand list. for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { SDValue Op = getValue(I.getArgOperand(i)); Ops.push_back(Op); } SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); SDVTList VTs = DAG.getVTList(ValueVTs); // Create the node. SDValue Result; if (IsTgtIntrinsic) { // This is target intrinsic that touches memory Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, MachinePointerInfo(Info.ptrVal, Info.offset), Info.align, Info.flags, Info.size); } else if (!HasChain) { Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); } else if (!I.getType()->isVoidTy()) { Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); } if (HasChain) { SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1); if (OnlyLoad) PendingLoads.push_back(Chain); else DAG.setRoot(Chain); } if (!I.getType()->isVoidTy()) { if (VectorType *PTy = dyn_cast(I.getType())) { EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy); Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result); } else Result = lowerRangeToAssertZExt(DAG, I, Result); setValue(&I, Result); } } /// GetSignificand - Get the significand and build it into a floating-point /// number with exponent of 1: /// /// Op = (Op & 0x007fffff) | 0x3f800000; /// /// where Op is the hexadecimal representation of floating point value. static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) { SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op, DAG.getConstant(0x007fffff, dl, MVT::i32)); SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1, DAG.getConstant(0x3f800000, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, t2); } /// GetExponent - Get the exponent: /// /// (float)(int)(((Op & 0x7f800000) >> 23) - 127); /// /// where Op is the hexadecimal representation of floating point value. static SDValue GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI, const SDLoc &dl) { SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op, DAG.getConstant(0x7f800000, dl, MVT::i32)); SDValue t1 = DAG.getNode( ISD::SRL, dl, MVT::i32, t0, DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout()))); SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1, DAG.getConstant(127, dl, MVT::i32)); return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2); } /// getF32Constant - Get 32-bit floating point constant. static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt, const SDLoc &dl) { return DAG.getConstantFP(APFloat(APFloat::IEEEsingle(), APInt(32, Flt)), dl, MVT::f32); } static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl, SelectionDAG &DAG) { // TODO: What fast-math-flags should be set on the floating-point nodes? 
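// [Illustrative example; not part of the original source. The helper is a
//  plain-C++ restatement of the exponent extraction above.]
// GetSignificand and GetExponent above pick apart the IEEE-754 single layout
// (1 sign, 8 exponent, 23 mantissa bits). For Op == 6.0f (0x40C00000):
//   exponent    = ((0x40C00000 & 0x7f800000) >> 23) - 127 = 129 - 127 = 2
//   significand = (0x40C00000 & 0x007fffff) | 0x3f800000  = 0x3FC00000 = 1.5f
// so 6.0 is decomposed as 1.5 * 2^2.
static int ieee754ExponentSketch(uint32_t Bits) {
  return int((Bits & 0x7f800000u) >> 23) - 127;
}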
// IntegerPartOfX = ((int32_t)(t0); SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0); // FractionalPartOfX = t0 - (float)IntegerPartOfX; SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX); SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1); // IntegerPartOfX <<= 23; IntegerPartOfX = DAG.getNode( ISD::SHL, dl, MVT::i32, IntegerPartOfX, DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy( DAG.getDataLayout()))); SDValue TwoToFractionalPartOfX; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // TwoToFractionalPartOfX = // 0.997535578f + // (0.735607626f + 0.252464424f * x) * x; // // error 0.0144103317, which is 6 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3e814304, dl)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f3c50c8, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f7f5e7e, dl)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // TwoToFractionalPartOfX = // 0.999892986f + // (0.696457318f + // (0.224338339f + 0.792043434e-1f * x) * x) * x; // // error 0.000107046256, which is 13 to 14 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3da235e3, dl)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3e65b8f3, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f324b07, dl)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3f7ff8fd, dl)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // TwoToFractionalPartOfX = // 0.999999982f + // (0.693148872f + // (0.240227044f + // (0.554906021e-1f + // (0.961591928e-2f + // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x; // error 2.47208000*10^(-7), which is better than 18 bits SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3924b03e, dl)); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3ab24b87, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x3c1d8c17, dl)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6, getF32Constant(DAG, 0x3d634a1d, dl)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x3e75fe14, dl)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10, getF32Constant(DAG, 0x3f317234, dl)); SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X); TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12, getF32Constant(DAG, 0x3f800000, dl)); } // Add the exponent into the result in integer domain. SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFractionalPartOfX); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX)); } /// expandExp - Lower an exp intrinsic. Handles the special sequences for /// limited-precision mode. 
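// [Illustrative sketch; not part of the original source. The helper is a
//  plain-C++ shape of the same split, assuming <cmath> for exp2/ldexp; it
//  uses the library exp2 where the DAG code uses its minimax polynomial.]
// getLimitedPrecisionExp2 above evaluates 2^t0 as 2^int(t0) * 2^frac(t0):
// the integer part is added straight into the exponent field (the SHL by 23
// followed by the integer ADD), and only the fractional part goes through
// the polynomial. expandExp below then reduces exp(x) to this routine via
// exp(x) = 2^(x * log2(e)), with log2(e) ~ 1.4426950f (0x3fb8aa3b).
static float exp2BySplitSketch(float T0) {
  int IntPart = int(T0);            // FP_TO_SINT above truncates the same way
  float Frac = T0 - float(IntPart); // FSUB above
  return std::ldexp(std::exp2(Frac), IntPart); // scale by 2^IntPart
}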
static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { // Put the exponent in the right bit position for later addition to the // final result: // // #define LOG2OFe 1.4426950f // t0 = Op * LOG2OFe // TODO: What fast-math-flags should be set here? SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, getF32Constant(DAG, 0x3fb8aa3b, dl)); return getLimitedPrecisionExp2(t0, dl, DAG); } // No special expansion. return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op); } /// expandLog - Lower a log intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { // TODO: What fast-math-flags should be set on the floating-point nodes? if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); // Scale the exponent by log(2) [0.69314718f]. SDValue Exp = GetExponent(DAG, Op1, TLI, dl); SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp, getF32Constant(DAG, 0x3f317218, dl)); // Get the significand and build it into a floating-point number with // exponent of 1. SDValue X = GetSignificand(DAG, Op1, dl); SDValue LogOfMantissa; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // LogofMantissa = // -1.1609546f + // (1.4034025f - 0.23903021f * x) * x; // // error 0.0034276066, which is better than 8 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbe74c456, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3fb3a2b1, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f949a29, dl)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // LogOfMantissa = // -1.7417939f + // (2.8212026f + // (-1.4699568f + // (0.44717955f - 0.56570851e-1f * x) * x) * x) * x; // // error 0.000061011436, which is 14 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbd67b6d6, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3ee4f4b8, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3fbc278b, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x40348e95, dl)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x3fdef31a, dl)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // LogOfMantissa = // -2.1072184f + // (4.2372794f + // (-3.7029485f + // (2.2781945f + // (-0.87823314f + // (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x; // // error 0.0000023660568, which is better than 18 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbc91e5ac, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3e4350aa, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f60d3e3, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, 
MVT::f32, t4, getF32Constant(DAG, 0x4011cdf0, dl)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x406cfd1c, dl)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x408797cb, dl)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10, getF32Constant(DAG, 0x4006dcab, dl)); } return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa); } // No special expansion. return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op); } /// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { // TODO: What fast-math-flags should be set on the floating-point nodes? if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); // Get the exponent. SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl); // Get the significand and build it into a floating-point number with // exponent of 1. SDValue X = GetSignificand(DAG, Op1, dl); // Different possible minimax approximations of significand in // floating-point for various degrees of accuracy over [1,2]. SDValue Log2ofMantissa; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x; // // error 0.0049451742, which is more than 7 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbeb08fe0, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x40019463, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3fd6633d, dl)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // Log2ofMantissa = // -2.51285454f + // (4.07009056f + // (-2.12067489f + // (.645142248f - 0.816157886e-1f * x) * x) * x) * x; // // error 0.0000876136000, which is better than 13 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbda7262e, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3f25280b, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x4007b923, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x40823e2f, dl)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x4020d29c, dl)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // Log2ofMantissa = // -3.0400495f + // (6.1129976f + // (-5.3420409f + // (3.2865683f + // (-1.2669343f + // (0.27515199f - // 0.25691327e-1f * x) * x) * x) * x) * x) * x; // // error 0.0000018516, which is better than 18 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbcd2769e, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3e8ce0b9, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3fa22ae7, dl)); SDValue t4 = 
DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4, getF32Constant(DAG, 0x40525723, dl)); SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X); SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6, getF32Constant(DAG, 0x40aaf200, dl)); SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X); SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8, getF32Constant(DAG, 0x40c39dad, dl)); SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X); Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10, getF32Constant(DAG, 0x4042902c, dl)); } return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa); } // No special expansion. return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op); } /// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for /// limited-precision mode. static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { // TODO: What fast-math-flags should be set on the floating-point nodes? if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); // Scale the exponent by log10(2) [0.30102999f]. SDValue Exp = GetExponent(DAG, Op1, TLI, dl); SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp, getF32Constant(DAG, 0x3e9a209a, dl)); // Get the significand and build it into a floating-point number with // exponent of 1. SDValue X = GetSignificand(DAG, Op1, dl); SDValue Log10ofMantissa; if (LimitFloatPrecision <= 6) { // For floating-point precision of 6: // // Log10ofMantissa = // -0.50419619f + // (0.60948995f - 0.10380950f * x) * x; // // error 0.0014886165, which is 6 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0xbdd49a13, dl)); SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0, getF32Constant(DAG, 0x3f1c0789, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f011300, dl)); } else if (LimitFloatPrecision <= 12) { // For floating-point precision of 12: // // Log10ofMantissa = // -0.64831180f + // (0.91751397f + // (-0.31664806f + 0.47637168e-1f * x) * x) * x; // // error 0.00019228036, which is better than 12 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3d431f31, dl)); SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, getF32Constant(DAG, 0x3ea21fb2, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3f6ae232, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4, getF32Constant(DAG, 0x3f25f7c3, dl)); } else { // LimitFloatPrecision <= 18 // For floating-point precision of 18: // // Log10ofMantissa = // -0.84299375f + // (1.5327582f + // (-1.0688956f + // (0.49102474f + // (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x; // // error 0.0000037995730, which is better than 18 bits SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X, getF32Constant(DAG, 0x3c5d51ce, dl)); SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, getF32Constant(DAG, 0x3e00685a, dl)); SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X); SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2, getF32Constant(DAG, 0x3efb6798, dl)); SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X); SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4, 
                              getF32Constant(DAG, 0x3f88d192, dl));
      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
                               getF32Constant(DAG, 0x3fc4316c, dl));
      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
      Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
                                    getF32Constant(DAG, 0x3f57ce70, dl));
    }

    return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
  }

  // No special expansion.
  return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);
}

/// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
/// limited-precision mode.
static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
                          const TargetLowering &TLI) {
  if (Op.getValueType() == MVT::f32 &&
      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18)
    return getLimitedPrecisionExp2(Op, dl, DAG);

  // No special expansion.
  return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);
}

/// expandPow - Lower a pow intrinsic. Handles the special sequences for
/// limited-precision mode with x == 10.0f.
static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
                         SelectionDAG &DAG, const TargetLowering &TLI) {
  bool IsExp10 = false;
  if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
    if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
      APFloat Ten(10.0f);
      IsExp10 = LHSC->isExactlyValue(Ten);
    }
  }

  // TODO: What fast-math-flags should be set on the FMUL node?
  if (IsExp10) {
    // Put the exponent in the right bit position for later addition to the
    // final result:
    //
    //   #define LOG2OF10 3.3219281f
    //   t0 = Op * LOG2OF10;
    SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS,
                             getF32Constant(DAG, 0x40549a78, dl));
    return getLimitedPrecisionExp2(t0, dl, DAG);
  }

  // No special expansion.
  return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);
}

/// ExpandPowI - Expand a llvm.powi intrinsic.
static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
                          SelectionDAG &DAG) {
  // If RHS is a constant, we can expand this out to a multiplication tree,
  // otherwise we end up lowering to a call to __powidf2 (for example). When
  // optimizing for size, we only want to do this if the expansion would
  // produce a small number of multiplies, otherwise we do the full expansion.
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
    // Get the exponent as a positive value.
    unsigned Val = RHSC->getSExtValue();
    if ((int)Val < 0)
      Val = -Val;

    // powi(x, 0) -> 1.0
    if (Val == 0)
      return DAG.getConstantFP(1.0, DL, LHS.getValueType());

    const Function &F = DAG.getMachineFunction().getFunction();
    if (!F.optForSize() ||
        // If optimizing for size, don't insert too many multiplies.
        // This inserts up to 5 multiplies.
        countPopulation(Val) + Log2_32(Val) < 7) {
      // We use the simple binary decomposition method to generate the multiply
      // sequence. There are more optimal ways to do this (for example,
      // powi(x,15) generates one more multiply than it should), but this has
      // the benefit of being both really simple and much better than a
      // libcall.
      SDValue Res; // Logically starts equal to 1.0
      SDValue CurSquare = LHS;
      // TODO: Intrinsics should have fast-math-flags that propagate to these
      // nodes.
      while (Val) {
        if (Val & 1) {
          if (Res.getNode())
            Res = DAG.getNode(ISD::FMUL, DL, Res.getValueType(),
                              Res, CurSquare);
          else
            Res = CurSquare; // 1.0*CurSquare.
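          // A worked example of the decomposition, as a sanity check: for
          // powi(x, 13), Val is 0b1101, so this loop accumulates
          //   Res = x * x^4 * x^8 = x^13
          // from the running squares x, x^2, x^4, x^8. That is three live
          // squarings plus two accumulating FMULs (five multiplies in total),
          // consistent with the countPopulation(Val) + Log2_32(Val) < 7 size
          // check above.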
        }
        CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
                                CurSquare, CurSquare);
        Val >>= 1;
      }

      // If the original was negative, invert the result, producing
      // 1/(x*x*x).
      if (RHSC->getSExtValue() < 0)
        Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
                          DAG.getConstantFP(1.0, DL, LHS.getValueType()), Res);
      return Res;
    }
  }

  // Otherwise, expand to a libcall.
  return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
}

// getUnderlyingArgReg - Find underlying register used for a truncated or
// bitcasted argument.
static unsigned getUnderlyingArgReg(const SDValue &N) {
  switch (N.getOpcode()) {
  case ISD::CopyFromReg:
    return cast<RegisterSDNode>(N.getOperand(1))->getReg();
  case ISD::BITCAST:
  case ISD::AssertZext:
  case ISD::AssertSext:
  case ISD::TRUNCATE:
    return getUnderlyingArgReg(N.getOperand(0));
  default:
    return 0;
  }
}

/// If the DbgValueInst is a dbg_value of a function argument, create the
/// corresponding DBG_VALUE machine instruction for it now. At the end of
/// instruction selection, they will be inserted to the entry BB.
bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
    const Value *V, DILocalVariable *Variable, DIExpression *Expr,
    DILocation *DL, bool IsDbgDeclare, const SDValue &N) {
  const Argument *Arg = dyn_cast<Argument>(V);
  if (!Arg)
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();

  bool IsIndirect = false;
  Optional<MachineOperand> Op;
  // Some arguments' frame index is recorded during argument lowering.
  int FI = FuncInfo.getArgumentFrameIndex(Arg);
  if (FI != std::numeric_limits<int>::max())
    Op = MachineOperand::CreateFI(FI);

  if (!Op && N.getNode()) {
    unsigned Reg = getUnderlyingArgReg(N);
    if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
      MachineRegisterInfo &RegInfo = MF.getRegInfo();
      unsigned PR = RegInfo.getLiveInPhysReg(Reg);
      if (PR)
        Reg = PR;
    }
    if (Reg) {
      Op = MachineOperand::CreateReg(Reg, false);
      IsIndirect = IsDbgDeclare;
    }
  }

  if (!Op && N.getNode())
    // Check if frame index is available.
    if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(N.getNode()))
      if (FrameIndexSDNode *FINode =
              dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
        Op = MachineOperand::CreateFI(FINode->getIndex());

  if (!Op) {
    // Check if ValueMap has reg number.
    DenseMap<const Value *, unsigned>::iterator
        VMI = FuncInfo.ValueMap.find(V);
    if (VMI != FuncInfo.ValueMap.end()) {
      const auto &TLI = DAG.getTargetLoweringInfo();
      RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second,
                       V->getType(), getABIRegCopyCC(V));
      if (RFV.occupiesMultipleRegs()) {
        unsigned Offset = 0;
        for (auto RegAndSize : RFV.getRegsAndSizes()) {
          Op = MachineOperand::CreateReg(RegAndSize.first, false);
          auto FragmentExpr = DIExpression::createFragmentExpression(
              Expr, Offset, RegAndSize.second);
          if (!FragmentExpr)
            continue;
          FuncInfo.ArgDbgValues.push_back(
              BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
                      Op->getReg(), Variable, *FragmentExpr));
          Offset += RegAndSize.second;
        }
        return true;
      }
      Op = MachineOperand::CreateReg(VMI->second, false);
      IsIndirect = IsDbgDeclare;
    }
  }

  if (!Op)
    return false;

  assert(Variable->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  IsIndirect = (Op->isReg()) ? IsIndirect : true;
  FuncInfo.ArgDbgValues.push_back(
      BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
              *Op, Variable, Expr));

  return true;
}

/// Return the appropriate SDDbgValue based on N.
SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
                                             DILocalVariable *Variable,
                                             DIExpression *Expr,
                                             const DebugLoc &dl,
                                             unsigned DbgSDNodeOrder) {
  if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) {
    // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
    // stack slot locations.
    //
    // Consider "int x = 0; int *px = &x;". There are two kinds of interesting
    // debug values here after optimization:
    //
    //   dbg.value(i32* %px, !"int *px", !DIExpression()), and
    //   dbg.value(i32* %px, !"int x", !DIExpression(DW_OP_deref))
    //
    // Both describe the direct values of their associated variables.
    return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(),
                                     /*IsIndirect*/ false, dl, DbgSDNodeOrder);
  }
  return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(),
                         /*IsIndirect*/ false, dl, DbgSDNodeOrder);
}

// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
    !defined(setjmp_undefined_for_msvc)
#  pragma push_macro("setjmp")
#  undef setjmp
#  define setjmp_undefined_for_msvc
#endif

/// Lower the call to the specified intrinsic function. If we want to emit this
/// as a call to a named external function, return the name. Otherwise, lower
/// it and return null.
const char *
SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc sdl = getCurSDLoc();
  DebugLoc dl = getCurDebugLoc();
  SDValue Res;

  switch (Intrinsic) {
  default:
    // By default, turn this into a target intrinsic node.
    visitTargetIntrinsic(I, Intrinsic);
    return nullptr;
  case Intrinsic::vastart:  visitVAStart(I); return nullptr;
  case Intrinsic::vaend:    visitVAEnd(I); return nullptr;
  case Intrinsic::vacopy:   visitVACopy(I); return nullptr;
  case Intrinsic::returnaddress:
    setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
                             TLI.getPointerTy(DAG.getDataLayout()),
                             getValue(I.getArgOperand(0))));
    return nullptr;
  case Intrinsic::addressofreturnaddress:
    setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
                             TLI.getPointerTy(DAG.getDataLayout())));
    return nullptr;
  case Intrinsic::frameaddress:
    setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
                             TLI.getPointerTy(DAG.getDataLayout()),
                             getValue(I.getArgOperand(0))));
    return nullptr;
  case Intrinsic::read_register: {
    Value *Reg = I.getArgOperand(0);
    SDValue Chain = getRoot();
    SDValue RegName =
        DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
    Res = DAG.getNode(ISD::READ_REGISTER, sdl,
                      DAG.getVTList(VT, MVT::Other), Chain, RegName);
    setValue(&I, Res);
    DAG.setRoot(Res.getValue(1));
    return nullptr;
  }
  case Intrinsic::write_register: {
    Value *Reg = I.getArgOperand(0);
    Value *RegValue = I.getArgOperand(1);
    SDValue Chain = getRoot();
    SDValue RegName =
        DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
    DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
                            RegName, getValue(RegValue)));
    return nullptr;
  }
  case Intrinsic::setjmp:
    return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];
  case Intrinsic::longjmp:
    return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
  case Intrinsic::memcpy: {
    const auto &MCI = cast<MemCpyInst>(I);
    SDValue Op1 = getValue(I.getArgOperand(0));
    SDValue Op2 = getValue(I.getArgOperand(1));
    SDValue Op3 = getValue(I.getArgOperand(2));
    // @llvm.memcpy defines 0 and 1 to both mean no alignment.
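    // Illustrative note on the alignment handling below: both alignments are
    // first clamped to at least 1, and MinAlign(DstAlign, SrcAlign) then picks
    // the conservative common alignment, so a copy with a 16-byte-aligned
    // destination but a 4-byte-aligned source is emitted as a 4-byte-aligned
    // memcpy node. The FIXME below is about passing both alignments through
    // to the DAG node instead of this single minimum.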
unsigned DstAlign = std::max(MCI.getDestAlignment(), 1); unsigned SrcAlign = std::max(MCI.getSourceAlignment(), 1); unsigned Align = MinAlign(DstAlign, SrcAlign); bool isVol = MCI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memcpy DAG // node. SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, false, isTC, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1))); updateDAGForMaybeTailCall(MC); return nullptr; } case Intrinsic::memset: { const auto &MSI = cast(I); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); // @llvm.memset defines 0 and 1 to both mean no alignment. unsigned Align = std::max(MSI.getDestAlignment(), 1); bool isVol = MSI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, isTC, MachinePointerInfo(I.getArgOperand(0))); updateDAGForMaybeTailCall(MS); return nullptr; } case Intrinsic::memmove: { const auto &MMI = cast(I); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); // @llvm.memmove defines 0 and 1 to both mean no alignment. unsigned DstAlign = std::max(MMI.getDestAlignment(), 1); unsigned SrcAlign = std::max(MMI.getSourceAlignment(), 1); unsigned Align = MinAlign(DstAlign, SrcAlign); bool isVol = MMI.isVolatile(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); // FIXME: Support passing different dest/src alignments to the memmove DAG // node. SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, isTC, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1))); updateDAGForMaybeTailCall(MM); return nullptr; } case Intrinsic::memcpy_element_unordered_atomic: { const AtomicMemCpyInst &MI = cast(I); SDValue Dst = getValue(MI.getRawDest()); SDValue Src = getValue(MI.getRawSource()); SDValue Length = getValue(MI.getLength()); unsigned DstAlign = MI.getDestAlignment(); unsigned SrcAlign = MI.getSourceAlignment(); Type *LengthTy = MI.getLength()->getType(); unsigned ElemSz = MI.getElementSizeInBytes(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); SDValue MC = DAG.getAtomicMemcpy(getRoot(), sdl, Dst, DstAlign, Src, SrcAlign, Length, LengthTy, ElemSz, isTC, MachinePointerInfo(MI.getRawDest()), MachinePointerInfo(MI.getRawSource())); updateDAGForMaybeTailCall(MC); return nullptr; } case Intrinsic::memmove_element_unordered_atomic: { auto &MI = cast(I); SDValue Dst = getValue(MI.getRawDest()); SDValue Src = getValue(MI.getRawSource()); SDValue Length = getValue(MI.getLength()); unsigned DstAlign = MI.getDestAlignment(); unsigned SrcAlign = MI.getSourceAlignment(); Type *LengthTy = MI.getLength()->getType(); unsigned ElemSz = MI.getElementSizeInBytes(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); SDValue MC = DAG.getAtomicMemmove(getRoot(), sdl, Dst, DstAlign, Src, SrcAlign, Length, LengthTy, ElemSz, isTC, MachinePointerInfo(MI.getRawDest()), MachinePointerInfo(MI.getRawSource())); updateDAGForMaybeTailCall(MC); return nullptr; } case Intrinsic::memset_element_unordered_atomic: { auto &MI = cast(I); SDValue Dst = getValue(MI.getRawDest()); SDValue Val = getValue(MI.getValue()); SDValue Length = getValue(MI.getLength()); unsigned DstAlign = 
MI.getDestAlignment(); Type *LengthTy = MI.getLength()->getType(); unsigned ElemSz = MI.getElementSizeInBytes(); bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget()); SDValue MC = DAG.getAtomicMemset(getRoot(), sdl, Dst, DstAlign, Val, Length, LengthTy, ElemSz, isTC, MachinePointerInfo(MI.getRawDest())); updateDAGForMaybeTailCall(MC); return nullptr; } case Intrinsic::dbg_addr: case Intrinsic::dbg_declare: { const DbgInfoIntrinsic &DI = cast(I); DILocalVariable *Variable = DI.getVariable(); DIExpression *Expression = DI.getExpression(); dropDanglingDebugInfo(Variable, Expression); assert(Variable && "Missing variable"); // Check if address has undef value. const Value *Address = DI.getVariableLocation(); if (!Address || isa(Address) || (Address->use_empty() && !isa(Address))) { LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); return nullptr; } bool isParameter = Variable->isParameter() || isa(Address); // Check if this variable can be described by a frame index, typically // either as a static alloca or a byval parameter. int FI = std::numeric_limits::max(); if (const auto *AI = dyn_cast(Address->stripInBoundsConstantOffsets())) { if (AI->isStaticAlloca()) { auto I = FuncInfo.StaticAllocaMap.find(AI); if (I != FuncInfo.StaticAllocaMap.end()) FI = I->second; } } else if (const auto *Arg = dyn_cast( Address->stripInBoundsConstantOffsets())) { FI = FuncInfo.getArgumentFrameIndex(Arg); } // llvm.dbg.addr is control dependent and always generates indirect // DBG_VALUE instructions. llvm.dbg.declare is handled as a frame index in // the MachineFunction variable table. if (FI != std::numeric_limits::max()) { if (Intrinsic == Intrinsic::dbg_addr) { SDDbgValue *SDV = DAG.getFrameIndexDbgValue( Variable, Expression, FI, /*IsIndirect*/ true, dl, SDNodeOrder); DAG.AddDbgValue(SDV, getRoot().getNode(), isParameter); } return nullptr; } SDValue &N = NodeMap[Address]; if (!N.getNode() && isa(Address)) // Check unused arguments map. N = UnusedArgNodeMap[Address]; SDDbgValue *SDV; if (N.getNode()) { if (const BitCastInst *BCI = dyn_cast(Address)) Address = BCI->getOperand(0); // Parameters are handled specially. auto FINode = dyn_cast(N.getNode()); if (isParameter && FINode) { // Byval parameter. We have a frame index at this point. SDV = DAG.getFrameIndexDbgValue(Variable, Expression, FINode->getIndex(), /*IsIndirect*/ true, dl, SDNodeOrder); } else if (isa(Address)) { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true, N); return nullptr; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, dl, SDNodeOrder); } DAG.AddDbgValue(SDV, N.getNode(), isParameter); } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. 
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true, N)) { LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } return nullptr; } case Intrinsic::dbg_label: { const DbgLabelInst &DI = cast(I); DILabel *Label = DI.getLabel(); assert(Label && "Missing label"); SDDbgLabel *SDV; SDV = DAG.getDbgLabel(Label, dl, SDNodeOrder); DAG.AddDbgLabel(SDV); return nullptr; } case Intrinsic::dbg_value: { const DbgValueInst &DI = cast(I); assert(DI.getVariable() && "Missing variable"); DILocalVariable *Variable = DI.getVariable(); DIExpression *Expression = DI.getExpression(); dropDanglingDebugInfo(Variable, Expression); const Value *V = DI.getValue(); if (!V) return nullptr; SDDbgValue *SDV; if (isa(V) || isa(V) || isa(V)) { SDV = DAG.getConstantDbgValue(Variable, Expression, V, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); return nullptr; } // Do not use getValue() in here; we don't want to generate code at // this point if it hasn't been done yet. SDValue N = NodeMap[V]; if (!N.getNode() && isa(V)) // Check unused arguments map. N = UnusedArgNodeMap[V]; if (N.getNode()) { if (EmitFuncArgumentDbgValue(V, Variable, Expression, dl, false, N)) return nullptr; SDV = getDbgValue(N, Variable, Expression, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); return nullptr; } // PHI nodes have already been selected, so we should know which VReg that // is assigns to already. if (isa(V)) { auto VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) { unsigned Reg = VMI->second; // The PHI node may be split up into several MI PHI nodes (in // FunctionLoweringInfo::set). RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(), None); if (RFV.occupiesMultipleRegs()) { unsigned Offset = 0; unsigned BitsToDescribe = 0; if (auto VarSize = Variable->getSizeInBits()) BitsToDescribe = *VarSize; if (auto Fragment = Expression->getFragmentInfo()) BitsToDescribe = Fragment->SizeInBits; for (auto RegAndSize : RFV.getRegsAndSizes()) { unsigned RegisterSize = RegAndSize.second; // Bail out if all bits are described already. if (Offset >= BitsToDescribe) break; unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe) ? BitsToDescribe - Offset : RegisterSize; auto FragmentExpr = DIExpression::createFragmentExpression( Expression, Offset, FragmentSize); if (!FragmentExpr) continue; SDV = DAG.getVRegDbgValue(Variable, *FragmentExpr, RegAndSize.first, false, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); Offset += RegisterSize; } } else { SDV = DAG.getVRegDbgValue(Variable, Expression, Reg, false, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); } return nullptr; } } // TODO: When we get here we will either drop the dbg.value completely, or // we try to move it forward by letting it dangle for awhile. So we should // probably add an extra DbgValue to the DAG here, with a reference to // "noreg", to indicate that we have lost the debug location for the // variable. if (!V->use_empty() ) { // Do not call getValue(V) yet, as we don't want to generate code. // Remember it for later. DanglingDebugInfoMap[V].emplace_back(&DI, dl, SDNodeOrder); return nullptr; } LLVM_DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); LLVM_DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); return nullptr; } case Intrinsic::eh_typeid_for: { // Find the type id for the given typeinfo. 
GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0)); unsigned TypeID = DAG.getMachineFunction().getTypeIDFor(GV); Res = DAG.getConstant(TypeID, sdl, MVT::i32); setValue(&I, Res); return nullptr; } case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: DAG.getMachineFunction().setCallsEHReturn(true); DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl, MVT::Other, getControlRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; case Intrinsic::eh_unwind_init: DAG.getMachineFunction().setCallsUnwindInit(true); return nullptr; case Intrinsic::eh_dwarf_cfa: setValue(&I, DAG.getNode(ISD::EH_DWARF_CFA, sdl, TLI.getPointerTy(DAG.getDataLayout()), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::eh_sjlj_callsite: { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); ConstantInt *CI = dyn_cast(I.getArgOperand(0)); assert(CI && "Non-constant call site value in eh.sjlj.callsite!"); assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!"); MMI.setCurrentCallSite(CI->getZExtValue()); return nullptr; } case Intrinsic::eh_sjlj_functioncontext: { // Get and store the index of the function context. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); AllocaInst *FnCtx = cast(I.getArgOperand(0)->stripPointerCasts()); int FI = FuncInfo.StaticAllocaMap[FnCtx]; MFI.setFunctionContextIndex(FI); return nullptr; } case Intrinsic::eh_sjlj_setjmp: { SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl, DAG.getVTList(MVT::i32, MVT::Other), Ops); setValue(&I, Op.getValue(0)); DAG.setRoot(Op.getValue(1)); return nullptr; } case Intrinsic::eh_sjlj_longjmp: DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other, getRoot(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::eh_sjlj_setup_dispatch: DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other, getRoot())); return nullptr; case Intrinsic::masked_gather: visitMaskedGather(I); return nullptr; case Intrinsic::masked_load: visitMaskedLoad(I); return nullptr; case Intrinsic::masked_scatter: visitMaskedScatter(I); return nullptr; case Intrinsic::masked_store: visitMaskedStore(I); return nullptr; case Intrinsic::masked_expandload: visitMaskedLoad(I, true /* IsExpanding */); return nullptr; case Intrinsic::masked_compressstore: visitMaskedStore(I, true /* IsCompressing */); return nullptr; case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: case Intrinsic::x86_mmx_psrli_w: case Intrinsic::x86_mmx_psrli_d: case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: { SDValue ShAmt = getValue(I.getArgOperand(1)); if (isa(ShAmt)) { visitTargetIntrinsic(I, Intrinsic); return nullptr; } unsigned NewIntrinsic = 0; EVT ShAmtVT = MVT::v2i32; switch (Intrinsic) { case Intrinsic::x86_mmx_pslli_w: NewIntrinsic = Intrinsic::x86_mmx_psll_w; break; case Intrinsic::x86_mmx_pslli_d: NewIntrinsic = Intrinsic::x86_mmx_psll_d; break; case Intrinsic::x86_mmx_pslli_q: NewIntrinsic = Intrinsic::x86_mmx_psll_q; break; case Intrinsic::x86_mmx_psrli_w: NewIntrinsic = Intrinsic::x86_mmx_psrl_w; break; case Intrinsic::x86_mmx_psrli_d: NewIntrinsic = Intrinsic::x86_mmx_psrl_d; break; case Intrinsic::x86_mmx_psrli_q: NewIntrinsic = Intrinsic::x86_mmx_psrl_q; break; case Intrinsic::x86_mmx_psrai_w: NewIntrinsic = Intrinsic::x86_mmx_psra_w; break; case Intrinsic::x86_mmx_psrai_d: NewIntrinsic = Intrinsic::x86_mmx_psra_d; break; default: 
llvm_unreachable("Impossible intrinsic"); // Can't reach here. } // The vector shift intrinsics with scalars uses 32b shift amounts but // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits // to be zero. // We must do this early because v2i32 is not a legal type. SDValue ShOps[2]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, sdl, MVT::i32); ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, DAG.getConstant(NewIntrinsic, sdl, MVT::i32), getValue(I.getArgOperand(0)), ShAmt); setValue(&I, Res); return nullptr; } case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); return nullptr; case Intrinsic::log: setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::log2: setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::log10: setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::exp: setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::exp2: setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::pow: setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG, TLI)); return nullptr; case Intrinsic::sqrt: case Intrinsic::fabs: case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: case Intrinsic::round: case Intrinsic::canonicalize: { unsigned Opcode; switch (Intrinsic) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; case Intrinsic::fabs: Opcode = ISD::FABS; break; case Intrinsic::sin: Opcode = ISD::FSIN; break; case Intrinsic::cos: Opcode = ISD::FCOS; break; case Intrinsic::floor: Opcode = ISD::FFLOOR; break; case Intrinsic::ceil: Opcode = ISD::FCEIL; break; case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; case Intrinsic::rint: Opcode = ISD::FRINT; break; case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; case Intrinsic::round: Opcode = ISD::FROUND; break; case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break; } setValue(&I, DAG.getNode(Opcode, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; } case Intrinsic::minnum: { auto VT = getValue(I.getArgOperand(0)).getValueType(); unsigned Opc = I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT) ? ISD::FMINNAN : ISD::FMINNUM; setValue(&I, DAG.getNode(Opc, sdl, VT, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; } case Intrinsic::maxnum: { auto VT = getValue(I.getArgOperand(0)).getValueType(); unsigned Opc = I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT) ? 
ISD::FMAXNAN : ISD::FMAXNUM; setValue(&I, DAG.getNode(Opc, sdl, VT, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; } case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); return nullptr; case Intrinsic::fma: setValue(&I, DAG.getNode(ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return nullptr; case Intrinsic::experimental_constrained_fadd: case Intrinsic::experimental_constrained_fsub: case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_sqrt: case Intrinsic::experimental_constrained_pow: case Intrinsic::experimental_constrained_powi: case Intrinsic::experimental_constrained_sin: case Intrinsic::experimental_constrained_cos: case Intrinsic::experimental_constrained_exp: case Intrinsic::experimental_constrained_exp2: case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: visitConstrainedFPIntrinsic(cast(I)); return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && TLI.isFMAFasterThanFMulAndFAdd(VT)) { setValue(&I, DAG.getNode(ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); } else { // TODO: Intrinsic calls should have fast-math-flags. SDValue Mul = DAG.getNode(ISD::FMUL, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1))); SDValue Add = DAG.getNode(ISD::FADD, sdl, getValue(I.getArgOperand(0)).getValueType(), Mul, getValue(I.getArgOperand(2))); setValue(&I, Add); } return nullptr; } case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16, DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16, getValue(I.getArgOperand(0)), DAG.getTargetConstant(0, sdl, MVT::i32)))); return nullptr; case Intrinsic::convert_from_fp16: setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl, TLI.getValueType(DAG.getDataLayout(), I.getType()), DAG.getNode(ISD::BITCAST, sdl, MVT::f16, getValue(I.getArgOperand(0))))); return nullptr; case Intrinsic::pcmarker: { SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp)); return nullptr; } case Intrinsic::readcyclecounter: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl, DAG.getVTList(MVT::i64, MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); return nullptr; } case Intrinsic::bitreverse: setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(CI->isZero() ? 
ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF, sdl, Ty, Arg)); return nullptr; } case Intrinsic::ctlz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF, sdl, Ty, Arg)); return nullptr; } case Intrinsic::ctpop: { SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg)); return nullptr; } case Intrinsic::fshl: case Intrinsic::fshr: { bool IsFSHL = Intrinsic == Intrinsic::fshl; SDValue X = getValue(I.getArgOperand(0)); SDValue Y = getValue(I.getArgOperand(1)); SDValue Z = getValue(I.getArgOperand(2)); EVT VT = X.getValueType(); // When X == Y, this is rotate. Create the node directly if legal. // TODO: This should also be done if the operation is custom, but we have // to make sure targets are handling the modulo shift amount as expected. // TODO: If the rotate direction (left or right) corresponding to the shift // is not available, adjust the shift value and invert the direction. auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR; if (X == Y && TLI.isOperationLegal(RotateOpcode, VT)) { setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z)); return nullptr; } // Get the shift amount and inverse shift amount, modulo the bit-width. SDValue BitWidthC = DAG.getConstant(VT.getScalarSizeInBits(), sdl, VT); SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC); SDValue NegZ = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, Z); SDValue InvShAmt = DAG.getNode(ISD::UREM, sdl, VT, NegZ, BitWidthC); // fshl: (X << (Z % BW)) | (Y >> ((BW - Z) % BW)) // fshr: (X << ((BW - Z) % BW)) | (Y >> (Z % BW)) SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : InvShAmt); SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, Y, IsFSHL ? InvShAmt : ShAmt); SDValue Res = DAG.getNode(ISD::OR, sdl, VT, ShX, ShY); // If (Z % BW == 0), then (BW - Z) % BW is also zero, so the result would // be X | Y. If X == Y (rotate), that's fine. If not, we have to select. if (X != Y) { SDValue Zero = DAG.getConstant(0, sdl, VT); EVT CCVT = MVT::i1; if (VT.isVector()) CCVT = EVT::getVectorVT(*Context, CCVT, VT.getVectorNumElements()); // For fshl, 0 shift returns the 1st arg (X). // For fshr, 0 shift returns the 2nd arg (Y). SDValue IsZeroShift = DAG.getSetCC(sdl, CCVT, ShAmt, Zero, ISD::SETEQ); Res = DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Res); } setValue(&I, Res); return nullptr; } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode( ISD::STACKSAVE, sdl, DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); return nullptr; } case Intrinsic::stackrestore: Res = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; case Intrinsic::get_dynamic_area_offset: { SDValue Op = getRoot(); EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); // Result type for @llvm.get.dynamic.area.offset should match PtrTy for // target. 
if (PtrTy != ResTy) report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset" " intrinsic!"); Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy), Op); DAG.setRoot(Op); setValue(&I, Res); return nullptr; } case Intrinsic::stackguard: { EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); MachineFunction &MF = DAG.getMachineFunction(); const Module &M = *MF.getFunction().getParent(); SDValue Chain = getRoot(); if (TLI.useLoadStackGuardNode()) { Res = getLoadStackGuard(DAG, sdl, Chain); } else { const Value *Global = TLI.getSDagStackGuard(M); unsigned Align = DL->getPrefTypeAlignment(Global->getType()); Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global), MachinePointerInfo(Global, 0), Align, MachineMemOperand::MOVolatile); } if (TLI.useStackGuardXorFP()) Res = TLI.emitStackGuardXorFP(DAG, Res, sdl); DAG.setRoot(Chain); setValue(&I, Res); return nullptr; } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); SDValue Src, Chain = getRoot(); if (TLI.useLoadStackGuardNode()) Src = getLoadStackGuard(DAG, sdl, Chain); else Src = getValue(I.getArgOperand(0)); // The guard's value. AllocaInst *Slot = cast(I.getArgOperand(1)); int FI = FuncInfo.StaticAllocaMap[Slot]; MFI.setStackProtectorIndex(FI); SDValue FIN = DAG.getFrameIndex(FI, PtrTy); // Store the stack protector onto the stack. Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FI), /* Alignment = */ 0, MachineMemOperand::MOVolatile); setValue(&I, Res); DAG.setRoot(Res); return nullptr; } case Intrinsic::objectsize: { // If we don't know by now, we're never going to know. ConstantInt *CI = dyn_cast(I.getArgOperand(1)); assert(CI && "Non-constant type in __builtin_object_size?"); SDValue Arg = getValue(I.getCalledValue()); EVT Ty = Arg.getValueType(); if (CI->isZero()) Res = DAG.getConstant(-1ULL, sdl, Ty); else Res = DAG.getConstant(0, sdl, Ty); setValue(&I, Res); return nullptr; } case Intrinsic::annotation: case Intrinsic::ptr_annotation: case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: // Drop the intrinsic, but forward the value setValue(&I, getValue(I.getOperand(0))); return nullptr; case Intrinsic::assume: case Intrinsic::var_annotation: case Intrinsic::sideeffect: // Discard annotate attributes, assumptions, and artificial side-effects. return nullptr; case Intrinsic::codeview_annotation: { // Emit a label associated with this metadata. 
MachineFunction &MF = DAG.getMachineFunction(); MCSymbol *Label = MF.getMMI().getContext().createTempSymbol("annotation", true); Metadata *MD = cast(I.getArgOperand(0))->getMetadata(); MF.addCodeViewAnnotation(Label, cast(MD)); Res = DAG.getLabelNode(ISD::ANNOTATION_LABEL, sdl, getRoot(), Label); DAG.setRoot(Res); return nullptr; } case Intrinsic::init_trampoline: { const Function *F = cast(I.getArgOperand(1)->stripPointerCasts()); SDValue Ops[6]; Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); Ops[2] = getValue(I.getArgOperand(1)); Ops[3] = getValue(I.getArgOperand(2)); Ops[4] = DAG.getSrcValue(I.getArgOperand(0)); Ops[5] = DAG.getSrcValue(F); Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops); DAG.setRoot(Res); return nullptr; } case Intrinsic::adjust_trampoline: setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl, TLI.getPointerTy(DAG.getDataLayout()), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::gcroot: { assert(DAG.getMachineFunction().getFunction().hasGC() && "only valid in functions with gc specified, enforced by Verifier"); assert(GFI && "implied by previous"); const Value *Alloca = I.getArgOperand(0)->stripPointerCasts(); const Constant *TypeMap = cast(I.getArgOperand(1)); FrameIndexSDNode *FI = cast(getValue(Alloca).getNode()); GFI->addStackRoot(FI->getIndex(), TypeMap); return nullptr; } case Intrinsic::gcread: case Intrinsic::gcwrite: llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!"); case Intrinsic::flt_rounds: setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32)); return nullptr; case Intrinsic::expect: // Just replace __builtin_expect(exp, c) with EXP. setValue(&I, getValue(I.getArgOperand(0))); return nullptr; case Intrinsic::debugtrap: case Intrinsic::trap: { StringRef TrapFuncName = I.getAttributes() .getAttribute(AttributeList::FunctionIndex, "trap-func-name") .getValueAsString(); if (TrapFuncName.empty()) { ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? ISD::TRAP : ISD::DEBUGTRAP; DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot())); return nullptr; } TargetLowering::ArgListTy Args; TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( CallingConv::C, I.getType(), DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy(DAG.getDataLayout())), std::move(Args)); std::pair Result = TLI.LowerCallTo(CLI); DAG.setRoot(Result.second); return nullptr; } case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: { ISD::NodeType Op; switch (Intrinsic) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break; case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break; case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break; case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break; case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break; case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break; } SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2)); return nullptr; } case Intrinsic::prefetch: { SDValue Ops[5]; unsigned rw = cast(I.getArgOperand(1))->getZExtValue(); auto Flags = rw == 0 ? 
MachineMemOperand::MOLoad :MachineMemOperand::MOStore; Ops[0] = DAG.getRoot(); Ops[1] = getValue(I.getArgOperand(0)); Ops[2] = getValue(I.getArgOperand(1)); Ops[3] = getValue(I.getArgOperand(2)); Ops[4] = getValue(I.getArgOperand(3)); SDValue Result = DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl, DAG.getVTList(MVT::Other), Ops, EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)), 0, /* align */ Flags); // Chain the prefetch in parallell with any pending loads, to stay out of // the way of later optimizations. PendingLoads.push_back(Result); Result = getRoot(); DAG.setRoot(Result); return nullptr; } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { bool IsStart = (Intrinsic == Intrinsic::lifetime_start); // Stack coloring is not enabled in O0, discard region information. if (TM.getOptLevel() == CodeGenOpt::None) return nullptr; SmallVector Allocas; GetUnderlyingObjects(I.getArgOperand(1), Allocas, *DL); for (SmallVectorImpl::iterator Object = Allocas.begin(), E = Allocas.end(); Object != E; ++Object) { AllocaInst *LifetimeObject = dyn_cast_or_null(*Object); // Could not find an Alloca. if (!LifetimeObject) continue; // First check that the Alloca is static, otherwise it won't have a // valid frame index. auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject); if (SI == FuncInfo.StaticAllocaMap.end()) return nullptr; int FI = SI->second; SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = DAG.getFrameIndex(FI, TLI.getFrameIndexTy(DAG.getDataLayout()), true); unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END); Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops); DAG.setRoot(Res); } return nullptr; } case Intrinsic::invariant_start: // Discard region information. setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout()))); return nullptr; case Intrinsic::invariant_end: // Discard region information. return nullptr; case Intrinsic::clear_cache: return TLI.getClearCacheBuiltinName(); case Intrinsic::donothing: // ignore return nullptr; case Intrinsic::experimental_stackmap: visitStackmap(I); return nullptr; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: visitPatchpoint(&I); return nullptr; case Intrinsic::experimental_gc_statepoint: LowerStatepoint(ImmutableStatepoint(&I)); return nullptr; case Intrinsic::experimental_gc_result: visitGCResult(cast(I)); return nullptr; case Intrinsic::experimental_gc_relocate: visitGCRelocate(cast(I)); return nullptr; case Intrinsic::instrprof_increment: llvm_unreachable("instrprof failed to lower an increment"); case Intrinsic::instrprof_value_profile: llvm_unreachable("instrprof failed to lower a value profiling call"); case Intrinsic::localescape: { MachineFunction &MF = DAG.getMachineFunction(); const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); // Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission // is the same on all targets. for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) { Value *Arg = I.getArgOperand(Idx)->stripPointerCasts(); if (isa(Arg)) continue; // Skip null pointers. They represent a hole in index space. 
AllocaInst *Slot = cast(Arg); assert(FuncInfo.StaticAllocaMap.count(Slot) && "can only escape static allocas"); int FI = FuncInfo.StaticAllocaMap[Slot]; MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, TII->get(TargetOpcode::LOCAL_ESCAPE)) .addSym(FrameAllocSym) .addFrameIndex(FI); } return nullptr; } case Intrinsic::localrecover: { // i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx) MachineFunction &MF = DAG.getMachineFunction(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0); // Get the symbol that defines the frame offset. auto *Fn = cast(I.getArgOperand(0)->stripPointerCasts()); auto *Idx = cast(I.getArgOperand(2)); unsigned IdxVal = unsigned(Idx->getLimitedValue(std::numeric_limits::max())); MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal); // Create a MCSymbol for the label to avoid any target lowering // that would make this PC relative. SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT); SDValue OffsetVal = DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym); // Add the offset to the FP. Value *FP = I.getArgOperand(1); SDValue FPVal = getValue(FP); SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal); setValue(&I, Add); return nullptr; } case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { // Get the exception pointer vreg, copy from it, and resize it to fit. const auto *CPI = cast(I.getArgOperand(0)); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT); unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT); if (Intrinsic == Intrinsic::eh_exceptioncode) N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); setValue(&I, N); return nullptr; } case Intrinsic::xray_customevent: { // Here we want to make sure that the intrinsic behaves as if it has a // specific calling convention, and only for x86_64. // FIXME: Support other platforms later. const auto &Triple = DAG.getTarget().getTargetTriple(); if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux()) return nullptr; SDLoc DL = getCurSDLoc(); SmallVector Ops; // We want to say that we always want the arguments in registers. SDValue LogEntryVal = getValue(I.getArgOperand(0)); SDValue StrSizeVal = getValue(I.getArgOperand(1)); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Chain = getRoot(); Ops.push_back(LogEntryVal); Ops.push_back(StrSizeVal); Ops.push_back(Chain); // We need to enforce the calling convention for the callsite, so that // argument ordering is enforced correctly, and that register allocation can // see that some registers may be assumed clobbered and have to preserve // them across calls to the intrinsic. MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL, DL, NodeTys, Ops); SDValue patchableNode = SDValue(MN, 0); DAG.setRoot(patchableNode); setValue(&I, patchableNode); return nullptr; } case Intrinsic::xray_typedevent: { // Here we want to make sure that the intrinsic behaves as if it has a // specific calling convention, and only for x86_64. // FIXME: Support other platforms later. 
const auto &Triple = DAG.getTarget().getTargetTriple(); if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux()) return nullptr; SDLoc DL = getCurSDLoc(); SmallVector Ops; // We want to say that we always want the arguments in registers. // It's unclear to me how manipulating the selection DAG here forces callers // to provide arguments in registers instead of on the stack. SDValue LogTypeId = getValue(I.getArgOperand(0)); SDValue LogEntryVal = getValue(I.getArgOperand(1)); SDValue StrSizeVal = getValue(I.getArgOperand(2)); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Chain = getRoot(); Ops.push_back(LogTypeId); Ops.push_back(LogEntryVal); Ops.push_back(StrSizeVal); Ops.push_back(Chain); // We need to enforce the calling convention for the callsite, so that // argument ordering is enforced correctly, and that register allocation can // see that some registers may be assumed clobbered and have to preserve // them across calls to the intrinsic. MachineSDNode *MN = DAG.getMachineNode( TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, DL, NodeTys, Ops); SDValue patchableNode = SDValue(MN, 0); DAG.setRoot(patchableNode); setValue(&I, patchableNode); return nullptr; } case Intrinsic::experimental_deoptimize: LowerDeoptimizeCall(&I); return nullptr; case Intrinsic::experimental_vector_reduce_fadd: case Intrinsic::experimental_vector_reduce_fmul: case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: case Intrinsic::experimental_vector_reduce_and: case Intrinsic::experimental_vector_reduce_or: case Intrinsic::experimental_vector_reduce_xor: case Intrinsic::experimental_vector_reduce_smax: case Intrinsic::experimental_vector_reduce_smin: case Intrinsic::experimental_vector_reduce_umax: case Intrinsic::experimental_vector_reduce_umin: case Intrinsic::experimental_vector_reduce_fmax: case Intrinsic::experimental_vector_reduce_fmin: visitVectorReduce(I, Intrinsic); return nullptr; case Intrinsic::icall_branch_funnel: { SmallVector Ops; Ops.push_back(DAG.getRoot()); Ops.push_back(getValue(I.getArgOperand(0))); int64_t Offset; auto *Base = dyn_cast(GetPointerBaseWithConstantOffset( I.getArgOperand(1), Offset, DAG.getDataLayout())); if (!Base) report_fatal_error( "llvm.icall.branch.funnel operand must be a GlobalValue"); Ops.push_back(DAG.getTargetGlobalAddress(Base, getCurSDLoc(), MVT::i64, 0)); struct BranchFunnelTarget { int64_t Offset; SDValue Target; }; SmallVector Targets; for (unsigned Op = 1, N = I.getNumArgOperands(); Op != N; Op += 2) { auto *ElemBase = dyn_cast(GetPointerBaseWithConstantOffset( I.getArgOperand(Op), Offset, DAG.getDataLayout())); if (ElemBase != Base) report_fatal_error("all llvm.icall.branch.funnel operands must refer " "to the same GlobalValue"); SDValue Val = getValue(I.getArgOperand(Op + 1)); auto *GA = dyn_cast(Val); if (!GA) report_fatal_error( "llvm.icall.branch.funnel operand must be a GlobalValue"); Targets.push_back({Offset, DAG.getTargetGlobalAddress( GA->getGlobal(), getCurSDLoc(), Val.getValueType(), GA->getOffset())}); } llvm::sort(Targets.begin(), Targets.end(), [](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) { return T1.Offset < T2.Offset; }); for (auto &T : Targets) { Ops.push_back(DAG.getTargetConstant(T.Offset, getCurSDLoc(), MVT::i32)); Ops.push_back(T.Target); } SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL, getCurSDLoc(), MVT::Other, Ops), 0); DAG.setRoot(N); setValue(&I, N); HasTailCall = true; return nullptr; } case Intrinsic::wasm_landingpad_index: { // TODO store 
landing pad index in a map, which will be used when generating // LSDA information return nullptr; } } } void SelectionDAGBuilder::visitConstrainedFPIntrinsic( const ConstrainedFPIntrinsic &FPI) { SDLoc sdl = getCurSDLoc(); unsigned Opcode; switch (FPI.getIntrinsicID()) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::experimental_constrained_fadd: Opcode = ISD::STRICT_FADD; break; case Intrinsic::experimental_constrained_fsub: Opcode = ISD::STRICT_FSUB; break; case Intrinsic::experimental_constrained_fmul: Opcode = ISD::STRICT_FMUL; break; case Intrinsic::experimental_constrained_fdiv: Opcode = ISD::STRICT_FDIV; break; case Intrinsic::experimental_constrained_frem: Opcode = ISD::STRICT_FREM; break; case Intrinsic::experimental_constrained_fma: Opcode = ISD::STRICT_FMA; break; case Intrinsic::experimental_constrained_sqrt: Opcode = ISD::STRICT_FSQRT; break; case Intrinsic::experimental_constrained_pow: Opcode = ISD::STRICT_FPOW; break; case Intrinsic::experimental_constrained_powi: Opcode = ISD::STRICT_FPOWI; break; case Intrinsic::experimental_constrained_sin: Opcode = ISD::STRICT_FSIN; break; case Intrinsic::experimental_constrained_cos: Opcode = ISD::STRICT_FCOS; break; case Intrinsic::experimental_constrained_exp: Opcode = ISD::STRICT_FEXP; break; case Intrinsic::experimental_constrained_exp2: Opcode = ISD::STRICT_FEXP2; break; case Intrinsic::experimental_constrained_log: Opcode = ISD::STRICT_FLOG; break; case Intrinsic::experimental_constrained_log10: Opcode = ISD::STRICT_FLOG10; break; case Intrinsic::experimental_constrained_log2: Opcode = ISD::STRICT_FLOG2; break; case Intrinsic::experimental_constrained_rint: Opcode = ISD::STRICT_FRINT; break; case Intrinsic::experimental_constrained_nearbyint: Opcode = ISD::STRICT_FNEARBYINT; break; } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = getRoot(); SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs); ValueVTs.push_back(MVT::Other); // Out chain SDVTList VTs = DAG.getVTList(ValueVTs); SDValue Result; if (FPI.isUnaryOp()) Result = DAG.getNode(Opcode, sdl, VTs, { Chain, getValue(FPI.getArgOperand(0)) }); else if (FPI.isTernaryOp()) Result = DAG.getNode(Opcode, sdl, VTs, { Chain, getValue(FPI.getArgOperand(0)), getValue(FPI.getArgOperand(1)), getValue(FPI.getArgOperand(2)) }); else Result = DAG.getNode(Opcode, sdl, VTs, { Chain, getValue(FPI.getArgOperand(0)), getValue(FPI.getArgOperand(1)) }); assert(Result.getNode()->getNumValues() == 2); SDValue OutChain = Result.getValue(1); DAG.setRoot(OutChain); SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); } std::pair SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, const BasicBlock *EHPadBB) { MachineFunction &MF = DAG.getMachineFunction(); MachineModuleInfo &MMI = MF.getMMI(); MCSymbol *BeginLabel = nullptr; if (EHPadBB) { // Insert a label before the invoke call to mark the try range. This can be // used to detect deletion of the invoke via the MachineModuleInfo. BeginLabel = MMI.getContext().createTempSymbol(); // For SjLj, keep track of which landing pads go with which invokes // so as to maintain the ordering of pads in the LSDA. unsigned CallSiteIndex = MMI.getCurrentCallSite(); if (CallSiteIndex) { MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex); LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex); // Now that the call site is handled, stop tracking it. 
MMI.setCurrentCallSite(0); } // Both PendingLoads and PendingExports must be flushed here; // this call might not return. (void)getRoot(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel)); CLI.setChain(getRoot()); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::pair Result = TLI.LowerCallTo(CLI); assert((CLI.IsTailCall || Result.second.getNode()) && "Non-null chain expected with non-tail call!"); assert((Result.second.getNode() || !Result.first.getNode()) && "Null value expected with tail call!"); if (!Result.second.getNode()) { // As a special case, a null chain means that a tail call has been emitted // and the DAG root is already updated. HasTailCall = true; // Since there's no actual continuation from this block, nothing can be // relying on us setting vregs for them. PendingExports.clear(); } else { DAG.setRoot(Result.second); } if (EHPadBB) { // Insert a label at the end of the invoke call to mark the try range. This // can be used to detect deletion of the invoke via the MachineModuleInfo. MCSymbol *EndLabel = MMI.getContext().createTempSymbol(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel)); // Inform MachineModuleInfo of range. auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); // There is a platform (e.g. wasm) that uses funclet style IR but does not // actually use outlined funclets and their LSDA info style. if (MF.hasEHFunclets() && isFuncletEHPersonality(Pers)) { assert(CLI.CS); WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo(); EHInfo->addIPToStateRange(cast(CLI.CS.getInstruction()), BeginLabel, EndLabel); } else { MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel); } } return Result; } void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool isTailCall, const BasicBlock *EHPadBB) { auto &DL = DAG.getDataLayout(); FunctionType *FTy = CS.getFunctionType(); Type *RetTy = CS.getType(); TargetLowering::ArgListTy Args; Args.reserve(CS.arg_size()); const Value *SwiftErrorVal = nullptr; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // We can't tail call inside a function with a swifterror argument. Lowering // does not support this yet. It would have to move into the swifterror // register before the call. auto *Caller = CS.getInstruction()->getParent()->getParent(); if (TLI.supportSwiftError() && Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) isTailCall = false; for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { TargetLowering::ArgListEntry Entry; const Value *V = *i; // Skip empty types if (V->getType()->isEmptyTy()) continue; SDValue ArgNode = getValue(V); Entry.Node = ArgNode; Entry.Ty = V->getType(); Entry.setAttributes(&CS, i - CS.arg_begin()); // Use swifterror virtual register as input to the call. if (Entry.IsSwiftError && TLI.supportSwiftError()) { SwiftErrorVal = V; // We find the virtual register for the actual swifterror argument. // Instead of using the Value, we use the virtual register instead. Entry.Node = DAG.getRegister(FuncInfo .getOrCreateSwiftErrorVRegUseAt( CS.getInstruction(), FuncInfo.MBB, V) .first, EVT(TLI.getPointerTy(DL))); } Args.push_back(Entry); // If we have an explicit sret argument that is an Instruction, (i.e., it // might point to function-local memory), we can't meaningfully tail-call. if (Entry.IsSRet && isa(V)) isTailCall = false; } // Check if target-independent constraints permit a tail call here. 
// Target-dependent constraints are checked within TLI->LowerCallTo. if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget())) isTailCall = false; // Disable tail calls if there is an swifterror argument. Targets have not // been updated to support tail calls. if (TLI.supportSwiftError() && SwiftErrorVal) isTailCall = false; TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()) .setChain(getRoot()) .setCallee(RetTy, FTy, Callee, std::move(Args), CS) .setTailCall(isTailCall) .setConvergent(CS.isConvergent()); std::pair Result = lowerInvokable(CLI, EHPadBB); if (Result.first.getNode()) { const Instruction *Inst = CS.getInstruction(); Result.first = lowerRangeToAssertZExt(DAG, *Inst, Result.first); setValue(Inst, Result.first); } // The last element of CLI.InVals has the SDValue for swifterror return. // Here we copy it to a virtual register and update SwiftErrorMap for // book-keeping. if (SwiftErrorVal && TLI.supportSwiftError()) { // Get the last element of InVals. SDValue Src = CLI.InVals.back(); unsigned VReg; bool CreatedVReg; std::tie(VReg, CreatedVReg) = FuncInfo.getOrCreateSwiftErrorVRegDefAt(CS.getInstruction()); SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src); // We update the virtual register for the actual swifterror argument. if (CreatedVReg) FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg); DAG.setRoot(CopyNode); } } static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, SelectionDAGBuilder &Builder) { // Check to see if this load can be trivially constant folded, e.g. if the // input is from a string literal. if (const Constant *LoadInput = dyn_cast(PtrVal)) { // Cast pointer to the type we really want to load. Type *LoadTy = Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits()); if (LoadVT.isVector()) LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements()); LoadInput = ConstantExpr::getBitCast(const_cast(LoadInput), PointerType::getUnqual(LoadTy)); if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr( const_cast(LoadInput), LoadTy, *Builder.DL)) return Builder.getValue(LoadCst); } // Otherwise, we have to emit the load. If the pointer is to unfoldable but // still constant memory, the input chain can be the entry node. SDValue Root; bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { // Do not serialize non-volatile loads against each other. Root = Builder.DAG.getRoot(); } SDValue Ptr = Builder.getValue(PtrVal); SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root, Ptr, MachinePointerInfo(PtrVal), /* Alignment = */ 1); if (!ConstantMemory) Builder.PendingLoads.push_back(LoadVal.getValue(1)); return LoadVal; } /// Record the value for an instruction that produces an integer result, /// converting the type where necessary. void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, SDValue Value, bool IsSigned) { EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType(), true); if (IsSigned) Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT); else Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT); setValue(&I, Value); } /// See if we can lower a memcmp call into an optimized form. If so, return /// true and lower it. Otherwise return false, and it will be lowered like a /// normal call. 
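/// For example, when the result of memcmp(a, b, 4) is only compared against
/// zero, the call can be replaced by two i32 loads and a single SETNE node.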
/// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); const Value *Size = I.getArgOperand(2); const ConstantInt *CSize = dyn_cast(Size); if (CSize && CSize->getZExtValue() == 0) { EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType(), true); setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT)); return true; } const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForMemcmp( DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); return true; } // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I)) return false; // If the target has a fast compare for the given size, it will return a // preferred load type for that size. Require that the load VT is legal and // that the target supports unaligned loads of that type. Otherwise, return // INVALID. auto hasFastLoadsAndCompare = [&](unsigned NumBits) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT LVT = TLI.hasFastEqualityCompare(NumBits); if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) { // TODO: Handle 5 byte compare as 4-byte + 1 byte. // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. // TODO: Check alignment of src and dest ptrs. unsigned DstAS = LHS->getType()->getPointerAddressSpace(); unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); if (!TLI.isTypeLegal(LVT) || !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) || !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS)) LVT = MVT::INVALID_SIMPLE_VALUE_TYPE; } return LVT; }; // This turns into unaligned loads. We only do this if the target natively // supports the MVT we'll be loading or if it is small enough (<= 4) that // we'll only produce a small number of byte loads. MVT LoadVT; unsigned NumBitsToCompare = CSize->getZExtValue() * 8; switch (NumBitsToCompare) { default: return false; case 16: LoadVT = MVT::i16; break; case 32: LoadVT = MVT::i32; break; case 64: case 128: case 256: LoadVT = hasFastLoadsAndCompare(NumBitsToCompare); break; } if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE) return false; SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this); SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this); // Bitcast to a wide integer type if the loads are vectors. if (LoadVT.isVector()) { EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits()); LoadL = DAG.getBitcast(CmpVT, LoadL); LoadR = DAG.getBitcast(CmpVT, LoadR); } SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE); processIntegerCallValue(I, Cmp, false); return true; } /// See if we can lower a memchr call into an optimized form. If so, return /// true and lower it. Otherwise return false, and it will be lowered like a /// normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. 
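/// For example, SystemZ implements EmitTargetCodeForMemchr with an SRST-based
/// search loop; targets without such a hook return a null SDValue and the
/// plain libcall is emitted instead.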
bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { const Value *Src = I.getArgOperand(0); const Value *Char = I.getArgOperand(1); const Value *Length = I.getArgOperand(2); const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Src), getValue(Char), getValue(Length), MachinePointerInfo(Src)); if (Res.first.getNode()) { setValue(&I, Res.first); PendingLoads.push_back(Res.second); return true; } return false; } /// See if we can lower a mempcpy call into an optimized form. If so, return /// true and lower it. Otherwise return false, and it will be lowered like a /// normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { SDValue Dst = getValue(I.getArgOperand(0)); SDValue Src = getValue(I.getArgOperand(1)); SDValue Size = getValue(I.getArgOperand(2)); unsigned DstAlign = DAG.InferPtrAlignment(Dst); unsigned SrcAlign = DAG.InferPtrAlignment(Src); unsigned Align = std::min(DstAlign, SrcAlign); if (Align == 0) // Alignment of one or both could not be inferred. Align = 1; // 0 and 1 both specify no alignment, but 0 is reserved. bool isVol = false; SDLoc sdl = getCurSDLoc(); // In the mempcpy context we need to pass in a false value for isTailCall // because the return pointer needs to be adjusted by the size of // the copied memory. SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Align, isVol, false, /*isTailCall=*/false, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1))); assert(MC.getNode() != nullptr && "** memcpy should not be lowered as TailCall in mempcpy context **"); DAG.setRoot(MC); // Check if Size needs to be truncated or extended. Size = DAG.getSExtOrTrunc(Size, sdl, Dst.getValueType()); // Adjust return pointer to point just past the last dst byte. SDValue DstPlusSize = DAG.getNode(ISD::ADD, sdl, Dst.getValueType(), Dst, Size); setValue(&I, DstPlusSize); return true; } /// See if we can lower a strcpy call into an optimized form. If so, return /// true and lower it, otherwise return false and it will be lowered like a /// normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(), getValue(Arg0), getValue(Arg1), MachinePointerInfo(Arg0), MachinePointerInfo(Arg1), isStpcpy); if (Res.first.getNode()) { setValue(&I, Res.first); DAG.setRoot(Res.second); return true; } return false; } /// See if we can lower a strcmp call into an optimized form. If so, return /// true and lower it, otherwise return false and it will be lowered like a /// normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. 
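/// As with the memchr lowering above, EmitTargetCodeForStrcmp may return a
/// null first value, in which case the ordinary strcmp libcall is emitted.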
bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Arg0), getValue(Arg1), MachinePointerInfo(Arg0), MachinePointerInfo(Arg1)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); return true; } return false; } /// See if we can lower a strlen call into an optimized form. If so, return /// true and lower it, otherwise return false and it will be lowered like a /// normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { const Value *Arg0 = I.getArgOperand(0); const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Arg0), MachinePointerInfo(Arg0)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, false); PendingLoads.push_back(Res.second); return true; } return false; } /// See if we can lower a strnlen call into an optimized form. If so, return /// true and lower it, otherwise return false and it will be lowered like a /// normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair Res = TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(), getValue(Arg0), getValue(Arg1), MachinePointerInfo(Arg0)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, false); PendingLoads.push_back(Res.second); return true; } return false; } /// See if we can lower a unary floating-point operation into an SDNode with /// the specified Opcode. If so, return true and lower it, otherwise return /// false and it will be lowered like a normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. if (!I.onlyReadsMemory()) return false; SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp)); return true; } /// See if we can lower a binary floating-point operation into an SDNode with /// the specified Opcode. If so, return true and lower it. Otherwise return /// false, and it will be lowered like a normal call. /// The caller already checked that \p I calls the appropriate LibFunc with a /// correct prototype. bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. if (!I.onlyReadsMemory()) return false; SDValue Tmp0 = getValue(I.getArgOperand(0)); SDValue Tmp1 = getValue(I.getArgOperand(1)); EVT VT = Tmp0.getValueType(); setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1)); return true; } void SelectionDAGBuilder::visitCall(const CallInst &I) { // Handle inline assembly differently. 
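  // For example, a call such as
  //   call void asm sideeffect "nop", ""()
  // is routed to visitInlineAsm below instead of the generic call lowering.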
if (isa(I.getCalledValue())) { visitInlineAsm(&I); return; } MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); computeUsesVAFloatArgument(I, MMI); const char *RenameFn = nullptr; if (Function *F = I.getCalledFunction()) { if (F->isDeclaration()) { // Is this an LLVM intrinsic or a target-specific intrinsic? unsigned IID = F->getIntrinsicID(); if (!IID) if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) IID = II->getIntrinsicID(F); if (IID) { RenameFn = visitIntrinsicCall(I, IID); if (!RenameFn) return; } } // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for // some reason or the call site requires strict floating point semantics. LibFunc Func; if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() && F->hasName() && LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl: // We already checked this call's prototype; verify it doesn't modify // errno. if (I.onlyReadsMemory()) { SDValue LHS = getValue(I.getArgOperand(0)); SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(), LHS.getValueType(), LHS, RHS)); return; } break; case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl: if (visitUnaryFloatCall(I, ISD::FABS)) return; break; case LibFunc_fmin: case LibFunc_fminf: case LibFunc_fminl: if (visitBinaryFloatCall(I, ISD::FMINNUM)) return; break; case LibFunc_fmax: case LibFunc_fmaxf: case LibFunc_fmaxl: if (visitBinaryFloatCall(I, ISD::FMAXNUM)) return; break; case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl: if (visitUnaryFloatCall(I, ISD::FSIN)) return; break; case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: case LibFunc_sqrt_finite: case LibFunc_sqrtf_finite: case LibFunc_sqrtl_finite: if (visitUnaryFloatCall(I, ISD::FSQRT)) return; break; case LibFunc_floor: case LibFunc_floorf: case LibFunc_floorl: if (visitUnaryFloatCall(I, ISD::FFLOOR)) return; break; case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_nearbyintl: if (visitUnaryFloatCall(I, ISD::FNEARBYINT)) return; break; case LibFunc_ceil: case LibFunc_ceilf: case LibFunc_ceill: if (visitUnaryFloatCall(I, ISD::FCEIL)) return; break; case LibFunc_rint: case LibFunc_rintf: case LibFunc_rintl: if (visitUnaryFloatCall(I, ISD::FRINT)) return; break; case LibFunc_round: case LibFunc_roundf: case LibFunc_roundl: if (visitUnaryFloatCall(I, ISD::FROUND)) return; break; case LibFunc_trunc: case LibFunc_truncf: case LibFunc_truncl: if (visitUnaryFloatCall(I, ISD::FTRUNC)) return; break; case LibFunc_log2: case LibFunc_log2f: case LibFunc_log2l: if (visitUnaryFloatCall(I, ISD::FLOG2)) return; break; case LibFunc_exp2: case LibFunc_exp2f: case LibFunc_exp2l: if (visitUnaryFloatCall(I, ISD::FEXP2)) return; break; case LibFunc_memcmp: if (visitMemCmpCall(I)) return; break; case LibFunc_mempcpy: if (visitMemPCpyCall(I)) return; break; case LibFunc_memchr: if (visitMemChrCall(I)) return; break; case LibFunc_strcpy: if (visitStrCpyCall(I, false)) return; break; case LibFunc_stpcpy: if (visitStrCpyCall(I, true)) return; break; case LibFunc_strcmp: if (visitStrCmpCall(I)) return; break; case LibFunc_strlen: if (visitStrLenCall(I)) return; break; case LibFunc_strnlen: if (visitStrNLenCall(I)) return; break; } } } SDValue Callee; if 
(!RenameFn) Callee = getValue(I.getCalledValue()); else Callee = DAG.getExternalSymbol( RenameFn, DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())); // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't // have to do anything here to lower funclet bundles. assert(!I.hasOperandBundlesOtherThan( {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) && "Cannot lower calls with arbitrary operand bundles!"); if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) LowerCallSiteWithDeoptBundle(&I, Callee, nullptr); else // Check if we can potentially perform a tail call. More detailed checking // is be done within LowerCallTo, after more information about the call is // known. LowerCallTo(&I, Callee, I.isTailCall()); } namespace { /// AsmOperandInfo - This contains information for each constraint that we are /// lowering. class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo { public: /// CallOperand - If this is the result output operand or a clobber /// this is null, otherwise it is the incoming operand to the CallInst. /// This gets modified as the asm is processed. SDValue CallOperand; /// AssignedRegs - If this is a register or register class operand, this /// contains the set of register corresponding to the operand. RegsForValue AssignedRegs; explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info) : TargetLowering::AsmOperandInfo(info), CallOperand(nullptr, 0) { } /// Whether or not this operand accesses memory bool hasMemory(const TargetLowering &TLI) const { // Indirect operand accesses access memory. if (isIndirect) return true; for (const auto &Code : Codes) if (TLI.getConstraintType(Code) == TargetLowering::C_Memory) return true; return false; } /// getCallOperandValEVT - Return the EVT of the Value* that this operand /// corresponds to. If there is no Value* for this operand, it returns /// MVT::Other. EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI, const DataLayout &DL) const { if (!CallOperandVal) return MVT::Other; if (isa(CallOperandVal)) return TLI.getPointerTy(DL); llvm::Type *OpTy = CallOperandVal->getType(); // FIXME: code duplicated from TargetLowering::ParseConstraints(). // If this is an indirect operand, the operand is a pointer to the // accessed type. if (isIndirect) { PointerType *PtrTy = dyn_cast(OpTy); if (!PtrTy) report_fatal_error("Indirect operand for inline asm not a pointer!"); OpTy = PtrTy->getElementType(); } // Look for vector wrapped in a struct. e.g. { <16 x i8> }. if (StructType *STy = dyn_cast(OpTy)) if (STy->getNumElements() == 1) OpTy = STy->getElementType(0); // If OpTy is not a single value, it may be a struct/union that we // can tile with integers. if (!OpTy->isSingleValueType() && OpTy->isSized()) { unsigned BitSize = DL.getTypeSizeInBits(OpTy); switch (BitSize) { default: break; case 1: case 8: case 16: case 32: case 64: case 128: OpTy = IntegerType::get(Context, BitSize); break; } } return TLI.getValueType(DL, OpTy, true); } }; using SDISelAsmOperandInfoVector = SmallVector; } // end anonymous namespace /// Make sure that the output operand \p OpInfo and its corresponding input /// operand \p MatchingOpInfo have compatible constraint types (otherwise error /// out). 
static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo, SDISelAsmOperandInfo &MatchingOpInfo, SelectionDAG &DAG) { if (OpInfo.ConstraintVT == MatchingOpInfo.ConstraintVT) return; const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); const auto &TLI = DAG.getTargetLoweringInfo(); std::pair MatchRC = TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode, OpInfo.ConstraintVT); std::pair InputRC = TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode, MatchingOpInfo.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != MatchingOpInfo.ConstraintVT.isInteger()) || (MatchRC.second != InputRC.second)) { // FIXME: error out in a more elegant fashion report_fatal_error("Unsupported asm: input constraint" " with a matching output constraint of" " incompatible type!"); } MatchingOpInfo.ConstraintVT = OpInfo.ConstraintVT; } /// Get a direct memory input to behave well as an indirect operand. /// This may introduce stores, hence the need for a \p Chain. /// \return The (possibly updated) chain. static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location, SDISelAsmOperandInfo &OpInfo, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we don't have an indirect input, put it in the constpool if we can, // otherwise spill it to a stack slot. // TODO: This isn't quite right. We need to handle these according to // the addressing mode that the constraint wants. Also, this may take // an additional register for the computation and we don't want that // either. // If the operand is a float, integer, or vector constant, spill to a // constant pool entry to get its address. const Value *OpVal = OpInfo.CallOperandVal; if (isa(OpVal) || isa(OpVal) || isa(OpVal) || isa(OpVal)) { OpInfo.CallOperand = DAG.getConstantPool( cast(OpVal), TLI.getPointerTy(DAG.getDataLayout())); return Chain; } // Otherwise, create a stack slot and emit a store to it before the asm. Type *Ty = OpVal->getType(); auto &DL = DAG.getDataLayout(); uint64_t TySize = DL.getTypeAllocSize(Ty); unsigned Align = DL.getPrefTypeAlignment(Ty); MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL)); Chain = DAG.getStore(Chain, Location, OpInfo.CallOperand, StackSlot, MachinePointerInfo::getFixedStack(MF, SSFI)); OpInfo.CallOperand = StackSlot; return Chain; } /// GetRegistersForValue - Assign registers (virtual or physical) for the /// specified operand. We prefer to assign virtual registers, to allow the /// register allocator to handle the assignment process. However, if the asm /// uses features that we can't model on machineinstrs, we have SDISel do the /// allocation. This produces generally horrible, but correct, code. /// /// OpInfo describes the operand /// RefOpInfo describes the matching operand if any, the operand otherwise static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &DL, SDISelAsmOperandInfo &OpInfo, SDISelAsmOperandInfo &RefOpInfo) { LLVMContext &Context = *DAG.getContext(); MachineFunction &MF = DAG.getMachineFunction(); SmallVector Regs; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // If this is a constraint for a single physreg, or a constraint for a // register class, find it. 
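  // For example, on x86 the constraint "{ax}" names the single AX/EAX/RAX
  // register (depending on the value type), while "r" yields a general
  // purpose register class and leaves the choice to the register allocator.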
std::pair PhysReg = TLI.getRegForInlineAsmConstraint(&TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT); unsigned NumRegs = 1; if (OpInfo.ConstraintVT != MVT::Other) { // If this is a FP operand in an integer register (or visa versa), or more // generally if the operand value disagrees with the register class we plan // to stick it in, fix the operand type. // // If this is an input value, the bitcast to the new type is done now. // Bitcast for output value is done at the end of visitInlineAsm(). if ((OpInfo.Type == InlineAsm::isOutput || OpInfo.Type == InlineAsm::isInput) && PhysReg.second && !TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) { // Try to convert to the first EVT that the reg class contains. If the // types are identical size, use a bitcast to convert (e.g. two differing // vector types). Note: output bitcast is done at the end of // visitInlineAsm(). MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second); if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) { // Exclude indirect inputs while they are unsupported because the code // to perform the load is missing and thus OpInfo.CallOperand still // refer to the input address rather than the pointed-to value. if (OpInfo.Type == InlineAsm::isInput && !OpInfo.isIndirect) OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand); OpInfo.ConstraintVT = RegVT; // If the operand is a FP value and we want it in integer registers, // use the corresponding integer type. This turns an f64 value into // i64, which can be passed with two i32 values on a 32-bit machine. } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) { RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits()); if (OpInfo.Type == InlineAsm::isInput) OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand); OpInfo.ConstraintVT = RegVT; } } NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT); } // No need to allocate a matching input constraint since the constraint it's // matching to has already been allocated. if (OpInfo.isMatchingInputConstraint()) return; MVT RegVT; EVT ValueVT = OpInfo.ConstraintVT; // If this is a constraint for a specific physical register, like {r17}, // assign it now. if (unsigned AssignedReg = PhysReg.first) { const TargetRegisterClass *RC = PhysReg.second; if (OpInfo.ConstraintVT == MVT::Other) ValueVT = *TRI.legalclasstypes_begin(*RC); // Get the actual register value type. This is important, because the user // may have asked for (e.g.) the AX register in i32 type. We need to // remember that AX is actually i16 to get the right extension. RegVT = *TRI.legalclasstypes_begin(*RC); // This is a explicit reference to a physical register. Regs.push_back(AssignedReg); // If this is an expanded reference, add the rest of the regs to Regs. if (NumRegs != 1) { TargetRegisterClass::iterator I = RC->begin(); for (; *I != AssignedReg; ++I) assert(I != RC->end() && "Didn't find reg!"); // Already added the first reg. --NumRegs; ++I; for (; NumRegs; --NumRegs, ++I) { assert(I != RC->end() && "Ran out of registers to allocate!"); Regs.push_back(*I); } } OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); return; } // Otherwise, if this was a reference to an LLVM register class, create vregs // for this reference. if (const TargetRegisterClass *RC = PhysReg.second) { RegVT = *TRI.legalclasstypes_begin(*RC); if (OpInfo.ConstraintVT == MVT::Other) ValueVT = RegVT; // Create the appropriate number of virtual registers. 
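  // For example, an i64 operand constrained to "r" on a 32-bit x86 target is
  // split into two GR32-class virtual registers (NumRegs == 2).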
MachineRegisterInfo &RegInfo = MF.getRegInfo(); for (; NumRegs; --NumRegs) Regs.push_back(RegInfo.createVirtualRegister(RC)); OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); return; } // Otherwise, we couldn't allocate enough registers for this. } static unsigned findMatchingInlineAsmOperand(unsigned OperandNo, const std::vector &AsmNodeOperands) { // Scan until we find the definition we already emitted of this operand. unsigned CurOp = InlineAsm::Op_FirstOperand; for (; OperandNo; --OperandNo) { // Advance to the next operand. unsigned OpFlag = cast(AsmNodeOperands[CurOp])->getZExtValue(); assert((InlineAsm::isRegDefKind(OpFlag) || InlineAsm::isRegDefEarlyClobberKind(OpFlag) || InlineAsm::isMemKind(OpFlag)) && "Skipped past definitions?"); CurOp += InlineAsm::getNumOperandRegisters(OpFlag) + 1; } return CurOp; } /// Fill \p Regs with \p NumRegs new virtual registers of type \p RegVT /// \return true if it has succeeded, false otherwise static bool createVirtualRegs(SmallVector &Regs, unsigned NumRegs, MVT RegVT, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo(); for (unsigned i = 0, e = NumRegs; i != e; ++i) { if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT)) Regs.push_back(RegInfo.createVirtualRegister(RC)); else return false; } return true; } namespace { class ExtraFlags { unsigned Flags = 0; public: explicit ExtraFlags(ImmutableCallSite CS) { const InlineAsm *IA = cast(CS.getCalledValue()); if (IA->hasSideEffects()) Flags |= InlineAsm::Extra_HasSideEffects; if (IA->isAlignStack()) Flags |= InlineAsm::Extra_IsAlignStack; if (CS.isConvergent()) Flags |= InlineAsm::Extra_IsConvergent; Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect; } void update(const TargetLowering::AsmOperandInfo &OpInfo) { // Ideally, we would only check against memory constraints. However, the // meaning of an Other constraint can be target-specific and we can't easily // reason about it. Therefore, be conservative and set MayLoad/MayStore // for Other constraints as well. if (OpInfo.ConstraintType == TargetLowering::C_Memory || OpInfo.ConstraintType == TargetLowering::C_Other) { if (OpInfo.Type == InlineAsm::isInput) Flags |= InlineAsm::Extra_MayLoad; else if (OpInfo.Type == InlineAsm::isOutput) Flags |= InlineAsm::Extra_MayStore; else if (OpInfo.Type == InlineAsm::isClobber) Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore); } } unsigned get() const { return Flags; } }; } // end anonymous namespace /// visitInlineAsm - Handle a call to an InlineAsm object. void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { const InlineAsm *IA = cast(CS.getCalledValue()); /// ConstraintOperands - Information about all of the constraints. SDISelAsmOperandInfoVector ConstraintOperands; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints( DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS); bool hasMemory = false; // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore ExtraFlags ExtraInfo(CS); unsigned ArgNo = 0; // ArgNo - The argument of the CallInst. unsigned ResNo = 0; // ResNo - The result number of the next output. 
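  // First pass over the constraints: compute the operand value and value type
  // for each constraint and accumulate the extra flags (side effects,
  // may-load/may-store, asm dialect) that end up on the INLINEASM node.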
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { ConstraintOperands.push_back(SDISelAsmOperandInfo(TargetConstraints[i])); SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back(); MVT OpVT = MVT::Other; // Compute the value type for each operand. if (OpInfo.Type == InlineAsm::isInput || (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) { OpInfo.CallOperandVal = const_cast(CS.getArgument(ArgNo++)); // Process the call argument. BasicBlocks are labels, currently appearing // only in asm's. if (const BasicBlock *BB = dyn_cast(OpInfo.CallOperandVal)) { OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]); } else { OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); } OpVT = OpInfo .getCallOperandValEVT(*DAG.getContext(), TLI, DAG.getDataLayout()) .getSimpleVT(); } if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) { // The return value of the call is this value. As such, there is no // corresponding argument. assert(!CS.getType()->isVoidTy() && "Bad inline asm!"); if (StructType *STy = dyn_cast(CS.getType())) { OpVT = TLI.getSimpleValueType(DAG.getDataLayout(), STy->getElementType(ResNo)); } else { assert(ResNo == 0 && "Asm only has one result!"); OpVT = TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType()); } ++ResNo; } OpInfo.ConstraintVT = OpVT; if (!hasMemory) hasMemory = OpInfo.hasMemory(TLI); // Determine if this InlineAsm MayLoad or MayStore based on the constraints. // FIXME: Could we compute this on OpInfo rather than TargetConstraints[i]? auto TargetConstraint = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(TargetConstraint, SDValue()); ExtraInfo.update(TargetConstraint); } SDValue Chain, Flag; // We won't need to flush pending loads if this asm doesn't touch // memory and is nonvolatile. if (hasMemory || IA->hasSideEffects()) Chain = getRoot(); else Chain = DAG.getRoot(); // Second pass over the constraints: compute which constraint option to use // and assign registers to constraints that want a specific physreg. for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) { SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i]; // If this is an output operand with a matching input operand, look up the // matching input. If their types mismatch, e.g. one is an integer, the // other is floating point, or their sizes are different, flag it as an // error. if (OpInfo.hasMatchingInput()) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; patchMatchingInput(OpInfo, Input, DAG); } // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.Type == InlineAsm::isClobber) continue; // If this is a memory input, and if the operand is not indirect, do what we // need to provide an address for the memory input. if (OpInfo.ConstraintType == TargetLowering::C_Memory && !OpInfo.isIndirect) { assert((OpInfo.isMultipleAlternative || (OpInfo.Type == InlineAsm::isInput)) && "Can only indirectify direct input operands!"); // Memory operands really want the address of the value. Chain = getAddressForMemoryInput(Chain, getCurSDLoc(), OpInfo, DAG); // There is no longer a Value* corresponding to this operand. OpInfo.CallOperandVal = nullptr; // It is now an indirect operand. OpInfo.isIndirect = true; } // If this constraint is for a specific register, allocate it before // anything else. 
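  // For example, "={eax}" must be given EAX itself here, whereas a plain
  // "=r" operand can wait for the register-class allocation in the third
  // pass below.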
SDISelAsmOperandInfo &RefOpInfo = OpInfo.isMatchingInputConstraint() ? ConstraintOperands[OpInfo.getMatchedOperand()] : ConstraintOperands[i]; if (RefOpInfo.ConstraintType == TargetLowering::C_Register) GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo, RefOpInfo); } // Third pass - Loop over all of the operands, assigning virtual or physregs // to register class operands. for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) { SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i]; SDISelAsmOperandInfo &RefOpInfo = OpInfo.isMatchingInputConstraint() ? ConstraintOperands[OpInfo.getMatchedOperand()] : ConstraintOperands[i]; // C_Register operands have already been allocated, Other/Memory don't need // to be. if (RefOpInfo.ConstraintType == TargetLowering::C_RegisterClass) GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo, RefOpInfo); } // AsmNodeOperands - The operands for the ISD::INLINEASM node. std::vector AsmNodeOperands; AsmNodeOperands.push_back(SDValue()); // reserve space for input chain AsmNodeOperands.push_back(DAG.getTargetExternalSymbol( IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout()))); // If we have a !srcloc metadata node associated with it, we want to attach // this to the ultimately generated inline asm machineinstr. To do this, we // pass in the third operand as this (potentially null) inline asm MDNode. const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc"); AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc)); // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore // bits as operand 3. AsmNodeOperands.push_back(DAG.getTargetConstant( ExtraInfo.get(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); // Loop over all of the inputs, copying the operand values into the // appropriate registers and processing the output regs. RegsForValue RetValRegs; // IndirectStoresToEmit - The set of stores to emit after the inline asm node. std::vector> IndirectStoresToEmit; for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) { SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i]; switch (OpInfo.Type) { case InlineAsm::isOutput: if (OpInfo.ConstraintType != TargetLowering::C_RegisterClass && OpInfo.ConstraintType != TargetLowering::C_Register) { // Memory output, or 'other' output (e.g. 'X' constraint). assert(OpInfo.isIndirect && "Memory output must be indirect operand"); unsigned ConstraintID = TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode); assert(ConstraintID != InlineAsm::Constraint_Unknown && "Failed to convert memory constraint code to constraint id."); // Add information to the INLINEASM node to know about this output. unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1); OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID); AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, getCurSDLoc(), MVT::i32)); AsmNodeOperands.push_back(OpInfo.CallOperand); break; } // Otherwise, this is a register or register class output. // Copy the output from the appropriate register. Find a register that // we can use. if (OpInfo.AssignedRegs.Regs.empty()) { emitInlineAsmError( CS, "couldn't allocate output register for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } // If this is an indirect operand, store through the pointer after the // asm. if (OpInfo.isIndirect) { IndirectStoresToEmit.push_back(std::make_pair(OpInfo.AssignedRegs, OpInfo.CallOperandVal)); } else { // This is the result value of the call. 
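        // For example, in "%v = call i32 asm "...", "=r"()" the output
        // register feeds the call's result directly instead of being stored
        // through a pointer operand.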
assert(!CS.getType()->isVoidTy() && "Bad inline asm!"); // Concatenate this output onto the outputs list. RetValRegs.append(OpInfo.AssignedRegs); } // Add information to the INLINEASM node to know that this register is // set. OpInfo.AssignedRegs .AddInlineAsmOperands(OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber : InlineAsm::Kind_RegDef, false, 0, getCurSDLoc(), DAG, AsmNodeOperands); break; case InlineAsm::isInput: { SDValue InOperandVal = OpInfo.CallOperand; if (OpInfo.isMatchingInputConstraint()) { // If this is required to match an output register we have already set, // just use its register. auto CurOp = findMatchingInlineAsmOperand(OpInfo.getMatchedOperand(), AsmNodeOperands); unsigned OpFlag = cast(AsmNodeOperands[CurOp])->getZExtValue(); if (InlineAsm::isRegDefKind(OpFlag) || InlineAsm::isRegDefEarlyClobberKind(OpFlag)) { // Add (OpFlag&0xffff)>>3 registers to MatchedRegs. if (OpInfo.isIndirect) { // This happens on gcc/testsuite/gcc.dg/pr8788-1.c emitInlineAsmError(CS, "inline asm not supported yet:" " don't know how to handle tied " "indirect register inputs"); return; } MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType(); SmallVector Regs; if (!createVirtualRegs(Regs, InlineAsm::getNumOperandRegisters(OpFlag), RegVT, DAG)) { emitInlineAsmError(CS, "inline asm error: This value type register " "class is not natively supported!"); return; } RegsForValue MatchedRegs(Regs, RegVT, InOperandVal.getValueType()); SDLoc dl = getCurSDLoc(); // Use the produced MatchedRegs object to MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag, CS.getInstruction()); MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, true, OpInfo.getMatchedOperand(), dl, DAG, AsmNodeOperands); break; } assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!"); assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 && "Unexpected number of operands"); // Add information to the INLINEASM node to know about this input. // See InlineAsm.h isUseOperandTiedToDef. OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag); OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag, OpInfo.getMatchedOperand()); AsmNodeOperands.push_back(DAG.getTargetConstant( OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]); break; } // Treat indirect 'X' constraint as memory. if (OpInfo.ConstraintType == TargetLowering::C_Other && OpInfo.isIndirect) OpInfo.ConstraintType = TargetLowering::C_Memory; if (OpInfo.ConstraintType == TargetLowering::C_Other) { std::vector Ops; TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode, Ops, DAG); if (Ops.empty()) { emitInlineAsmError(CS, "invalid operand for inline asm constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } // Add information to the INLINEASM node to know about this input. 
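      // Immediate constraints (e.g. "i" or "n") are encoded as a Kind_Imm
      // flag word followed by the target constants produced above.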
unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size()); AsmNodeOperands.push_back(DAG.getTargetConstant( ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end()); break; } if (OpInfo.ConstraintType == TargetLowering::C_Memory) { assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!"); assert(InOperandVal.getValueType() == TLI.getPointerTy(DAG.getDataLayout()) && "Memory operands expect pointer values"); unsigned ConstraintID = TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode); assert(ConstraintID != InlineAsm::Constraint_Unknown && "Failed to convert memory constraint code to constraint id."); // Add information to the INLINEASM node to know about this input. unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1); ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID); AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType, getCurSDLoc(), MVT::i32)); AsmNodeOperands.push_back(InOperandVal); break; } assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass || OpInfo.ConstraintType == TargetLowering::C_Register) && "Unknown constraint type!"); // TODO: Support this. if (OpInfo.isIndirect) { emitInlineAsmError( CS, "Don't know how to handle indirect register inputs yet " "for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } // Copy the input into the appropriate registers. if (OpInfo.AssignedRegs.Regs.empty()) { emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; } SDLoc dl = getCurSDLoc(); OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag, CS.getInstruction()); OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0, dl, DAG, AsmNodeOperands); break; } case InlineAsm::isClobber: // Add the clobbered value to the operand list, so that the register // allocator is aware that the physreg got clobbered. if (!OpInfo.AssignedRegs.Regs.empty()) OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber, false, 0, getCurSDLoc(), DAG, AsmNodeOperands); break; } } // Finish up input operands. Set the input chain and add the flag last. AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; if (Flag.getNode()) AsmNodeOperands.push_back(Flag); Chain = DAG.getNode(ISD::INLINEASM, getCurSDLoc(), DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); Flag = Chain.getValue(1); // If this asm returns a register value, copy the result from that register // and set it as the value of the call. if (!RetValRegs.Regs.empty()) { SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction()); - // FIXME: Why don't we do this for inline asms with MRVs? 
-    if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
-      EVT ResultType = TLI.getValueType(DAG.getDataLayout(), CS.getType());
-
+    llvm::Type *CSResultType = CS.getType();
+    unsigned numRet;
+    ArrayRef<Type *> ResultTypes;
+    SmallVector<SDValue, 1> ResultValues(1);
+    if (CSResultType->isSingleValueType()) {
+      numRet = 1;
+      ResultValues[0] = Val;
+      ResultTypes = makeArrayRef(CSResultType);
+    } else {
+      numRet = CSResultType->getNumContainedTypes();
+      assert(Val->getNumOperands() == numRet &&
+             "Mismatch in number of output operands in asm result");
+      ResultTypes = CSResultType->subtypes();
+      ArrayRef<SDUse> ValueUses = Val->ops();
+      ResultValues.resize(numRet);
+      std::transform(ValueUses.begin(), ValueUses.end(), ResultValues.begin(),
+                     [](const SDUse &u) -> SDValue { return u.get(); });
+    }
+    SmallVector<EVT, 1> ResultVTs(numRet);
+    for (unsigned i = 0; i < numRet; i++) {
+      EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), ResultTypes[i]);
+      SDValue Val = ResultValues[i];
+      assert(ResultTypes[i]->isSized() && "Unexpected unsized type");
       // If the type of the inline asm call site return value is different but
       // has same size as the type of the asm output bitcast it.  One example
       // of this is for vectors with different width / number of elements.
       // This can happen for register classes that can contain multiple
       // different value types.  The preg or vreg allocated may not have the
       // same VT as was expected.
       //
       // This can also happen for a return value that disagrees with the
       // register class it is put in, eg. a double in a general-purpose
       // register on a 32-bit machine.
-      if (ResultType != Val.getValueType() &&
-          ResultType.getSizeInBits() == Val.getValueSizeInBits()) {
-        Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(),
-                          ResultType, Val);
-
-      } else if (ResultType != Val.getValueType() &&
-                 ResultType.isInteger() && Val.getValueType().isInteger()) {
-        // If a result value was tied to an input value, the computed result may
-        // have a wider width than the expected result.  Extract the relevant
-        // portion.
-        Val = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultType, Val);
+      if (ResultVT != Val.getValueType() &&
+          ResultVT.getSizeInBits() == Val.getValueSizeInBits())
+        Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(), ResultVT, Val);
+      else if (ResultVT != Val.getValueType() && ResultVT.isInteger() &&
+               Val.getValueType().isInteger()) {
+        // If a result value was tied to an input value, the computed result
+        // may have a wider width than the expected result. Extract the
+        // relevant portion.
+        Val = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultVT, Val);
       }
-      assert(ResultType == Val.getValueType() && "Asm result value mismatch!");
+      assert(ResultVT == Val.getValueType() && "Asm result value mismatch!");
+      ResultVTs[i] = ResultVT;
+      ResultValues[i] = Val;
     }
+    Val = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+                      DAG.getVTList(ResultVTs), ResultValues);
     setValue(CS.getInstruction(), Val);
     // Don't need to use this as a chain in this case.
     if (!IA->hasSideEffects() && !hasMemory && IndirectStoresToEmit.empty())
       return;
   }

   std::vector<std::pair<SDValue, const Value *>> StoresToEmit;

   // Process indirect outputs, first output all of the flagged copies out of
   // physregs.
   for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) {
     RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
     const Value *Ptr = IndirectStoresToEmit[i].second;
     SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
                                              Chain, &Flag, IA);
     StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
   }

   // Emit the non-flagged stores from the physregs.
SmallVector OutChains; for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) { SDValue Val = DAG.getStore(Chain, getCurSDLoc(), StoresToEmit[i].first, getValue(StoresToEmit[i].second), MachinePointerInfo(StoresToEmit[i].second)); OutChains.push_back(Val); } if (!OutChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains); DAG.setRoot(Chain); } void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS, const Twine &Message) { LLVMContext &Ctx = *DAG.getContext(); Ctx.emitError(CS.getInstruction(), Message); // Make sure we leave the DAG in a valid state const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs); if (ValueVTs.empty()) return; SmallVector Ops; for (unsigned i = 0, e = ValueVTs.size(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ValueVTs[i])); setValue(CS.getInstruction(), DAG.getMergeValues(Ops, getCurSDLoc())); } void SelectionDAGBuilder::visitVAStart(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(), MVT::Other, getRoot(), getValue(I.getArgOperand(0)), DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); SDValue V = DAG.getVAArg(TLI.getValueType(DAG.getDataLayout(), I.getType()), getCurSDLoc(), getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)), DL.getABITypeAlignment(I.getType())); setValue(&I, V); DAG.setRoot(V.getValue(1)); } void SelectionDAGBuilder::visitVAEnd(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(), MVT::Other, getRoot(), getValue(I.getArgOperand(0)), DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVACopy(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(), MVT::Other, getRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG.getSrcValue(I.getArgOperand(0)), DAG.getSrcValue(I.getArgOperand(1)))); } SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I, SDValue Op) { const MDNode *Range = I.getMetadata(LLVMContext::MD_range); if (!Range) return Op; ConstantRange CR = getConstantRangeFromMetadata(*Range); if (CR.isFullSet() || CR.isEmptySet() || CR.isWrappedSet()) return Op; APInt Lo = CR.getUnsignedMin(); if (!Lo.isMinValue()) return Op; APInt Hi = CR.getUnsignedMax(); unsigned Bits = Hi.getActiveBits(); EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits); SDLoc SL = getCurSDLoc(); SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op, DAG.getValueType(SmallVT)); unsigned NumVals = Op.getNode()->getNumValues(); if (NumVals == 1) return ZExt; SmallVector Ops; Ops.push_back(ZExt); for (unsigned I = 1; I != NumVals; ++I) Ops.push_back(Op.getValue(I)); return DAG.getMergeValues(Ops, SL); } /// Populate a CallLowerinInfo (into \p CLI) based on the properties of /// the call being lowered. /// /// This is a helper for lowering intrinsics that follow a target calling /// convention or require stack pointer adjustment. Only a subset of the /// intrinsic's operands need to participate in the calling convention. void SelectionDAGBuilder::populateCallLoweringInfo( TargetLowering::CallLoweringInfo &CLI, ImmutableCallSite CS, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy, bool IsPatchPoint) { TargetLowering::ArgListTy Args; Args.reserve(NumArgs); // Populate the argument list. 
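  // Only operands in the range [ArgIdx, ArgIdx + NumArgs) take part in the
  // calling convention; for a patchpoint these are the call arguments that
  // follow the meta operands (id, nbytes, target, numargs).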
// Attributes for args start at offset 1, after the return attribute. for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) { const Value *V = CS->getOperand(ArgI); assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); TargetLowering::ArgListEntry Entry; Entry.Node = getValue(V); Entry.Ty = V->getType(); Entry.setAttributes(&CS, ArgI); Args.push_back(Entry); } CLI.setDebugLoc(getCurSDLoc()) .setChain(getRoot()) .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args)) .setDiscardResult(CS->use_empty()) .setIsPatchPoint(IsPatchPoint); } /// Add a stack map intrinsic call's live variable operands to a stackmap /// or patchpoint target node's operand list. /// /// Constants are converted to TargetConstants purely as an optimization to /// avoid constant materialization and register allocation. /// /// FrameIndex operands are converted to TargetFrameIndex so that ISEL does not /// generate addess computation nodes, and so ExpandISelPseudo can convert the /// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids /// address materialization and register allocation, but may also be required /// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an /// alloca in the entry block, then the runtime may assume that the alloca's /// StackMap location can be read immediately after compilation and that the /// location is valid at any point during execution (this is similar to the /// assumption made by the llvm.gcroot intrinsic). If the alloca's location were /// only available in a register, then the runtime would need to trap when /// execution reaches the StackMap in order to read the alloca's location. static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx, const SDLoc &DL, SmallVectorImpl &Ops, SelectionDAGBuilder &Builder) { for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) { SDValue OpVal = Builder.getValue(CS.getArgument(i)); if (ConstantSDNode *C = dyn_cast(OpVal)) { Ops.push_back( Builder.DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64)); Ops.push_back( Builder.DAG.getTargetConstant(C->getSExtValue(), DL, MVT::i64)); } else if (FrameIndexSDNode *FI = dyn_cast(OpVal)) { const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo(); Ops.push_back(Builder.DAG.getTargetFrameIndex( FI->getIndex(), TLI.getFrameIndexTy(Builder.DAG.getDataLayout()))); } else Ops.push_back(OpVal); } } /// Lower llvm.experimental.stackmap directly to its target opcode. void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // void @llvm.experimental.stackmap(i32 , i32 , // [live variables...]) assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value."); SDValue Chain, InFlag, Callee, NullPtr; SmallVector Ops; SDLoc DL = getCurSDLoc(); Callee = getValue(CI.getCalledValue()); NullPtr = DAG.getIntPtrConstant(0, DL, true); // The stackmap intrinsic only records the live variables (the arguemnts // passed to it) and emits NOPS (if requested). Unlike the patchpoint // intrinsic, this won't be lowered to a function call. This means we don't // have to worry about calling conventions and target specific lowering code. // Instead we perform the call lowering right here. // // chain, flag = CALLSEQ_START(chain, 0, 0) // chain, flag = STACKMAP(id, nbytes, ..., chain, flag) // chain, flag = CALLSEQ_END(chain, 0, 0, flag) // Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL); InFlag = Chain.getValue(1); // Add the and constants. 
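  // Operand 0 is the i64 stackmap ID and operand 1 is the i32 shadow-byte
  // count; both must be compile-time constants and are emitted below as
  // TargetConstants.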
SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos)); Ops.push_back(DAG.getTargetConstant( cast(IDVal)->getZExtValue(), DL, MVT::i64)); SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(DAG.getTargetConstant( cast(NBytesVal)->getZExtValue(), DL, MVT::i32)); // Push live variables for the stack map. addStackMapLiveVars(&CI, 2, DL, Ops, *this); // We are not pushing any register mask info here on the operands list, // because the stackmap doesn't clobber anything. // Push the chain and the glue flag. Ops.push_back(Chain); Ops.push_back(InFlag); // Create the STACKMAP node. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops); Chain = SDValue(SM, 0); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL); // Stackmaps don't generate values, so nothing goes into the NodeMap. // Set the root to the target-lowered call chain. DAG.setRoot(Chain); // Inform the Frame Information that we have a stackmap in this function. FuncInfo.MF->getFrameInfo().setHasStackMap(); } /// Lower llvm.experimental.patchpoint directly to its target opcode. void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, const BasicBlock *EHPadBB) { // void|i64 @llvm.experimental.patchpoint.void|i64(i64 , // i32 , // i8* , // i32 , // [Args...], // [live variables...]) CallingConv::ID CC = CS.getCallingConv(); bool IsAnyRegCC = CC == CallingConv::AnyReg; bool HasDef = !CS->getType()->isVoidTy(); SDLoc dl = getCurSDLoc(); SDValue Callee = getValue(CS->getOperand(PatchPointOpers::TargetPos)); // Handle immediate and symbolic callees. if (auto* ConstCallee = dyn_cast(Callee)) Callee = DAG.getIntPtrConstant(ConstCallee->getZExtValue(), dl, /*isTarget=*/true); else if (auto* SymbolicCallee = dyn_cast(Callee)) Callee = DAG.getTargetGlobalAddress(SymbolicCallee->getGlobal(), SDLoc(SymbolicCallee), SymbolicCallee->getValueType(0)); // Get the real number of arguments participating in the call SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos)); unsigned NumArgs = cast(NArgVal)->getZExtValue(); // Skip the four meta args: , , , // Intrinsics include all meta-operands up to but not including CC. unsigned NumMetaOpers = PatchPointOpers::CCPos; assert(CS.arg_size() >= NumMetaOpers + NumArgs && "Not enough arguments provided to the patchpoint intrinsic"); // For AnyRegCC the arguments are lowered later on manually. unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; Type *ReturnTy = IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType(); TargetLowering::CallLoweringInfo CLI(DAG); populateCallLoweringInfo(CLI, CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy, true); std::pair Result = lowerInvokable(CLI, EHPadBB); SDNode *CallEnd = Result.second.getNode(); if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg)) CallEnd = CallEnd->getOperand(0).getNode(); /// Get a call instruction from the call sequence chain. /// Tail calls are not allowed. assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "Expected a callseq node."); SDNode *Call = CallEnd->getOperand(0).getNode(); bool HasGlue = Call->getGluedNode(); // Replace the target specific call node with the patchable intrinsic. SmallVector Ops; // Add the and constants. 
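  // As with the stackmap above, these are the i64 ID and the i32 shadow-byte
  // count taken from the first two patchpoint operands.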
SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos)); Ops.push_back(DAG.getTargetConstant( cast(IDVal)->getZExtValue(), dl, MVT::i64)); SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(DAG.getTargetConstant( cast(NBytesVal)->getZExtValue(), dl, MVT::i32)); // Add the callee. Ops.push_back(Callee); // Adjust to account for any arguments that have been passed on the // stack instead. // Call Node: Chain, Target, {Args}, RegMask, [Glue] unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3); NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs; Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, dl, MVT::i32)); // Add the calling convention Ops.push_back(DAG.getTargetConstant((unsigned)CC, dl, MVT::i32)); // Add the arguments we omitted previously. The register allocator should // place these in any free register. if (IsAnyRegCC) for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) Ops.push_back(getValue(CS.getArgument(i))); // Push the arguments from the call instruction up to the register mask. SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1; Ops.append(Call->op_begin() + 2, e); // Push live variables for the stack map. addStackMapLiveVars(CS, NumMetaOpers + NumArgs, dl, Ops, *this); // Push the register mask info. if (HasGlue) Ops.push_back(*(Call->op_end()-2)); else Ops.push_back(*(Call->op_end()-1)); // Push the chain (this is originally the first operand of the call, but // becomes now the last or second to last operand). Ops.push_back(*(Call->op_begin())); // Push the glue flag (last operand). if (HasGlue) Ops.push_back(*(Call->op_end()-1)); SDVTList NodeTys; if (IsAnyRegCC && HasDef) { // Create the return types based on the intrinsic definition const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs); assert(ValueVTs.size() == 1 && "Expected only one return value type."); // There is always a chain and a glue type at the end ValueVTs.push_back(MVT::Other); ValueVTs.push_back(MVT::Glue); NodeTys = DAG.getVTList(ValueVTs); } else NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // Replace the target specific call node with a PATCHPOINT node. MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT, dl, NodeTys, Ops); // Update the NodeMap. if (HasDef) { if (IsAnyRegCC) setValue(CS.getInstruction(), SDValue(MN, 0)); else setValue(CS.getInstruction(), Result.first); } // Fixup the consumers of the intrinsic. The chain and glue may be used in the // call sequence. Furthermore the location of the chain and glue can change // when the AnyReg calling convention is used and the intrinsic returns a // value. if (IsAnyRegCC && HasDef) { SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)}; SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)}; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); } else DAG.ReplaceAllUsesWith(Call, MN); DAG.DeleteNode(Call); // Inform the Frame Information that we have a patchpoint in this function. 
FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, unsigned Intrinsic) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2; if (I.getNumArgOperands() > 1) Op2 = getValue(I.getArgOperand(1)); SDLoc dl = getCurSDLoc(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); SDValue Res; FastMathFlags FMF; if (isa(I)) FMF = I.getFastMathFlags(); switch (Intrinsic) { case Intrinsic::experimental_vector_reduce_fadd: if (FMF.isFast()) Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); break; case Intrinsic::experimental_vector_reduce_fmul: if (FMF.isFast()) Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); break; case Intrinsic::experimental_vector_reduce_add: Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_mul: Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_and: Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_or: Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_xor: Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_smax: Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_smin: Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_umax: Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_umin: Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_fmax: Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_fmin: Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1); break; default: llvm_unreachable("Unhandled vector reduce intrinsic"); } setValue(&I, Res); } /// Returns an AttributeList representing the attributes applied to the return /// value of the given call. static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { SmallVector Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); if (CLI.RetZExt) Attrs.push_back(Attribute::ZExt); if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, Attrs); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo /// implementation, which just calls LowerCall. /// FIXME: When all targets are /// migrated to using LowerCall, this hook should be integrated into SDISel. std::pair TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Handle the incoming return values from the call. CLI.Ins.clear(); Type *OrigRetTy = CLI.RetTy; SmallVector RetTys; SmallVector Offsets; auto &DL = CLI.DAG.getDataLayout(); ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets); if (CLI.IsPostTypeLegalization) { // If we are lowering a libcall after legalization, split the return type. 
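    // For example, on a 32-bit target an i64 libcall return is split here into
    // two register-sized i32 pieces at byte offsets 0 and 4 (the exact pieces
    // depend on the target's register types), so the code below only ever
    // deals with legal types.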
SmallVector OldRetTys = std::move(RetTys); SmallVector OldOffsets = std::move(Offsets); for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) { EVT RetVT = OldRetTys[i]; uint64_t Offset = OldOffsets[i]; MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT); unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT); unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8; RetTys.append(NumRegs, RegisterVT); for (unsigned j = 0; j != NumRegs; ++j) Offsets.push_back(Offset + j * RegisterVTByteSZ); } } SmallVector Outs; GetReturnInfo(CLI.CallConv, CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL); bool CanLowerReturn = this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(), CLI.IsVarArg, Outs, CLI.RetTy->getContext()); SDValue DemoteStackSlot; int DemoteStackIdx = -100; if (!CanLowerReturn) { // FIXME: equivalent assert? // assert(!CS.hasInAllocaArgument() && // "sret demotion is incompatible with inalloca"); uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy); unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy); MachineFunction &MF = CLI.DAG.getMachineFunction(); DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false); Type *StackSlotPtrType = PointerType::get(CLI.RetTy, DL.getAllocaAddrSpace()); DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL)); ArgListEntry Entry; Entry.Node = DemoteStackSlot; Entry.Ty = StackSlotPtrType; Entry.IsSExt = false; Entry.IsZExt = false; Entry.IsInReg = false; Entry.IsSRet = true; Entry.IsNest = false; Entry.IsByVal = false; Entry.IsReturned = false; Entry.IsSwiftSelf = false; Entry.IsSwiftError = false; Entry.Alignment = Align; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.NumFixedArgs += 1; CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); // sret demotion isn't compatible with tail-calls, since the sret argument // points into the callers stack frame. CLI.IsTailCall = false; } else { for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags; MyFlags.VT = RegisterVT; MyFlags.ArgVT = VT; MyFlags.Used = CLI.IsReturnValueUsed; if (CLI.RetSExt) MyFlags.Flags.setSExt(); if (CLI.RetZExt) MyFlags.Flags.setZExt(); if (CLI.IsInReg) MyFlags.Flags.setInReg(); CLI.Ins.push_back(MyFlags); } } } // We push in swifterror return as the last element of CLI.Ins. ArgListTy &Args = CLI.getArgs(); if (supportSwiftError()) { for (unsigned i = 0, e = Args.size(); i != e; ++i) { if (Args[i].IsSwiftError) { ISD::InputArg MyFlags; MyFlags.VT = getPointerTy(DL); MyFlags.ArgVT = EVT(getPointerTy(DL)); MyFlags.Flags.setSwiftError(); CLI.Ins.push_back(MyFlags); } } } // Handle all of the outgoing arguments. 
CLI.Outs.clear(); CLI.OutVals.clear(); for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector ValueVTs; ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs); // FIXME: Split arguments if CLI.IsPostTypeLegalization Type *FinalType = Args[i].Ty; if (Args[i].IsByVal) FinalType = cast(Args[i].Ty)->getElementType(); bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; // Certain targets (such as MIPS), may have a different ABI alignment // for a type depending on the context. Give the target a chance to // specify the alignment it wants. unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL); if (Args[i].IsZExt) Flags.setZExt(); if (Args[i].IsSExt) Flags.setSExt(); if (Args[i].IsInReg) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (CLI.CallConv == CallingConv::X86_VectorCall && isa(FinalType)) { // The first value of a structure is marked if (0 == Value) Flags.setHvaStart(); Flags.setHva(); } // Set InReg Flag Flags.setInReg(); } if (Args[i].IsSRet) Flags.setSRet(); if (Args[i].IsSwiftSelf) Flags.setSwiftSelf(); if (Args[i].IsSwiftError) Flags.setSwiftError(); if (Args[i].IsByVal) Flags.setByVal(); if (Args[i].IsInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated // and how many bytes a callee cleanup function will pop. If we port // inalloca to more targets, we'll have to add custom inalloca handling // in the various CC lowering callbacks. Flags.setByVal(); } if (Args[i].IsByVal || Args[i].IsInAlloca) { PointerType *Ty = cast(Args[i].Ty); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); // For ByVal, alignment should come from FE. BE will guess if this // info is not there but there are cases it cannot get right. unsigned FrameAlign; if (Args[i].Alignment) FrameAlign = Args[i].Alignment; else FrameAlign = getByValTypeAlignment(ElementTy, DL); Flags.setByValAlign(FrameAlign); } if (Args[i].IsNest) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); SmallVector Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; if (Args[i].IsSExt) ExtendKind = ISD::SIGN_EXTEND; else if (Args[i].IsZExt) ExtendKind = ISD::ZERO_EXTEND; // Conservatively only handle 'returned' on non-vectors that can be lowered, // for now. 
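      // ('returned' marks an argument that the callee passes straight through
      // as its return value; when the extension kinds line up, the flag set
      // below may let the target reuse the incoming argument register for the
      // result instead of emitting a copy.)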
if (Args[i].IsReturned && !Op.getValueType().isVector() && CanLowerReturn) { assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that // either the register MVT and the actual EVT are the same size or that // the return value and argument are extended in the same way; in these // cases it's safe to pass the argument register value unchanged as the // return register value (although it's at the target's option whether // to do so) // TODO: allow code generation to take advantage of partially preserved // registers rather than clobbering the entire register when the // parameter extension method is not compatible with the return // extension method if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) || (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt && CLI.RetZExt == Args[i].IsZExt)) Flags.setReturned(); } getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, CLI.CS.getInstruction(), CLI.CallConv, ExtendKind); for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT, i < CLI.NumFixedArgs, i, j*Parts[j].getValueType().getStoreSize()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) { MyFlags.Flags.setOrigAlign(1); if (j == NumParts - 1) MyFlags.Flags.setSplitEnd(); } CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); } if (NeedsRegBlock && Value == NumValues - 1) CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast(); } } SmallVector InVals; CLI.Chain = LowerCall(CLI, InVals); // Update CLI.InVals to use outside of this function. CLI.InVals = InVals; // Verify that the target's LowerCall behaved as expected. assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other && "LowerCall didn't return a valid chain!"); assert((!CLI.IsTailCall || InVals.empty()) && "LowerCall emitted a return value for a tail call!"); assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) && "LowerCall didn't emit the correct number of values!"); // For a tail call, the return value is merely live-out and there aren't // any nodes in the DAG representing it. Return a special value to // indicate that a tail call has been emitted and no more Instructions // should be processed in the current block. if (CLI.IsTailCall) { CLI.DAG.setRoot(CLI.Chain); return std::make_pair(SDValue(), SDValue()); } #ifndef NDEBUG for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerCall emitted a null value!"); assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() && "LowerCall emitted a value with the wrong type!"); } #endif SmallVector ReturnValues; if (!CanLowerReturn) { // The instruction result is the result of loading from the // hidden sret parameter. SmallVector PVTs; Type *PtrRetTy = OrigRetTy->getPointerTo(DL.getAllocaAddrSpace()); ComputeValueVTs(*this, DL, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); EVT PtrVT = PVTs[0]; unsigned NumValues = RetTys.size(); ReturnValues.resize(NumValues); SmallVector Chains(NumValues); // An aggregate return value cannot wrap around the address space, so // offsets to its parts don't wrap either. 
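    // E.g. for a demoted {i32, i32} sret return, the two parts are reloaded
    // from the hidden stack slot at byte offsets 0 and 4, each address being
    // formed with a no-unsigned-wrap add off DemoteStackSlot.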
SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); for (unsigned i = 0; i < NumValues; ++i) { SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, PtrVT), Flags); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), DemoteStackIdx, Offsets[i]), /* Alignment = */ 1); ReturnValues[i] = L; Chains[i] = L.getValue(1); } CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains); } else { // Collect the legal value parts into potentially illegal values // that correspond to the original function's return values. Optional AssertOp; if (CLI.RetSExt) AssertOp = ISD::AssertSext; else if (CLI.RetZExt) AssertOp = ISD::AssertZext; unsigned CurReg = 0; for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, CLI.CallConv, AssertOp)); CurReg += NumRegs; } // For a function returning void, there is no return value. We can't create // such a node, so we just return a null return value in that case. In // that case, nothing will actually look at the value. if (ReturnValues.empty()) return std::make_pair(SDValue(), CLI.Chain); } SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, CLI.DAG.getVTList(RetTys), ReturnValues); return std::make_pair(Res, CLI.Chain); } void TargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { if (SDValue Res = LowerOperation(SDValue(N, 0), DAG)) Results.push_back(Res); } SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("LowerOperation not implemented for this target!"); } void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { SDValue Op = getNonRegisterValue(V); assert((Op.getOpcode() != ISD::CopyFromReg || cast(Op.getOperand(1))->getReg() != Reg) && "Copy from a reg to the same reg!"); assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is an InlineAsm we have to match the registers required, not the // notional registers required by the type. RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(), getABIRegCopyCC(V)); SDValue Chain = DAG.getEntryNode(); ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) == FuncInfo.PreferredExtendType.end()) ? ISD::ANY_EXTEND : FuncInfo.PreferredExtendType[V]; RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType); PendingExports.push_back(Chain); } #include "llvm/CodeGen/SelectionDAGISel.h" /// isOnlyUsedInEntryBlock - If the specified argument is only used in the /// entry block, return true. This includes arguments used by switches, since /// the switch may expand into multiple basic blocks. static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { // With FastISel active, we may be splitting blocks, so force creation // of virtual registers for all non-dead arguments. if (FastISel) return A->use_empty(); const BasicBlock &Entry = A->getParent()->front(); for (const User *U : A->users()) if (cast(U)->getParent() != &Entry || isa(U)) return false; // Use not in entry block. 
return true; } using ArgCopyElisionMapTy = DenseMap>; /// Scan the entry block of the function in FuncInfo for arguments that look /// like copies into a local alloca. Record any copied arguments in /// ArgCopyElisionCandidates. static void findArgumentCopyElisionCandidates(const DataLayout &DL, FunctionLoweringInfo *FuncInfo, ArgCopyElisionMapTy &ArgCopyElisionCandidates) { // Record the state of every static alloca used in the entry block. Argument // allocas are all used in the entry block, so we need approximately as many // entries as we have arguments. enum StaticAllocaInfo { Unknown, Clobbered, Elidable }; SmallDenseMap StaticAllocas; unsigned NumArgs = FuncInfo->Fn->arg_size(); StaticAllocas.reserve(NumArgs * 2); auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * { if (!V) return nullptr; V = V->stripPointerCasts(); const auto *AI = dyn_cast(V); if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI)) return nullptr; auto Iter = StaticAllocas.insert({AI, Unknown}); return &Iter.first->second; }; // Look for stores of arguments to static allocas. Look through bitcasts and // GEPs to handle type coercions, as long as the alloca is fully initialized // by the store. Any non-store use of an alloca escapes it and any subsequent // unanalyzed store might write it. // FIXME: Handle structs initialized with multiple stores. for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) { // Look for stores, and handle non-store uses conservatively. const auto *SI = dyn_cast(&I); if (!SI) { // We will look through cast uses, so ignore them completely. if (I.isCast()) continue; // Ignore debug info intrinsics, they don't escape or store to allocas. if (isa(I)) continue; // This is an unknown instruction. Assume it escapes or writes to all // static alloca operands. for (const Use &U : I.operands()) { if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U)) *Info = StaticAllocaInfo::Clobbered; } continue; } // If the stored value is a static alloca, mark it as escaped. if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand())) *Info = StaticAllocaInfo::Clobbered; // Check if the destination is a static alloca. const Value *Dst = SI->getPointerOperand()->stripPointerCasts(); StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst); if (!Info) continue; const AllocaInst *AI = cast(Dst); // Skip allocas that have been initialized or clobbered. if (*Info != StaticAllocaInfo::Unknown) continue; // Check if the stored value is an argument, and that this store fully // initializes the alloca. Don't elide copies from the same argument twice. const Value *Val = SI->getValueOperand()->stripPointerCasts(); const auto *Arg = dyn_cast(Val); if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() || Arg->getType()->isEmptyTy() || DL.getTypeStoreSize(Arg->getType()) != DL.getTypeAllocSize(AI->getAllocatedType()) || ArgCopyElisionCandidates.count(Arg)) { *Info = StaticAllocaInfo::Clobbered; continue; } LLVM_DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n'); // Mark this alloca and store for argument copy elision. *Info = StaticAllocaInfo::Elidable; ArgCopyElisionCandidates.insert({Arg, {AI, SI}}); // Stop scanning if we've seen all arguments. This will happen early in -O0 // builds, which is useful, because -O0 builds have large entry blocks and // many allocas. if (ArgCopyElisionCandidates.size() == NumArgs) break; } } /// Try to elide argument copies from memory into a local alloca. 
/// Succeeds if
/// ArgVal is a load from a suitable fixed stack object.
static void tryToElideArgumentCopy(
    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
    SDValue ArgVal, bool &ArgHasUses) {
  // Check if this is a load from a fixed stack object.
  auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
  if (!LNode)
    return;
  auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
  if (!FINode)
    return;

  // Check that the fixed stack object is the right size and alignment.
  // Look at the alignment that the user wrote on the alloca instead of looking
  // at the stack object.
  auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
  assert(ArgCopyIter != ArgCopyElisionCandidates.end());
  const AllocaInst *AI = ArgCopyIter->second.first;
  int FixedIndex = FINode->getIndex();
  int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
  int OldIndex = AllocaIndex;
  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
    LLVM_DEBUG(
        dbgs() << "  argument copy elision failed due to bad fixed stack "
                  "object size\n");
    return;
  }
  unsigned RequiredAlignment = AI->getAlignment();
  if (!RequiredAlignment) {
    RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
        AI->getAllocatedType());
  }
  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
    LLVM_DEBUG(dbgs() << "  argument copy elision failed: alignment of alloca "
                         "greater than stack argument alignment ("
                      << RequiredAlignment << " vs "
                      << MFI.getObjectAlignment(FixedIndex) << ")\n");
    return;
  }

  // Perform the elision. Delete the old stack object and replace its only use
  // in the variable info map. Mark the stack object as mutable.
  LLVM_DEBUG({
    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
           << "  Replacing frame index " << OldIndex << " with " << FixedIndex
           << '\n';
  });
  MFI.RemoveStackObject(OldIndex);
  MFI.setIsImmutableObjectIndex(FixedIndex, false);
  AllocaIndex = FixedIndex;
  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
  Chains.push_back(ArgVal.getValue(1));

  // Avoid emitting code for the store implementing the copy.
  const StoreInst *SI = ArgCopyIter->second.second;
  ElidedArgCopyInstrs.insert(SI);

  // Check for uses of the argument again so that we can avoid exporting ArgVal
  // if it isn't used by anything other than the store.
  for (const Value *U : Arg.users()) {
    if (U != SI) {
      ArgHasUses = true;
      break;
    }
  }
}

void SelectionDAGISel::LowerArguments(const Function &F) {
  SelectionDAG &DAG = SDB->DAG;
  SDLoc dl = SDB->getCurSDLoc();
  const DataLayout &DL = DAG.getDataLayout();
  SmallVector<ISD::InputArg, 16> Ins;

  if (!FuncInfo->CanLowerReturn) {
    // Put in an sret pointer parameter before all the other parameters.
    SmallVector<EVT, 1> ValueVTs;
    ComputeValueVTs(*TLI, DAG.getDataLayout(),
                    F.getReturnType()->getPointerTo(
                        DAG.getDataLayout().getAllocaAddrSpace()),
                    ValueVTs);

    // NOTE: Assuming that a pointer will never break down to more than one VT
    // or one register.
    ISD::ArgFlagsTy Flags;
    Flags.setSRet();
    MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
                         ISD::InputArg::NoArgIndex, 0);
    Ins.push_back(RetArg);
  }

  // Look for stores of arguments to static allocas. Mark such arguments with a
  // flag to ask the target to give us the memory location of that argument if
  // available.
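  // A typical elision candidate is the -O0 entry-block pattern
  //   %x.addr = alloca i32
  //   store i32 %x, i32* %x.addr
  // where %x already lives in a suitably sized and aligned fixed stack slot:
  // the alloca can then reuse that slot and the copying store can be dropped.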
ArgCopyElisionMapTy ArgCopyElisionCandidates; findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates); // Set up the incoming argument description vector. for (const Argument &Arg : F.args()) { unsigned ArgNo = Arg.getArgNo(); SmallVector ValueVTs; ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); bool isArgValueUsed = !Arg.use_empty(); unsigned PartBase = 0; Type *FinalType = Arg.getType(); if (Arg.hasAttribute(Attribute::ByVal)) FinalType = cast(FinalType)->getElementType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( FinalType, F.getCallingConv(), F.isVarArg()); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); ISD::ArgFlagsTy Flags; // Certain targets (such as MIPS), may have a different ABI alignment // for a type depending on the context. Give the target a chance to // specify the alignment it wants. unsigned OriginalAlignment = TLI->getABIAlignmentForCallingConv(ArgTy, DL); if (Arg.hasAttribute(Attribute::ZExt)) Flags.setZExt(); if (Arg.hasAttribute(Attribute::SExt)) Flags.setSExt(); if (Arg.hasAttribute(Attribute::InReg)) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (F.getCallingConv() == CallingConv::X86_VectorCall && isa(Arg.getType())) { // The first value of a structure is marked if (0 == Value) Flags.setHvaStart(); Flags.setHva(); } // Set InReg Flag Flags.setInReg(); } if (Arg.hasAttribute(Attribute::StructRet)) Flags.setSRet(); if (Arg.hasAttribute(Attribute::SwiftSelf)) Flags.setSwiftSelf(); if (Arg.hasAttribute(Attribute::SwiftError)) Flags.setSwiftError(); if (Arg.hasAttribute(Attribute::ByVal)) Flags.setByVal(); if (Arg.hasAttribute(Attribute::InAlloca)) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated // and how many bytes a callee cleanup function will pop. If we port // inalloca to more targets, we'll have to add custom inalloca handling // in the various CC lowering callbacks. Flags.setByVal(); } if (F.getCallingConv() == CallingConv::X86_INTR) { // IA Interrupt passes frame (1st parameter) by value in the stack. if (ArgNo == 0) Flags.setByVal(); } if (Flags.isByVal() || Flags.isInAlloca()) { PointerType *Ty = cast(Arg.getType()); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. 
unsigned FrameAlign; if (Arg.getParamAlignment()) FrameAlign = Arg.getParamAlignment(); else FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL); Flags.setByValAlign(FrameAlign); } if (Arg.hasAttribute(Attribute::Nest)) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); if (ArgCopyElisionCandidates.count(&Arg)) Flags.setCopyElisionCandidate(); MVT RegisterVT = TLI->getRegisterTypeForCallingConv( *CurDAG->getContext(), F.getCallingConv(), VT); unsigned NumRegs = TLI->getNumRegistersForCallingConv( *CurDAG->getContext(), F.getCallingConv(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed, ArgNo, PartBase+i*RegisterVT.getStoreSize()); if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); // if it isn't first piece, alignment must be 1 else if (i > 0) { MyFlags.Flags.setOrigAlign(1); if (i == NumRegs - 1) MyFlags.Flags.setSplitEnd(); } Ins.push_back(MyFlags); } if (NeedsRegBlock && Value == NumValues - 1) Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast(); PartBase += VT.getStoreSize(); } } // Call the target to set up the argument values. SmallVector InVals; SDValue NewRoot = TLI->LowerFormalArguments( DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals); // Verify that the target's LowerFormalArguments behaved as expected. assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other && "LowerFormalArguments didn't return a valid chain!"); assert(InVals.size() == Ins.size() && "LowerFormalArguments didn't emit the correct number of values!"); LLVM_DEBUG({ for (unsigned i = 0, e = Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerFormalArguments emitted a null value!"); assert(EVT(Ins[i].VT) == InVals[i].getValueType() && "LowerFormalArguments emitted a value with the wrong type!"); } }); // Update the DAG with the new chain value resulting from argument lowering. DAG.setRoot(NewRoot); // Set up the argument values. unsigned i = 0; if (!FuncInfo->CanLowerReturn) { // Create a virtual register for the sret pointer, and put in a copy // from the sret argument into it. SmallVector ValueVTs; ComputeValueVTs(*TLI, DAG.getDataLayout(), F.getReturnType()->getPointerTo( DAG.getDataLayout().getAllocaAddrSpace()), ValueVTs); MVT VT = ValueVTs[0].getSimpleVT(); MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT); Optional AssertOp = None; SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, nullptr, F.getCallingConv(), AssertOp); MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT)); FuncInfo->DemoteRegister = SRetReg; NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue); DAG.setRoot(NewRoot); // i indexes lowered arguments. Bump it past the hidden sret argument. ++i; } SmallVector Chains; DenseMap ArgCopyElisionFrameIndexMap; for (const Argument &Arg : F.args()) { SmallVector ArgValues; SmallVector ValueVTs; ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) continue; bool ArgHasUses = !Arg.use_empty(); // Elide the copying store if the target loaded this argument from a // suitable fixed stack object. 
if (Ins[i].Flags.isCopyElisionCandidate()) { tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap, ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg, InVals[i], ArgHasUses); } // If this argument is unused then remember its value. It is used to generate // debugging information. bool isSwiftErrorArg = TLI->supportSwiftError() && Arg.hasAttribute(Attribute::SwiftError); if (!ArgHasUses && !isSwiftErrorArg) { SDB->setUnusedArgValue(&Arg, InVals[i]); // Also remember any frame index for use in FastISel. if (FrameIndexSDNode *FI = dyn_cast(InVals[i].getNode())) FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } for (unsigned Val = 0; Val != NumValues; ++Val) { EVT VT = ValueVTs[Val]; MVT PartVT = TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), F.getCallingConv(), VT); unsigned NumParts = TLI->getNumRegistersForCallingConv( *CurDAG->getContext(), F.getCallingConv(), VT); // Even an apparant 'unused' swifterror argument needs to be returned. So // we do generate a copy for it that can be used on return from the // function. if (ArgHasUses || isSwiftErrorArg) { Optional AssertOp; if (Arg.hasAttribute(Attribute::SExt)) AssertOp = ISD::AssertSext; else if (Arg.hasAttribute(Attribute::ZExt)) AssertOp = ISD::AssertZext; ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, PartVT, VT, nullptr, F.getCallingConv(), AssertOp)); } i += NumParts; } // We don't need to do anything else for unused arguments. if (ArgValues.empty()) continue; // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast(ArgValues[0].getNode())) FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); SDB->setValue(&Arg, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { // We want to associate the argument with the frame index, among // involved operands, that correspond to the lowest address. The // getCopyFromParts function, called earlier, is swapping the order of // the operands to BUILD_PAIR depending on endianness. The result of // that swapping is that the least significant bits of the argument will // be in the first operand of the BUILD_PAIR node, and the most // significant bits will be in the second operand. unsigned LowAddressOp = DAG.getDataLayout().isBigEndian() ? 1 : 0; if (LoadSDNode *LNode = dyn_cast(Res.getOperand(LowAddressOp).getNode())) if (FrameIndexSDNode *FI = dyn_cast(LNode->getBasePtr().getNode())) FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } // Update the SwiftErrorVRegDefMap. if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) { unsigned Reg = cast(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, FuncInfo->SwiftErrorArg, Reg); } // If this argument is live outside of the entry block, insert a copy from // wherever we got it to the vreg that other BB's will reference it as. if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) { // If we can, though, try to skip creating an unnecessary vreg. // FIXME: This isn't very clean... it would be nice to make this more // general. It's also subtly incompatible with the hacks FastISel // uses with vregs. 
unsigned Reg = cast(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { FuncInfo->ValueMap[&Arg] = Reg; continue; } } if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) { FuncInfo->InitializeRegForValue(&Arg); SDB->CopyToExportRegsIfNeeded(&Arg); } } if (!Chains.empty()) { Chains.push_back(NewRoot); NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } DAG.setRoot(NewRoot); assert(i == InVals.size() && "Argument register count mismatch!"); // If any argument copy elisions occurred and we have debug info, update the // stale frame indices used in the dbg.declare variable info table. MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo(); if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) { for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) { auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot); if (I != ArgCopyElisionFrameIndexMap.end()) VI.Slot = I->second; } } // Finally, if the target has anything special to do, allow it to do so. EmitFunctionEntryCode(); } /// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to /// ensure constants are generated when needed. Remember the virtual registers /// that need to be added to the Machine PHI nodes as input. We cannot just /// directly add them, because expansion might result in multiple MBB's for one /// BB. As such, the start of the BB might correspond to a different MBB than /// the end. void SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { const TerminatorInst *TI = LLVMBB->getTerminator(); SmallPtrSet SuccsHandled; // Check PHI nodes in successors that expect a value to be available from this // block. for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) { const BasicBlock *SuccBB = TI->getSuccessor(succ); if (!isa(SuccBB->begin())) continue; MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB]; // If this terminator has multiple identical successors (common for // switches), only handle each succ once. if (!SuccsHandled.insert(SuccMBB).second) continue; MachineBasicBlock::iterator MBBI = SuccMBB->begin(); // At this point we know that there is a 1-1 correspondence between LLVM PHI // nodes and Machine PHI nodes, but the incoming operands have not been // emitted yet. for (const PHINode &PN : SuccBB->phis()) { // Ignore dead phi's. if (PN.use_empty()) continue; // Skip empty types if (PN.getType()->isEmptyTy()) continue; unsigned Reg; const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB); if (const Constant *C = dyn_cast(PHIOp)) { unsigned &RegOut = ConstantsOut[C]; if (RegOut == 0) { RegOut = FuncInfo.CreateRegs(C->getType()); CopyValueToVirtualRegister(C, RegOut); } Reg = RegOut; } else { DenseMap::iterator I = FuncInfo.ValueMap.find(PHIOp); if (I != FuncInfo.ValueMap.end()) Reg = I->second; else { assert(isa(PHIOp) && FuncInfo.StaticAllocaMap.count(cast(PHIOp)) && "Didn't codegen value into a register!??"); Reg = FuncInfo.CreateRegs(PHIOp->getType()); CopyValueToVirtualRegister(PHIOp, Reg); } } // Remember that this register needs to added to the machine PHI node as // the input for this MBB. 
SmallVector ValueVTs; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); ComputeValueVTs(TLI, DAG.getDataLayout(), PN.getType(), ValueVTs); for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) { EVT VT = ValueVTs[vti]; unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT); for (unsigned i = 0, e = NumRegisters; i != e; ++i) FuncInfo.PHINodesToUpdate.push_back( std::make_pair(&*MBBI++, Reg + i)); Reg += NumRegisters; } } } ConstantsOut.clear(); } /// Add a successor MBB to ParentMBB< creating a new MachineBB for BB if SuccMBB /// is 0. MachineBasicBlock * SelectionDAGBuilder::StackProtectorDescriptor:: AddSuccessorMBB(const BasicBlock *BB, MachineBasicBlock *ParentMBB, bool IsLikely, MachineBasicBlock *SuccMBB) { // If SuccBB has not been created yet, create it. if (!SuccMBB) { MachineFunction *MF = ParentMBB->getParent(); MachineFunction::iterator BBI(ParentMBB); SuccMBB = MF->CreateMachineBasicBlock(BB); MF->insert(++BBI, SuccMBB); } // Add it as a successor of ParentMBB. ParentMBB->addSuccessor( SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely)); return SuccMBB; } MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) { MachineFunction::iterator I(MBB); if (++I == FuncInfo.MF->end()) return nullptr; return &*I; } /// During lowering new call nodes can be created (such as memset, etc.). /// Those will become new roots of the current DAG, but complications arise /// when they are tail calls. In such cases, the call lowering will update /// the root, but the builder still needs to know that a tail call has been /// lowered in order to avoid generating an additional return. void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) { // If the node is null, we do have a tail call. if (MaybeTC.getNode() != nullptr) DAG.setRoot(MaybeTC); else HasTailCall = true; } uint64_t SelectionDAGBuilder::getJumpTableRange(const CaseClusterVector &Clusters, unsigned First, unsigned Last) const { assert(Last >= First); const APInt &LowCase = Clusters[First].Low->getValue(); const APInt &HighCase = Clusters[Last].High->getValue(); assert(LowCase.getBitWidth() == HighCase.getBitWidth()); // FIXME: A range of consecutive cases has 100% density, but only requires one // comparison to lower. We should discriminate against such consecutive ranges // in jump tables. return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1; } uint64_t SelectionDAGBuilder::getJumpTableNumCases( const SmallVectorImpl &TotalCases, unsigned First, unsigned Last) const { assert(Last >= First); assert(TotalCases[Last] >= TotalCases[First]); uint64_t NumCases = TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]); return NumCases; } bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, unsigned First, unsigned Last, const SwitchInst *SI, MachineBasicBlock *DefaultMBB, CaseCluster &JTCluster) { assert(First <= Last); auto Prob = BranchProbability::getZero(); unsigned NumCmps = 0; std::vector Table; DenseMap JTProbs; // Initialize probabilities in JTProbs. for (unsigned I = First; I <= Last; ++I) JTProbs[Clusters[I].MBB] = BranchProbability::getZero(); for (unsigned I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); Prob += Clusters[I].Prob; const APInt &Low = Clusters[I].Low->getValue(); const APInt &High = Clusters[I].High->getValue(); NumCmps += (Low == High) ? 1 : 2; if (I != First) { // Fill the gap between this and the previous cluster. 
const APInt &PreviousHigh = Clusters[I - 1].High->getValue(); assert(PreviousHigh.slt(Low)); uint64_t Gap = (Low - PreviousHigh).getLimitedValue() - 1; for (uint64_t J = 0; J < Gap; J++) Table.push_back(DefaultMBB); } uint64_t ClusterSize = (High - Low).getLimitedValue() + 1; for (uint64_t J = 0; J < ClusterSize; ++J) Table.push_back(Clusters[I].MBB); JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumDests = JTProbs.size(); if (TLI.isSuitableForBitTests( NumDests, NumCmps, Clusters[First].Low->getValue(), Clusters[Last].High->getValue(), DAG.getDataLayout())) { // Clusters[First..Last] should be lowered as bit tests instead. return false; } // Create the MBB that will load from and jump through the table. // Note: We create it here, but it's not inserted into the function yet. MachineFunction *CurMF = FuncInfo.MF; MachineBasicBlock *JumpTableMBB = CurMF->CreateMachineBasicBlock(SI->getParent()); // Add successors. Note: use table order for determinism. SmallPtrSet Done; for (MachineBasicBlock *Succ : Table) { if (Done.count(Succ)) continue; addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]); Done.insert(Succ); } JumpTableMBB->normalizeSuccProbs(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding()) ->createJumpTableIndex(Table); // Set up the jump table info. JumpTable JT(-1U, JTI, JumpTableMBB, nullptr); JumpTableHeader JTH(Clusters[First].Low->getValue(), Clusters[Last].High->getValue(), SI->getCondition(), nullptr, false); JTCases.emplace_back(std::move(JTH), std::move(JT)); JTCluster = CaseCluster::jumpTable(Clusters[First].Low, Clusters[Last].High, JTCases.size() - 1, Prob); return true; } void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI, MachineBasicBlock *DefaultMBB) { #ifndef NDEBUG // Clusters must be non-empty, sorted, and only contain Range clusters. assert(!Clusters.empty()); for (CaseCluster &C : Clusters) assert(C.Kind == CC_Range); for (unsigned i = 1, e = Clusters.size(); i < e; ++i) assert(Clusters[i - 1].High->getValue().slt(Clusters[i].Low->getValue())); #endif const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.areJTsAllowed(SI->getParent()->getParent())) return; const int64_t N = Clusters.size(); const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries(); const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2; if (N < 2 || N < MinJumpTableEntries) return; // TotalCases[i]: Total nbr of cases in Clusters[0..i]. SmallVector TotalCases(N); for (unsigned i = 0; i < N; ++i) { const APInt &Hi = Clusters[i].High->getValue(); const APInt &Lo = Clusters[i].Low->getValue(); TotalCases[i] = (Hi - Lo).getLimitedValue() + 1; if (i != 0) TotalCases[i] += TotalCases[i - 1]; } // Cheap case: the whole range may be suitable for jump table. uint64_t Range = getJumpTableRange(Clusters,0, N - 1); uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1); assert(NumCases < UINT64_MAX / 100); assert(Range >= NumCases); if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) { CaseCluster JTCluster; if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) { Clusters[0] = JTCluster; Clusters.resize(1); return; } } // The algorithm below is not suitable for -O0. if (TM.getOptLevel() == CodeGenOpt::None) return; // Split Clusters into minimum number of dense partitions. 
  // The algorithm uses the same idea as Kannan & Proebsting "Correction to
  // 'Producing Good Code for the Case Statement'" (1994), but builds the
  // MinPartitions array in reverse order to make it easier to reconstruct the
  // partitions in ascending order. In the choice between two optimal
  // partitionings, it picks the one which yields more jump tables.

  // MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].
  SmallVector<unsigned, 8> MinPartitions(N);
  // LastElement[i] is the last element of the partition starting at i.
  SmallVector<unsigned, 8> LastElement(N);
  // PartitionsScore[i] is used to break ties when choosing between two
  // partitionings resulting in the same number of partitions.
  SmallVector<unsigned, 8> PartitionsScore(N);
  // For PartitionsScore, a small number of comparisons is considered as good
  // as a jump table and a single comparison is considered better than a jump
  // table.
  enum PartitionScores : unsigned {
    NoTable = 0,
    Table = 1,
    FewCases = 1,
    SingleCase = 2
  };

  // Base case: There is only one way to partition Clusters[N-1].
  MinPartitions[N - 1] = 1;
  LastElement[N - 1] = N - 1;
  PartitionsScore[N - 1] = PartitionScores::SingleCase;

  // Note: loop indexes are signed to avoid underflow.
  for (int64_t i = N - 2; i >= 0; i--) {
    // Find optimal partitioning of Clusters[i..N-1].
    // Baseline: Put Clusters[i] into a partition on its own.
    MinPartitions[i] = MinPartitions[i + 1] + 1;
    LastElement[i] = i;
    PartitionsScore[i] = PartitionsScore[i + 1] + PartitionScores::SingleCase;

    // Search for a solution that results in fewer partitions.
    for (int64_t j = N - 1; j > i; j--) {
      // Try building a partition from Clusters[i..j].
      uint64_t Range = getJumpTableRange(Clusters, i, j);
      uint64_t NumCases = getJumpTableNumCases(TotalCases, i, j);
      assert(NumCases < UINT64_MAX / 100);
      assert(Range >= NumCases);
      if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
        unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
        unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
        int64_t NumEntries = j - i + 1;

        if (NumEntries == 1)
          Score += PartitionScores::SingleCase;
        else if (NumEntries <= SmallNumberOfEntries)
          Score += PartitionScores::FewCases;
        else if (NumEntries >= MinJumpTableEntries)
          Score += PartitionScores::Table;

        // If this leads to fewer partitions, or to the same number of
        // partitions with better score, it is a better partitioning.
        if (NumPartitions < MinPartitions[i] ||
            (NumPartitions == MinPartitions[i] &&
             Score > PartitionsScore[i])) {
          MinPartitions[i] = NumPartitions;
          LastElement[i] = j;
          PartitionsScore[i] = Score;
        }
      }
    }
  }

  // Iterate over the partitions, replacing some with jump tables in-place.
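  // Each partition that is large enough (>= MinJumpTableEntries) and for which
  // buildJumpTable succeeds is collapsed into a single CC_JumpTable cluster;
  // all other clusters are copied through unchanged and the cluster vector is
  // compacted in place.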
unsigned DstIndex = 0; for (unsigned First = 0, Last; First < N; First = Last + 1) { Last = LastElement[First]; assert(Last >= First); assert(DstIndex <= First); unsigned NumClusters = Last - First + 1; CaseCluster JTCluster; if (NumClusters >= MinJumpTableEntries && buildJumpTable(Clusters, First, Last, SI, DefaultMBB, JTCluster)) { Clusters[DstIndex++] = JTCluster; } else { for (unsigned I = First; I <= Last; ++I) std::memmove(&Clusters[DstIndex++], &Clusters[I], sizeof(Clusters[I])); } } Clusters.resize(DstIndex); } bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last, const SwitchInst *SI, CaseCluster &BTCluster) { assert(First <= Last); if (First == Last) return false; BitVector Dests(FuncInfo.MF->getNumBlockIDs()); unsigned NumCmps = 0; for (int64_t I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); Dests.set(Clusters[I].MBB->getNumber()); NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2; } unsigned NumDests = Dests.count(); APInt Low = Clusters[First].Low->getValue(); APInt High = Clusters[Last].High->getValue(); assert(Low.slt(High)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); if (!TLI.isSuitableForBitTests(NumDests, NumCmps, Low, High, DL)) return false; APInt LowBound; APInt CmpRange; const int BitWidth = TLI.getPointerTy(DL).getSizeInBits(); assert(TLI.rangeFitsInWord(Low, High, DL) && "Case range must fit in bit mask!"); // Check if the clusters cover a contiguous range such that no value in the // range will jump to the default statement. bool ContiguousRange = true; for (int64_t I = First + 1; I <= Last; ++I) { if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) { ContiguousRange = false; break; } } if (Low.isStrictlyPositive() && High.slt(BitWidth)) { // Optimize the case where all the case values fit in a word without having // to subtract minValue. In this case, we can optimize away the subtraction. LowBound = APInt::getNullValue(Low.getBitWidth()); CmpRange = High; ContiguousRange = false; } else { LowBound = Low; CmpRange = High - Low; } CaseBitsVector CBV; auto TotalProb = BranchProbability::getZero(); for (unsigned i = First; i <= Last; ++i) { // Find the CaseBits for this destination. unsigned j; for (j = 0; j < CBV.size(); ++j) if (CBV[j].BB == Clusters[i].MBB) break; if (j == CBV.size()) CBV.push_back( CaseBits(0, Clusters[i].MBB, 0, BranchProbability::getZero())); CaseBits *CB = &CBV[j]; // Update Mask, Bits and ExtraProb. uint64_t Lo = (Clusters[i].Low->getValue() - LowBound).getZExtValue(); uint64_t Hi = (Clusters[i].High->getValue() - LowBound).getZExtValue(); assert(Hi >= Lo && Hi < 64 && "Invalid bit case!"); CB->Mask |= (-1ULL >> (63 - (Hi - Lo))) << Lo; CB->Bits += Hi - Lo + 1; CB->ExtraProb += Clusters[i].Prob; TotalProb += Clusters[i].Prob; } BitTestInfo BTI; llvm::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) { // Sort by probability first, number of bits second, bit mask third. 
if (a.ExtraProb != b.ExtraProb) return a.ExtraProb > b.ExtraProb; if (a.Bits != b.Bits) return a.Bits > b.Bits; return a.Mask < b.Mask; }); for (auto &CB : CBV) { MachineBasicBlock *BitTestBB = FuncInfo.MF->CreateMachineBasicBlock(SI->getParent()); BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb)); } BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange), SI->getCondition(), -1U, MVT::Other, false, ContiguousRange, nullptr, nullptr, std::move(BTI), TotalProb); BTCluster = CaseCluster::bitTests(Clusters[First].Low, Clusters[Last].High, BitTestCases.size() - 1, TotalProb); return true; } void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters, const SwitchInst *SI) { // Partition Clusters into as few subsets as possible, where each subset has a // range that fits in a machine word and has <= 3 unique destinations. #ifndef NDEBUG // Clusters must be sorted and contain Range or JumpTable clusters. assert(!Clusters.empty()); assert(Clusters[0].Kind == CC_Range || Clusters[0].Kind == CC_JumpTable); for (const CaseCluster &C : Clusters) assert(C.Kind == CC_Range || C.Kind == CC_JumpTable); for (unsigned i = 1; i < Clusters.size(); ++i) assert(Clusters[i-1].High->getValue().slt(Clusters[i].Low->getValue())); #endif // The algorithm below is not suitable for -O0. if (TM.getOptLevel() == CodeGenOpt::None) return; // If target does not have legal shift left, do not emit bit tests at all. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const DataLayout &DL = DAG.getDataLayout(); EVT PTy = TLI.getPointerTy(DL); if (!TLI.isOperationLegal(ISD::SHL, PTy)) return; int BitWidth = PTy.getSizeInBits(); const int64_t N = Clusters.size(); // MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1]. SmallVector MinPartitions(N); // LastElement[i] is the last element of the partition starting at i. SmallVector LastElement(N); // FIXME: This might not be the best algorithm for finding bit test clusters. // Base case: There is only one way to partition Clusters[N-1]. MinPartitions[N - 1] = 1; LastElement[N - 1] = N - 1; // Note: loop indexes are signed to avoid underflow. for (int64_t i = N - 2; i >= 0; --i) { // Find optimal partitioning of Clusters[i..N-1]. // Baseline: Put Clusters[i] into a partition on its own. MinPartitions[i] = MinPartitions[i + 1] + 1; LastElement[i] = i; // Search for a solution that results in fewer partitions. // Note: the search is limited by BitWidth, reducing time complexity. for (int64_t j = std::min(N - 1, i + BitWidth - 1); j > i; --j) { // Try building a partition from Clusters[i..j]. // Check the range. if (!TLI.rangeFitsInWord(Clusters[i].Low->getValue(), Clusters[j].High->getValue(), DL)) continue; // Check nbr of destinations and cluster types. // FIXME: This works, but doesn't seem very efficient. bool RangesOnly = true; BitVector Dests(FuncInfo.MF->getNumBlockIDs()); for (int64_t k = i; k <= j; k++) { if (Clusters[k].Kind != CC_Range) { RangesOnly = false; break; } Dests.set(Clusters[k].MBB->getNumber()); } if (!RangesOnly || Dests.count() > 3) break; // Check if it's a better partition. unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]); if (NumPartitions < MinPartitions[i]) { // Found a better partition. MinPartitions[i] = NumPartitions; LastElement[i] = j; } } } // Iterate over the partitions, replacing with bit-test clusters in-place. 
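  // E.g. a partition covering cases {0,2,4}->BB1 and {1,5}->BB2 fits in one
  // machine word, so it can be lowered roughly as a test of (1 << x) against
  // BB1's mask 0b010101 followed by a test against BB2's mask 0b100010,
  // instead of five separate compares.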
unsigned DstIndex = 0; for (unsigned First = 0, Last; First < N; First = Last + 1) { Last = LastElement[First]; assert(First <= Last); assert(DstIndex <= First); CaseCluster BitTestCluster; if (buildBitTests(Clusters, First, Last, SI, BitTestCluster)) { Clusters[DstIndex++] = BitTestCluster; } else { size_t NumClusters = Last - First + 1; std::memmove(&Clusters[DstIndex], &Clusters[First], sizeof(Clusters[0]) * NumClusters); DstIndex += NumClusters; } } Clusters.resize(DstIndex); } void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, MachineBasicBlock *SwitchMBB, MachineBasicBlock *DefaultMBB) { MachineFunction *CurMF = FuncInfo.MF; MachineBasicBlock *NextMBB = nullptr; MachineFunction::iterator BBI(W.MBB); if (++BBI != FuncInfo.MF->end()) NextMBB = &*BBI; unsigned Size = W.LastCluster - W.FirstCluster + 1; BranchProbabilityInfo *BPI = FuncInfo.BPI; if (Size == 2 && W.MBB == SwitchMBB) { // If any two of the cases has the same destination, and if one value // is the same as the other, but has one bit unset that the other has set, // use bit manipulation to do two compares at once. For example: // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)" // TODO: This could be extended to merge any 2 cases in switches with 3 // cases. // TODO: Handle cases where W.CaseBB != SwitchBB. CaseCluster &Small = *W.FirstCluster; CaseCluster &Big = *W.LastCluster; if (Small.Low == Small.High && Big.Low == Big.High && Small.MBB == Big.MBB) { const APInt &SmallValue = Small.Low->getValue(); const APInt &BigValue = Big.Low->getValue(); // Check that there is only one bit different. APInt CommonBit = BigValue ^ SmallValue; if (CommonBit.isPowerOf2()) { SDValue CondLHS = getValue(Cond); EVT VT = CondLHS.getValueType(); SDLoc DL = getCurSDLoc(); SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS, DAG.getConstant(CommonBit, DL, VT)); SDValue Cond = DAG.getSetCC( DL, MVT::i1, Or, DAG.getConstant(BigValue | SmallValue, DL, VT), ISD::SETEQ); // Update successor info. // Both Small and Big will jump to Small.BB, so we sum up the // probabilities. addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob); if (BPI) addSuccessorWithProb( SwitchMBB, DefaultMBB, // The default destination is the first successor in IR. BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0)); else addSuccessorWithProb(SwitchMBB, DefaultMBB); // Insert the true branch. SDValue BrCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(), Cond, DAG.getBasicBlock(Small.MBB)); // Insert the false branch. BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond, DAG.getBasicBlock(DefaultMBB)); DAG.setRoot(BrCond); return; } } } if (TM.getOptLevel() != CodeGenOpt::None) { // Here, we order cases by probability so the most likely case will be // checked first. However, two clusters can have the same probability in // which case their relative ordering is non-deterministic. So we use Low // as a tie-breaker as clusters are guaranteed to never overlap. llvm::sort(W.FirstCluster, W.LastCluster + 1, [](const CaseCluster &a, const CaseCluster &b) { return a.Prob != b.Prob ? a.Prob > b.Prob : a.Low->getValue().slt(b.Low->getValue()); }); // Rearrange the case blocks so that the last one falls through if possible // without changing the order of probabilities. for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) { --I; if (I->Prob > W.LastCluster->Prob) break; if (I->Kind == CC_Range && I->MBB == NextMBB) { std::swap(*I, *W.LastCluster); break; } } } // Compute total probability. 
BranchProbability DefaultProb = W.DefaultProb; BranchProbability UnhandledProbs = DefaultProb; for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I) UnhandledProbs += I->Prob; MachineBasicBlock *CurMBB = W.MBB; for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) { MachineBasicBlock *Fallthrough; if (I == W.LastCluster) { // For the last cluster, fall through to the default destination. Fallthrough = DefaultMBB; } else { Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock()); CurMF->insert(BBI, Fallthrough); // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } UnhandledProbs -= I->Prob; switch (I->Kind) { case CC_JumpTable: { // FIXME: Optimize away range check based on pivot comparisons. JumpTableHeader *JTH = &JTCases[I->JTCasesIndex].first; JumpTable *JT = &JTCases[I->JTCasesIndex].second; // The jump block hasn't been inserted yet; insert it here. MachineBasicBlock *JumpMBB = JT->MBB; CurMF->insert(BBI, JumpMBB); auto JumpProb = I->Prob; auto FallthroughProb = UnhandledProbs; // If the default statement is a target of the jump table, we evenly // distribute the default probability to successors of CurMBB. Also // update the probability on the edge from JumpMBB to Fallthrough. for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(), SE = JumpMBB->succ_end(); SI != SE; ++SI) { if (*SI == DefaultMBB) { JumpProb += DefaultProb / 2; FallthroughProb -= DefaultProb / 2; JumpMBB->setSuccProbability(SI, DefaultProb / 2); JumpMBB->normalizeSuccProbs(); break; } } addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb); addSuccessorWithProb(CurMBB, JumpMBB, JumpProb); CurMBB->normalizeSuccProbs(); // The jump table header will be inserted in our current block, do the // range check, and fall through to our fallthrough block. JTH->HeaderBB = CurMBB; JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader. // If we're in the right place, emit the jump table header right now. if (CurMBB == SwitchMBB) { visitJumpTableHeader(*JT, *JTH, SwitchMBB); JTH->Emitted = true; } break; } case CC_BitTests: { // FIXME: Optimize away range check based on pivot comparisons. BitTestBlock *BTB = &BitTestCases[I->BTCasesIndex]; // The bit test blocks haven't been inserted yet; insert them here. for (BitTestCase &BTC : BTB->Cases) CurMF->insert(BBI, BTC.ThisBB); // Fill in fields of the BitTestBlock. BTB->Parent = CurMBB; BTB->Default = Fallthrough; BTB->DefaultProb = UnhandledProbs; // If the cases in bit test don't form a contiguous range, we evenly // distribute the probability on the edge to Fallthrough to two // successors of CurMBB. if (!BTB->ContiguousRange) { BTB->Prob += DefaultProb / 2; BTB->DefaultProb -= DefaultProb / 2; } // If we're in the right place, emit the bit test header right now. if (CurMBB == SwitchMBB) { visitBitTestHeader(*BTB, SwitchMBB); BTB->Emitted = true; } break; } case CC_Range: { const Value *RHS, *LHS, *MHS; ISD::CondCode CC; if (I->Low == I->High) { // Check Cond == I->Low. CC = ISD::SETEQ; LHS = Cond; RHS=I->Low; MHS = nullptr; } else { // Check I->Low <= Cond <= I->High. CC = ISD::SETLE; LHS = I->Low; MHS = Cond; RHS = I->High; } // The false probability is the sum of all unhandled cases. 
CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, getCurSDLoc(), I->Prob, UnhandledProbs); if (CurMBB == SwitchMBB) visitSwitchCase(CB, SwitchMBB); else SwitchCases.push_back(CB); break; } } CurMBB = Fallthrough; } } unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC, CaseClusterIt First, CaseClusterIt Last) { return std::count_if(First, Last + 1, [&](const CaseCluster &X) { if (X.Prob != CC.Prob) return X.Prob > CC.Prob; // Ties are broken by comparing the case value. return X.Low->getValue().slt(CC.Low->getValue()); }); } void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, const SwitchWorkListItem &W, Value *Cond, MachineBasicBlock *SwitchMBB) { assert(W.FirstCluster->Low->getValue().slt(W.LastCluster->Low->getValue()) && "Clusters not sorted?"); assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!"); // Balance the tree based on branch probabilities to create a near-optimal (in // terms of search time given key frequency) binary search tree. See e.g. Kurt // Mehlhorn "Nearly Optimal Binary Search Trees" (1975). CaseClusterIt LastLeft = W.FirstCluster; CaseClusterIt FirstRight = W.LastCluster; auto LeftProb = LastLeft->Prob + W.DefaultProb / 2; auto RightProb = FirstRight->Prob + W.DefaultProb / 2; // Move LastLeft and FirstRight towards each other from opposite directions to // find a partitioning of the clusters which balances the probability on both // sides. If LeftProb and RightProb are equal, alternate which side is // taken to ensure 0-probability nodes are distributed evenly. unsigned I = 0; while (LastLeft + 1 < FirstRight) { if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1))) LeftProb += (++LastLeft)->Prob; else RightProb += (--FirstRight)->Prob; I++; } while (true) { // Our binary search tree differs from a typical BST in that ours can have up // to three values in each leaf. The pivot selection above doesn't take that // into account, which means the tree might require more nodes and be less // efficient. We compensate for this here. unsigned NumLeft = LastLeft - W.FirstCluster + 1; unsigned NumRight = W.LastCluster - FirstRight + 1; if (std::min(NumLeft, NumRight) < 3 && std::max(NumLeft, NumRight) > 3) { // If one side has less than 3 clusters, and the other has more than 3, // consider taking a cluster from the other side. if (NumLeft < NumRight) { // Consider moving the first cluster on the right to the left side. CaseCluster &CC = *FirstRight; unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster); unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft); if (LeftSideRank <= RightSideRank) { // Moving the cluster to the left does not demote it. ++LastLeft; ++FirstRight; continue; } } else { assert(NumRight < NumLeft); // Consider moving the last element on the left to the right side. CaseCluster &CC = *LastLeft; unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft); unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster); if (RightSideRank <= LeftSideRank) { // Moving the cluster to the right does not demot it. --LastLeft; --FirstRight; continue; } } } break; } assert(LastLeft + 1 == FirstRight); assert(LastLeft >= W.FirstCluster); assert(FirstRight <= W.LastCluster); // Use the first element on the right as pivot since we will make less-than // comparisons against it. 
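// The two-pointer scan above, reduced to its core: grow whichever side is
// currently lighter, counting half of the default probability on each side,
// until only the split point remains. Standalone sketch over doubles (the
// real code walks CaseCluster iterators and uses BranchProbability; the names
// here are illustrative only):
#include <cstddef>
#include <vector>
// Returns the index of the first cluster of the right-hand half.
// Precondition: Prob.size() >= 2 (mirrors the "Too small to split!" assert).
static size_t balancedSplit(const std::vector<double> &Prob, double DefaultProb) {
  size_t LastLeft = 0, FirstRight = Prob.size() - 1;
  double LeftProb = Prob[LastLeft] + DefaultProb / 2;
  double RightProb = Prob[FirstRight] + DefaultProb / 2;
  unsigned I = 0;
  while (LastLeft + 1 < FirstRight) {
    // Alternate on exact ties so zero-probability clusters spread evenly.
    if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1)))
      LeftProb += Prob[++LastLeft];
    else
      RightProb += Prob[--FirstRight];
    ++I;
  }
  return FirstRight; // this element becomes the pivot picked just below
}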
CaseClusterIt PivotCluster = FirstRight; assert(PivotCluster > W.FirstCluster); assert(PivotCluster <= W.LastCluster); CaseClusterIt FirstLeft = W.FirstCluster; CaseClusterIt LastRight = W.LastCluster; const ConstantInt *Pivot = PivotCluster->Low; // New blocks will be inserted immediately after the current one. MachineFunction::iterator BBI(W.MBB); ++BBI; // We will branch to the LHS if Value < Pivot. If LHS is a single cluster, // we can branch to its destination directly if it's squeezed exactly in // between the known lower bound and Pivot - 1. MachineBasicBlock *LeftMBB; if (FirstLeft == LastLeft && FirstLeft->Kind == CC_Range && FirstLeft->Low == W.GE && (FirstLeft->High->getValue() + 1LL) == Pivot->getValue()) { LeftMBB = FirstLeft->MBB; } else { LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock()); FuncInfo.MF->insert(BBI, LeftMBB); WorkList.push_back( {LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2}); // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } // Similarly, we will branch to the RHS if Value >= Pivot. If RHS is a // single cluster, RHS.Low == Pivot, and we can branch to its destination // directly if RHS.High equals the current upper bound. MachineBasicBlock *RightMBB; if (FirstRight == LastRight && FirstRight->Kind == CC_Range && W.LT && (FirstRight->High->getValue() + 1ULL) == W.LT->getValue()) { RightMBB = FirstRight->MBB; } else { RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock()); FuncInfo.MF->insert(BBI, RightMBB); WorkList.push_back( {RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2}); // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } // Create the CaseBlock record that will be used to lower the branch. CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB, getCurSDLoc(), LeftProb, RightProb); if (W.MBB == SwitchMBB) visitSwitchCase(CB, SwitchMBB); else SwitchCases.push_back(CB); } // Scale CaseProb after peeling a case with the probablity of PeeledCaseProb // from the swith statement. static BranchProbability scaleCaseProbality(BranchProbability CaseProb, BranchProbability PeeledCaseProb) { if (PeeledCaseProb == BranchProbability::getOne()) return BranchProbability::getZero(); BranchProbability SwitchProb = PeeledCaseProb.getCompl(); uint32_t Numerator = CaseProb.getNumerator(); uint32_t Denominator = SwitchProb.scale(CaseProb.getDenominator()); return BranchProbability(Numerator, std::max(Numerator, Denominator)); } // Try to peel the top probability case if it exceeds the threshold. // Return current MachineBasicBlock for the switch statement if the peeling // does not occur. // If the peeling is performed, return the newly created MachineBasicBlock // for the peeled switch statement. Also update Clusters to remove the peeled // case. PeeledCaseProb is the BranchProbability for the peeled case. MachineBasicBlock *SelectionDAGBuilder::peelDominantCaseCluster( const SwitchInst &SI, CaseClusterVector &Clusters, BranchProbability &PeeledCaseProb) { MachineBasicBlock *SwitchMBB = FuncInfo.MBB; // Don't perform if there is only one cluster or optimizing for size. 
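// What scaleCaseProbality above computes, in plain fractions: once a dominant
// case with probability P has been peeled off, the residual switch is only
// reached with probability (1 - P), so each remaining case probability C is
// renormalized to C / (1 - P). Sketch with doubles (illustrative only; the
// real code stays in BranchProbability's fixed-point representation):
#include <algorithm>
static double scaleAfterPeel(double CaseProb, double PeeledCaseProb) {
  if (PeeledCaseProb >= 1.0)
    return 0.0;                            // the peeled case took everything
  return std::min(1.0, CaseProb / (1.0 - PeeledCaseProb));
}
// Example: probabilities {0.6, 0.3, 0.1}; peeling the 0.6 case leaves
// {0.3, 0.1}, which rescale to {0.75, 0.25} inside the residual switch.
// Peeling itself is guarded just below (single-cluster, minsize, and
// CodeGenOpt::None switches are left alone).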
if (SwitchPeelThreshold > 100 || !FuncInfo.BPI || Clusters.size() < 2 || TM.getOptLevel() == CodeGenOpt::None || SwitchMBB->getParent()->getFunction().optForMinSize()) return SwitchMBB; BranchProbability TopCaseProb = BranchProbability(SwitchPeelThreshold, 100); unsigned PeeledCaseIndex = 0; bool SwitchPeeled = false; for (unsigned Index = 0; Index < Clusters.size(); ++Index) { CaseCluster &CC = Clusters[Index]; if (CC.Prob < TopCaseProb) continue; TopCaseProb = CC.Prob; PeeledCaseIndex = Index; SwitchPeeled = true; } if (!SwitchPeeled) return SwitchMBB; LLVM_DEBUG(dbgs() << "Peeled one top case in switch stmt, prob: " << TopCaseProb << "\n"); // Record the MBB for the peeled switch statement. MachineFunction::iterator BBI(SwitchMBB); ++BBI; MachineBasicBlock *PeeledSwitchMBB = FuncInfo.MF->CreateMachineBasicBlock(SwitchMBB->getBasicBlock()); FuncInfo.MF->insert(BBI, PeeledSwitchMBB); ExportFromCurrentBlock(SI.getCondition()); auto PeeledCaseIt = Clusters.begin() + PeeledCaseIndex; SwitchWorkListItem W = {SwitchMBB, PeeledCaseIt, PeeledCaseIt, nullptr, nullptr, TopCaseProb.getCompl()}; lowerWorkItem(W, SI.getCondition(), SwitchMBB, PeeledSwitchMBB); Clusters.erase(PeeledCaseIt); for (CaseCluster &CC : Clusters) { LLVM_DEBUG( dbgs() << "Scale the probablity for one cluster, before scaling: " << CC.Prob << "\n"); CC.Prob = scaleCaseProbality(CC.Prob, TopCaseProb); LLVM_DEBUG(dbgs() << "After scaling: " << CC.Prob << "\n"); } PeeledCaseProb = TopCaseProb; return PeeledSwitchMBB; } void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { // Extract cases from the switch. BranchProbabilityInfo *BPI = FuncInfo.BPI; CaseClusterVector Clusters; Clusters.reserve(SI.getNumCases()); for (auto I : SI.cases()) { MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()]; const ConstantInt *CaseVal = I.getCaseValue(); BranchProbability Prob = BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex()) : BranchProbability(1, SI.getNumCases() + 1); Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob)); } MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()]; // Cluster adjacent cases with the same destination. We do this at all // optimization levels because it's cheap to do and will make codegen faster // if there are many clusters. sortAndRangeify(Clusters); if (TM.getOptLevel() != CodeGenOpt::None) { // Replace an unreachable default with the most popular destination. // FIXME: Exploit unreachable default more aggressively. bool UnreachableDefault = isa<UnreachableInst>(SI.getDefaultDest()->getFirstNonPHIOrDbg()); if (UnreachableDefault && !Clusters.empty()) { DenseMap<const BasicBlock *, unsigned> Popularity; unsigned MaxPop = 0; const BasicBlock *MaxBB = nullptr; for (auto I : SI.cases()) { const BasicBlock *BB = I.getCaseSuccessor(); if (++Popularity[BB] > MaxPop) { MaxPop = Popularity[BB]; MaxBB = BB; } } // Set new default. assert(MaxPop > 0 && MaxBB); DefaultMBB = FuncInfo.MBBMap[MaxBB]; // Remove cases that were pointing to the destination that is now the // default. CaseClusterVector New; New.reserve(Clusters.size()); for (CaseCluster &CC : Clusters) { if (CC.MBB != DefaultMBB) New.push_back(CC); } Clusters = std::move(New); } } // The branch probability of the peeled case. BranchProbability PeeledCaseProb = BranchProbability::getZero(); MachineBasicBlock *PeeledSwitchMBB = peelDominantCaseCluster(SI, Clusters, PeeledCaseProb); // If there is only the default destination, jump there directly.
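// The unreachable-default replacement above is a plain majority count over
// the case successors. Standalone sketch with strings standing in for basic
// blocks (illustrative only; assumes a non-empty successor list):
#include <map>
#include <string>
#include <vector>
static std::string mostPopularSuccessor(const std::vector<std::string> &Succs) {
  std::map<std::string, unsigned> Popularity;
  std::string MaxBB;
  unsigned MaxPop = 0;
  for (const std::string &BB : Succs)
    if (++Popularity[BB] > MaxPop) {
      MaxPop = Popularity[BB];
      MaxBB = BB;
    }
  return MaxBB; // cases already targeting this block can then be dropped
}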
MachineBasicBlock *SwitchMBB = FuncInfo.MBB; if (Clusters.empty()) { assert(PeeledSwitchMBB == SwitchMBB); SwitchMBB->addSuccessor(DefaultMBB); if (DefaultMBB != NextBlock(SwitchMBB)) { DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(), DAG.getBasicBlock(DefaultMBB))); } return; } findJumpTables(Clusters, &SI, DefaultMBB); findBitTestClusters(Clusters, &SI); LLVM_DEBUG({ dbgs() << "Case clusters: "; for (const CaseCluster &C : Clusters) { if (C.Kind == CC_JumpTable) dbgs() << "JT:"; if (C.Kind == CC_BitTests) dbgs() << "BT:"; C.Low->getValue().print(dbgs(), true); if (C.Low != C.High) { dbgs() << '-'; C.High->getValue().print(dbgs(), true); } dbgs() << ' '; } dbgs() << '\n'; }); assert(!Clusters.empty()); SwitchWorkList WorkList; CaseClusterIt First = Clusters.begin(); CaseClusterIt Last = Clusters.end() - 1; auto DefaultProb = getEdgeProbability(PeeledSwitchMBB, DefaultMBB); // Scale the branchprobability for DefaultMBB if the peel occurs and // DefaultMBB is not replaced. if (PeeledCaseProb != BranchProbability::getZero() && DefaultMBB == FuncInfo.MBBMap[SI.getDefaultDest()]) DefaultProb = scaleCaseProbality(DefaultProb, PeeledCaseProb); WorkList.push_back( {PeeledSwitchMBB, First, Last, nullptr, nullptr, DefaultProb}); while (!WorkList.empty()) { SwitchWorkListItem W = WorkList.back(); WorkList.pop_back(); unsigned NumClusters = W.LastCluster - W.FirstCluster + 1; if (NumClusters > 3 && TM.getOptLevel() != CodeGenOpt::None && !DefaultMBB->getParent()->getFunction().optForMinSize()) { // For optimized builds, lower large range as a balanced binary tree. splitWorkItem(WorkList, W, SI.getCondition(), SwitchMBB); continue; } lowerWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB); } } Index: vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCISelLowering.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/Target/PowerPC/PPCISelLowering.cpp (revision 338000) @@ -1,14190 +1,14197 @@ //===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the PPCISelLowering class. 
// //===----------------------------------------------------------------------===// #include "PPCISelLowering.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPC.h" #include "PPCCCState.h" #include "PPCCallingConv.h" #include "PPCFrameLowering.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCRegisterInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "ppc-lowering" static cl::opt DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); static cl::opt DisableILPPref("disable-ppc-ilp-pref", cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden); static cl::opt DisablePPCUnaligned("disable-ppc-unaligned", cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); static cl::opt DisableSCO("disable-ppc-sco", cl::desc("disable sibling call optimization on ppc"), cl::Hidden); static cl::opt EnableQuadPrecision("enable-ppc-quad-precision", cl::desc("enable quad precision float support on ppc"), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); 
STATISTIC(NumSiblingCalls, "Number of sibling calls"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. bool isPPC64 = Subtarget.isPPC64(); setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. addRegisterClass(MVT::i32, &PPC::GPRCRegClass); if (!useSoftFloat()) { if (hasSPE()) { addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass); addRegisterClass(MVT::f64, &PPC::SPERCRegClass); } else { addRegisterClass(MVT::f32, &PPC::F4RCRegClass); addRegisterClass(MVT::f64, &PPC::F8RCRegClass); } } // Match BITREVERSE to customized fast code sequence in the td file. setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); } setTruncStoreAction(MVT::f64, MVT::f32, Expand); // PowerPC has pre-inc load and store's. setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); if (!Subtarget.hasSPE()) { setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); } // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { setOperationAction(ISD::ADDC, VT, Legal); setOperationAction(ISD::ADDE, VT, Legal); setOperationAction(ISD::SUBC, VT, Legal); setOperationAction(ISD::SUBE, VT, Legal); } if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (isPPC64 || Subtarget.hasFPCVT()) { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); } else { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); } // PowerPC does not support direct load/store of condition registers. 
setOperationAction(ISD::LOAD, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); // FIXME: Remove this once the ANDI glue bug is fixed: if (ANDIGlueBug) setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); setTruncStoreAction(VT, MVT::i1, Expand); } addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); } // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom); // We do not currently implement these libm ops for PowerPC. setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); setOperationAction(ISD::FREM, MVT::ppcf128, Expand); // PowerPC has no SREM/UREM instructions unless we are on P9 // On P9 we may use a hardware instruction to compute the remainder. // The instructions are not legalized directly because in the cases where the // result of both the remainder and the division is required it is more // efficient to compute the remainder from the result of the division rather // than use the remainder instruction. if (Subtarget.isISA3_0()) { setOperationAction(ISD::SREM, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Custom); setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); } else { setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); } if (Subtarget.hasP9Vector()) { setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v16i8, Legal); } // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. 
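// The SREM/UREM note above, as scalar arithmetic: when both the quotient and
// the remainder of the same operands are needed, the remainder is one
// multiply-subtract away from the quotient that is being computed anyway, so
// a second divide-type instruction is avoided. Sketch (illustrative only;
// assumes B != 0):
#include <cstdint>
static inline int64_t remFromDiv(int64_t A, int64_t B, int64_t &Quot) {
  Quot = A / B;        // the division whose result is already required
  return A - Quot * B; // remainder reconstructed with a multiply and subtract
}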
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); // We don't support sin/cos/sqrt/fmod/pow setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); if (Subtarget.hasSPE()) { setOperationAction(ISD::FMA , MVT::f64, Expand); setOperationAction(ISD::FMA , MVT::f32, Expand); } else { setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FMA , MVT::f32, Legal); } setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); } else { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); } if (Subtarget.hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); } // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd // to speed up scalar BSWAP64. 
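// Why FRSQRTE/FRE make it reasonable to keep FSQRT legal under unsafe FP
// math, as chosen above: the low-precision hardware estimate is refined with
// Newton-Raphson steps, roughly the expansion the reciprocal-estimate
// combines build. One scalar refinement step for 1/sqrt(x) (illustrative
// sketch only):
static inline double refineRsqrt(double X, double Est) {
  // Est' = Est * (1.5 - 0.5 * X * Est * Est)
  return Est * (1.5 - 0.5 * X * Est * Est);
}
// sqrt(x) is then approximated as x * rsqrt(x), with the number of refinement
// steps chosen to reach the required precision.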
// CTPOP or CTTZ were introduced in P8/P9 respectively setOperationAction(ISD::BSWAP, MVT::i32 , Expand); if (Subtarget.isISA3_0()) { setOperationAction(ISD::BSWAP, MVT::i64 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) { setOperationAction(ISD::CTPOP, MVT::i32 , Legal); setOperationAction(ISD::CTPOP, MVT::i64 , Legal); } else { setOperationAction(ISD::CTPOP, MVT::i32 , Expand); setOperationAction(ISD::CTPOP, MVT::i64 , Expand); } // PowerPC does not have ROTR setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); if (!Subtarget.useCRBits()) { // PowerPC does not have Select setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f64, Expand); } // PowerPC wants to turn select_cc of FP into fsel when possible. setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); // PowerPC wants to optimize integer setcc a bit if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); // PowerPC does not have BRCOND which requires SetCC if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); if (Subtarget.hasSPE()) { // SPE has built-in conversions setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); } else { // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); // PowerPC does not have [U|S]INT_TO_FP setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); } if (Subtarget.hasDirectMove() && isPPC64) { setOperationAction(ISD::BITCAST, MVT::f32, Legal); setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); setOperationAction(ISD::BITCAST, MVT::i64, Expand); setOperationAction(ISD::BITCAST, MVT::f64, Expand); } // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // We want to legalize GlobalAddress and ConstantPool nodes into the // appropriate instructions to materialize the address. 
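// "We cannot sextinreg(i1). Expand to shifts." above is the usual shift pair:
// move the interesting bit into the sign position, then arithmetic-shift it
// back down. Scalar sketch for the low bit of a 32-bit value (illustrative
// only):
#include <cstdint>
static inline int32_t signExtendFromBit0(uint32_t V) {
  return (int32_t)(V << 31) >> 31; // 0 -> 0, 1 -> -1 (all ones)
}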
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::JumpTable, MVT::i64, Custom); // TRAP is legal. setOperationAction(ISD::TRAP, MVT::Other, Legal); // TRAMPOLINE is custom lowered. setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); if (Subtarget.isSVR4ABI()) { if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); setOperationAction(ISD::VAARG, MVT::i8, Promote); AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); setOperationAction(ISD::VAARG, MVT::i16, Promote); AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); setOperationAction(ISD::VAARG, MVT::i32, Promote); AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); setOperationAction(ISD::VAARG, MVT::Other, Expand); } else { // VAARG is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::i64, Custom); } } else setOperationAction(ISD::VAARG, MVT::Other, Expand); if (Subtarget.isSVR4ABI() && !isPPC64) // VACOPY is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VACOPY , MVT::Other, Custom); else setOperationAction(ISD::VACOPY , MVT::Other, Expand); // Use the default implementation. setOperationAction(ISD::VAEND , MVT::Other, Expand); setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); // To handle counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); // Comparisons that require checking two conditions. 
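// "Comparisons that require checking two conditions" below: the floating
// point compare yields ordered less-than/greater-than/equal outcomes plus an
// unordered indication, so mixed predicates such as SETONE or SETUEQ are
// assembled from two of those checks. Scalar sketch with plain doubles
// (illustrative only):
static inline bool isUnordered(double A, double B) { return A != A || B != B; }
static inline bool setONE(double A, double B) { // ordered and not equal
  return A < B || A > B;
}
static inline bool setUEQ(double A, double B) { // unordered or equal
  return isUnordered(A, B) || A == B;
}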
if (Subtarget.hasSPE()) { setCondCodeAction(ISD::SETO, MVT::f32, Expand); setCondCodeAction(ISD::SETO, MVT::f64, Expand); setCondCodeAction(ISD::SETUO, MVT::f32, Expand); setCondCodeAction(ISD::SETUO, MVT::f64, Expand); } setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); // This is just the low 32 bits of a (signed) fp->i64 conversion. // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. if (Subtarget.hasSPE()) setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); else setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); } // With the instructions enabled under FPCVT, we can do everything. if (Subtarget.hasFPCVT()) { if (Subtarget.has64BitSupport()) { setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); } if (Subtarget.use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // 64-bit PowerPC wants to expand i128 shifts itself. setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); } else { // 32-bit PowerPC wants to expand i64 shifts itself. setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. 
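// "32-bit PowerPC wants to expand i64 shifts itself" above: the *_PARTS nodes
// express a 64-bit shift in terms of the two 32-bit halves. Portable sketch
// of a left shift from parts (illustrative only; the actual lowering handles
// the variable amount without branching):
#include <cstdint>
static inline void shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                              uint32_t &OutLo, uint32_t &OutHi) {
  Amt &= 63;
  if (Amt == 0) {
    OutLo = Lo;
    OutHi = Hi;
  } else if (Amt < 32) {
    OutHi = (Hi << Amt) | (Lo >> (32 - Amt));
    OutLo = Lo << Amt;
  } else {
    OutHi = Lo << (Amt - 32);
    OutLo = 0;
  }
}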
setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { setOperationAction(ISD::CTPOP, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); } else { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); } // Vector instructions introduced in P9 if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128)) setOperationAction(ISD::CTTZ, VT, Legal); else setOperationAction(ISD::CTTZ, VT, Expand); // We promote all shuffles to v16i8. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); // We promote all non-typed operations to v4i32. setOperationAction(ISD::AND , VT, Promote); AddPromotedToType (ISD::AND , VT, MVT::v4i32); setOperationAction(ISD::OR , VT, Promote); AddPromotedToType (ISD::OR , VT, MVT::v4i32); setOperationAction(ISD::XOR , VT, Promote); AddPromotedToType (ISD::XOR , VT, MVT::v4i32); setOperationAction(ISD::LOAD , VT, Promote); AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); setOperationAction(ISD::SELECT_CC, VT, Promote); AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); setOperationAction(ISD::STORE, VT, Promote); AddPromotedToType (ISD::STORE, VT, MVT::v4i32); // No other operations are legal. setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle // with merges, splats, etc. 
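// "We promote all shuffles to v16i8" above: an element-level shuffle mask is
// rewritten as the byte-level mask that a 16-byte permute consumes. Sketch of
// the widening for 4 x 32-bit elements (illustrative only; assumes the
// big-endian lane numbering of classic PPC):
#include <array>
static std::array<unsigned char, 16>
widenShuffleMaskToBytes(const std::array<int, 4> &ElemMask) {
  std::array<unsigned char, 16> ByteMask{};
  for (int I = 0; I < 4; ++I)
    for (int B = 0; B < 4; ++B)
      ByteMask[4 * I + B] = (unsigned char)(4 * ElemMask[I] + B);
  return ByteMask;
}
// Element mask {4, 1, 6, 3} (selecting lanes across two v4i32 inputs) becomes
// byte mask {16..19, 4..7, 24..27, 12..15}.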
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); setOperationAction(ISD::AND , MVT::v4i32, Legal); setOperationAction(ISD::OR , MVT::v4i32, Legal); setOperationAction(ISD::XOR , MVT::v4i32, Legal); setOperationAction(ISD::LOAD , MVT::v4i32, Legal); setOperationAction(ISD::SELECT, MVT::v4i32, Subtarget.useCRBits() ? Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } if (Subtarget.hasP8Altivec()) setOperationAction(ISD::MUL, MVT::v4i32, Legal); else setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); if (Subtarget.hasP8Vector()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); } if (Subtarget.hasDirectMove() && isPPC64) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); setOperationAction(ISD::MUL, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); // Share the Altivec comparison restrictions. setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::STORE, MVT::v2f64, Legal); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); if (Subtarget.hasP8Vector()) addRegisterClass(MVT::f32, &PPC::VSSRCRegClass); addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); if (Subtarget.hasP8Altivec()) { setOperationAction(ISD::SHL, MVT::v2i64, Legal); setOperationAction(ISD::SRA, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v2i64, Legal); // 128 bit shifts can be accomplished via 3 instructions for SHL and // SRL, but not for SRA because of the instructions available: // VS{RL} and VS{RL}O. However due to direct move costs, it's not worth // doing setOperationAction(ISD::SHL, MVT::v1i128, Expand); setOperationAction(ISD::SRL, MVT::v1i128, Expand); setOperationAction(ISD::SRA, MVT::v1i128, Expand); setOperationAction(ISD::SETCC, MVT::v2i64, Legal); } else { setOperationAction(ISD::SHL, MVT::v2i64, Expand); setOperationAction(ISD::SRA, MVT::v2i64, Expand); setOperationAction(ISD::SRL, MVT::v2i64, Expand); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); // VSX v2i64 only supports non-arithmetic operations. setOperationAction(ISD::ADD, MVT::v2i64, Expand); setOperationAction(ISD::SUB, MVT::v2i64, Expand); } setOperationAction(ISD::LOAD, MVT::v2i64, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::STORE, MVT::v2i64, Promote); AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); // Vector operation legalization checks the result type of // SIGN_EXTEND_INREG, overall legalization checks the inner type. 
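// SIGN_EXTEND_INREG, whose legality is configured just below, sign-extends a
// value already held in a wider register from one of its narrower subtypes.
// Per-lane scalar sketch for "i16 held in i64" (illustrative only):
#include <cstdint>
static inline int64_t signExtendInRegFromI16(int64_t V) {
  return (int64_t)((uint64_t)V << 48) >> 48; // keep the low 16 bits, copy their sign
}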
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::FNEG, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Legal); setOperationAction(ISD::FABS, MVT::v4f32, Legal); setOperationAction(ISD::FABS, MVT::v2f64, Legal); if (Subtarget.hasDirectMove()) setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); } if (Subtarget.hasP8Altivec()) { addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass); addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass); } if (Subtarget.hasP9Vector()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // 128 bit shifts can be accomplished via 3 instructions for SHL and // SRL, but not for SRA because of the instructions available: // VS{RL} and VS{RL}O. setOperationAction(ISD::SHL, MVT::v1i128, Legal); setOperationAction(ISD::SRL, MVT::v1i128, Legal); setOperationAction(ISD::SRA, MVT::v1i128, Expand); if (EnableQuadPrecision) { addRegisterClass(MVT::f128, &PPC::VRRCRegClass); setOperationAction(ISD::FADD, MVT::f128, Legal); setOperationAction(ISD::FSUB, MVT::f128, Legal); setOperationAction(ISD::FDIV, MVT::f128, Legal); setOperationAction(ISD::FMUL, MVT::f128, Legal); setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal); // No extending loads to f128 on PPC. for (MVT FPT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand); setOperationAction(ISD::FMA, MVT::f128, Legal); setCondCodeAction(ISD::SETULT, MVT::f128, Expand); setCondCodeAction(ISD::SETUGT, MVT::f128, Expand); setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand); setCondCodeAction(ISD::SETOGE, MVT::f128, Expand); setCondCodeAction(ISD::SETOLE, MVT::f128, Expand); setCondCodeAction(ISD::SETONE, MVT::f128, Expand); setOperationAction(ISD::FTRUNC, MVT::f128, Legal); setOperationAction(ISD::FRINT, MVT::f128, Legal); setOperationAction(ISD::FFLOOR, MVT::f128, Legal); setOperationAction(ISD::FCEIL, MVT::f128, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal); setOperationAction(ISD::FROUND, MVT::f128, Legal); setOperationAction(ISD::SELECT, MVT::f128, Expand); setOperationAction(ISD::FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::FP_ROUND, MVT::f32, Legal); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i128, Custom); // No implementation for these ops for PowerPC. 
setOperationAction(ISD::FSIN , MVT::f128, Expand); setOperationAction(ISD::FCOS , MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FPOWI, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); } } if (Subtarget.hasP9Altivec()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); } } if (Subtarget.hasQPX()) { setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FREM, MVT::v4f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); setOperationAction(ISD::LOAD , MVT::v4f64, Custom); setOperationAction(ISD::STORE , MVT::v4f64, Custom); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4f64, Expand); setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::FNEG , MVT::v4f64, Legal); setOperationAction(ISD::FABS , MVT::v4f64, Legal); setOperationAction(ISD::FSIN , MVT::v4f64, Expand); setOperationAction(ISD::FCOS , MVT::v4f64, Expand); setOperationAction(ISD::FPOW , MVT::v4f64, Expand); setOperationAction(ISD::FLOG , MVT::v4f64, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); setOperationAction(ISD::FEXP , MVT::v4f64, Expand); setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FREM, MVT::v4f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); setOperationAction(ISD::LOAD , MVT::v4f32, Custom); setOperationAction(ISD::STORE , MVT::v4f32, Custom); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4f32, Expand); setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, 
Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); setOperationAction(ISD::FNEG , MVT::v4f32, Legal); setOperationAction(ISD::FABS , MVT::v4f32, Legal); setOperationAction(ISD::FSIN , MVT::v4f32, Expand); setOperationAction(ISD::FCOS , MVT::v4f32, Expand); setOperationAction(ISD::FPOW , MVT::v4f32, Expand); setOperationAction(ISD::FLOG , MVT::v4f32, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); setOperationAction(ISD::FEXP , MVT::v4f32, Expand); setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); setOperationAction(ISD::AND , MVT::v4i1, Legal); setOperationAction(ISD::OR , MVT::v4i1, Legal); setOperationAction(ISD::XOR , MVT::v4i1, Legal); if (!Subtarget.useCRBits()) setOperationAction(ISD::SELECT, MVT::v4i1, Expand); setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); setOperationAction(ISD::LOAD , MVT::v4i1, Custom); setOperationAction(ISD::STORE , MVT::v4i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FROUND, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); // These need to set FE_INEXACT, and so cannot be vectorized here. setOperationAction(ISD::FRINT, MVT::v4f64, Expand); setOperationAction(ISD::FRINT, MVT::v4f32, Expand); if (TM.Options.UnsafeFPMath) { setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } else { setOperationAction(ISD::FDIV, MVT::v4f64, Expand); setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); setOperationAction(ISD::FDIV, MVT::v4f32, Expand); setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); } } if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); if (!isPPC64) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); } setBooleanContents(ZeroOrOneBooleanContent); if (Subtarget.hasAltivec()) { // Altivec instructions set fields to all zeros or all ones. 
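// "All zeros or all ones" per field, as noted above, is what makes vector
// selects cheap: the compare result doubles as a bit mask. Per-lane scalar
// sketch of the compare/select pairing (illustrative only):
#include <cstdint>
static inline uint32_t laneCompareGT(int32_t A, int32_t B) {
  return A > B ? 0xFFFFFFFFu : 0u; // ZeroOrNegativeOneBooleanContent per lane
}
static inline uint32_t laneSelect(uint32_t Mask, uint32_t T, uint32_t F) {
  return (T & Mask) | (F & ~Mask); // bitwise select driven by the mask
}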
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); } if (!isPPC64) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::SELECT_CC); } // Use reciprocal estimates. if (TM.Options.UnsafeFPMath) { setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::FSQRT); } // Darwin long double math library functions have $LDBL128 appended. if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128"); setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128"); setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128"); setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128"); setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128"); setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128"); setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128"); } if (EnableQuadPrecision) { setLibcallName(RTLIB::LOG_F128, "logf128"); setLibcallName(RTLIB::LOG2_F128, "log2f128"); setLibcallName(RTLIB::LOG10_F128, "log10f128"); setLibcallName(RTLIB::EXP_F128, "expf128"); setLibcallName(RTLIB::EXP2_F128, "exp2f128"); setLibcallName(RTLIB::SIN_F128, "sinf128"); setLibcallName(RTLIB::COS_F128, "cosf128"); setLibcallName(RTLIB::POW_F128, "powf128"); setLibcallName(RTLIB::FMIN_F128, "fminf128"); setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); setLibcallName(RTLIB::POWI_F128, "__powikf2"); setLibcallName(RTLIB::REM_F128, "fmodf128"); } // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. if (Subtarget.useCRBits()) { setHasMultipleConditionRegisters(); setJumpIsExpensive(); } setMinFunctionAlignment(2); if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_A2: case PPC::DIR_E500: case PPC::DIR_E500mc: case PPC::DIR_E5500: case PPC::DIR_PWR4: case PPC::DIR_PWR5: case PPC::DIR_PWR5X: case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: setPrefFunctionAlignment(4); setPrefLoopAlignment(4); break; } if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); else setSchedulingPreference(Sched::Hybrid); computeRegisterProperties(STI.getRegisterInfo()); // The Freescale cores do better with aggressive inlining of memcpy and // friends. GCC uses same threshold of 128 bytes (= 32 word stores). 
if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || Subtarget.getDarwinDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; MaxStoresPerMemsetOptSize = 16; MaxStoresPerMemcpy = 32; MaxStoresPerMemcpyOptSize = 8; MaxStoresPerMemmove = 32; MaxStoresPerMemmoveOptSize = 8; } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) { // The A2 also benefits from (very) aggressive inlining of memcpy and // friends. The overhead of the function call, even when warm, can be // over one hundred cycles. MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; MaxLoadsPerMemcmp = 128; } else { MaxLoadsPerMemcmp = 8; MaxLoadsPerMemcmpOptSize = 4; } } /// getMaxByValAlign - Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, unsigned MaxMaxAlign) { if (MaxAlign == MaxMaxAlign) return; if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256) MaxAlign = 32; else if (VTy->getBitWidth() >= 128 && MaxAlign < 16) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == MaxMaxAlign) break; } } } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { // Darwin passes everything on 4 byte boundary. if (Subtarget.isDarwin()) return 4; // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. unsigned Align = Subtarget.isPPC64() ? 8 : 4; if (Subtarget.hasAltivec() || Subtarget.hasQPX()) getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ?
32 : 16); return Align; } unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv:: ID CC, EVT VT) const { if (Subtarget.hasSPE() && VT == MVT::f64) return 2; return PPCTargetLowering::getNumRegisters(Context, VT); } MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv:: ID CC, EVT VT) const { if (Subtarget.hasSPE() && VT == MVT::f64) return MVT::i32; return PPCTargetLowering::getRegisterType(Context, VT); } bool PPCTargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } bool PPCTargetLowering::hasSPE() const { return Subtarget.hasSPE(); } const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; case PPCISD::FSEL: return "PPCISD::FSEL"; case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; case PPCISD::FP_TO_UINT_IN_VSR: return "PPCISD::FP_TO_UINT_IN_VSR,"; case PPCISD::FP_TO_SINT_IN_VSR: return "PPCISD::FP_TO_SINT_IN_VSR"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; case PPCISD::SHL: return "PPCISD::SHL"; case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; case PPCISD::CALL: return "PPCISD::CALL"; case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; case PPCISD::BCTRL: return "PPCISD::BCTRL"; case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; case PPCISD::MFVSR: return "PPCISD::MFVSR"; case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; case PPCISD::VCMP: return "PPCISD::VCMP"; case PPCISD::VCMPo: return "PPCISD::VCMPo"; case PPCISD::LBRX: return "PPCISD::LBRX"; case 
PPCISD::STBRX: return "PPCISD::STBRX"; case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; case PPCISD::STXSIX: return "PPCISD::STXSIX"; case PPCISD::VEXTS: return "PPCISD::VEXTS"; case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::ST_VSR_SCAL_INT: return "PPCISD::ST_VSR_SCAL_INT"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; case PPCISD::BDNZ: return "PPCISD::BDNZ"; case PPCISD::BDZ: return "PPCISD::BDZ"; case PPCISD::MFFS: return "PPCISD::MFFS"; case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; case PPCISD::CR6SET: return "PPCISD::CR6SET"; case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; case PPCISD::SC: return "PPCISD::SC"; case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; } return nullptr; } EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; if (Subtarget.hasQPX()) return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); return VT.changeVectorElementTypeToInteger(); } bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const { assert(VT.isFloatingPoint() && "Non-floating-point FMA?"); return true; } //===----------------------------------------------------------------------===// // Node matching predicates, for use by the tblgen matching code. //===----------------------------------------------------------------------===// /// isFloatingPointZero - Return true if this is 0.0 or -0.0. static bool isFloatingPointZero(SDValue Op) { if (ConstantFPSDNode *CFP = dyn_cast(Op)) return CFP->getValueAPF().isZero(); else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { // Maybe this has already been legalized into the constant pool? 
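    // (i.e. an FP zero that was already lowered to a load from the constant
    // pool; operand 1 of that load is then a ConstantPoolSDNode whose
    // ConstantFP initializer can still be inspected for +0.0/-0.0 below.)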
if (ConstantPoolSDNode *CP = dyn_cast(Op.getOperand(1))) if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isZero(); } return false; } /// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return /// true if Op is undef or if it matches the specified value. static bool isConstantOrUndef(int Op, int Val) { return Op < 0 || Op == Val; } /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 1; for (unsigned i = 0; i != 8; ++i) if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) return false; } return true; } /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 2; for (unsigned i = 0; i != 8; i += 2) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) return false; } return true; } /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the /// current subtarget. /// /// The ShuffleKind distinguishes between big-endian operations with /// two different inputs (0), either-endian operations with two identical /// inputs (1), and little-endian operations with two different inputs (2). /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 
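/// For example, with two different big-endian inputs (ShuffleKind 0) the mask
/// must select bytes {4-7, 12-15, 20-23, 28-31}, i.e. the low-order word of
/// each doubleword of both inputs.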
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { const PPCSubtarget& Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasP8Vector()) return false; bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; for (unsigned i = 0; i != 16; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) return false; } else if (ShuffleKind == 2) { if (!IsLE) return false; for (unsigned i = 0; i != 16; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) return false; } else if (ShuffleKind == 1) { unsigned j = IsLE ? 0 : 4; for (unsigned i = 0; i != 8; i += 4) if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) return false; } return true; } /// isVMerge - Common function, used to match vmrg* shuffles. /// static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned LHSStart, unsigned RHSStart) { if (N->getValueType(0) != MVT::v16i8) return false; assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && "Unsupported merge size!"); for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j), LHSStart+j+i*UnitSize) || !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j), RHSStart+j+i*UnitSize)) return false; } return true; } /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). /// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 0, 16); else return false; } else { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 8, 24); else return false; } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). /// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). 
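/// For example, a big-endian vmrghw of two different inputs (ShuffleKind 0)
/// interleaves the high-order words of the two vectors, so with a 4-byte unit
/// the byte mask is {0-3, 16-19, 4-7, 20-23}.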
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 2) // swapped return isVMerge(N, UnitSize, 8, 24); else return false; } else { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 0) // normal return isVMerge(N, UnitSize, 0, 16); else return false; } } /** * Common function used to match vmrgew and vmrgow shuffles * * The indexOffset determines whether to look for even or odd words in * the shuffle mask. This is based on the of the endianness of the target * machine. * - Little Endian: * - Use offset of 0 to check for odd elements * - Use offset of 4 to check for even elements * - Big Endian: * - Use offset of 0 to check for even elements * - Use offset of 4 to check for odd elements * A detailed description of the vector element ordering for little endian and * big endian can be found at * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html * Targeting your applications - what little endian and big endian IBM XL C/C++ * compiler differences mean to you * * The mask to the shuffle vector instruction specifies the indices of the * elements from the two input vectors to place in the result. The elements are * numbered in array-access order, starting with the first vector. These vectors * are always of type v16i8, thus each vector will contain 16 elements of size * 8. More info on the shuffle vector can be found in the * http://llvm.org/docs/LangRef.html#shufflevector-instruction * Language Reference. * * The RHSStartValue indicates whether the same input vectors are used (unary) * or two different input vectors are used, based on the following: * - If the instruction uses the same vector for both inputs, the range of the * indices will be 0 to 15. In this case, the RHSStart value passed should * be 0. * - If the instruction has two different vectors then the range of the * indices will be 0 to 31. In this case, the RHSStart value passed should * be 16 (indices 0-15 specify elements in the first vector while indices 16 * to 31 specify elements in the second vector). * * \param[in] N The shuffle vector SD Node to analyze * \param[in] IndexOffset Specifies whether to look for even or odd elements * \param[in] RHSStartValue Specifies the starting index for the righthand input * vector to the shuffle_vector instruction * \return true iff this shuffle vector represents an even or odd word merge */ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, unsigned RHSStartValue) { if (N->getValueType(0) != MVT::v16i8) return false; for (unsigned i = 0; i < 2; ++i) for (unsigned j = 0; j < 4; ++j) if (!isConstantOrUndef(N->getMaskElt(i*4+j), i*RHSStartValue+j+IndexOffset) || !isConstantOrUndef(N->getMaskElt(i*4+j+8), i*RHSStartValue+j+IndexOffset+8)) return false; return true; } /** * Determine if the specified shuffle mask is suitable for the vmrgew or * vmrgow instructions. * * \param[in] N The shuffle vector SD Node to analyze * \param[in] CheckEven Check for an even merge (true) or an odd merge (false) * \param[in] ShuffleKind Identify the type of merge: * - 0 = big-endian merge with two different inputs; * - 1 = either-endian merge with two identical inputs; * - 2 = little-endian merge with two different inputs (inputs are swapped for * little-endian merges). 
* \param[in] DAG The current SelectionDAG * \return true iff this shuffle mask */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { if (DAG.getDataLayout().isLittleEndian()) { unsigned indexOffset = CheckEven ? 4 : 0; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); else if (ShuffleKind == 2) // swapped return isVMerge(N, indexOffset, 16); else return false; } else { unsigned indexOffset = CheckEven ? 0 : 4; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); else if (ShuffleKind == 0) // Normal return isVMerge(N, indexOffset, 16); else return false; } return false; } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. /// The ShuffleKind distinguishes between big-endian operations with two /// different inputs (0), either-endian operations with two identical inputs /// (1), and little-endian operations with two different inputs (2). For the /// latter, the input operands are swapped (see PPCInstrAltivec.td). int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; ShuffleVectorSDNode *SVOp = cast(N); // Find the first non-undef value in the shuffle mask. unsigned i; for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) /*search*/; if (i == 16) return -1; // all undef. // Otherwise, check to see if the rest of the elements are consecutively // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; ShiftAmt -= i; bool isLE = DAG.getDataLayout().isLittleEndian(); if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; } else if (ShuffleKind == 1) { // Check the rest of the elements to see if they are consecutive. for (++i; i != 16; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) return -1; } else return -1; if (isLE) ShiftAmt = 16 - ShiftAmt; return ShiftAmt; } /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to /// VSPLTB/VSPLTH/VSPLTW. bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { assert(N->getValueType(0) == MVT::v16i8 && (EltSize == 1 || EltSize == 2 || EltSize == 4)); // The consecutive indices need to specify an element, not part of two // different elements. So abandon ship early if this isn't the case. if (N->getMaskElt(0) % EltSize != 0) return false; // This is a splat operation if each element of the permute is the same, and // if the value doesn't reference the second vector. unsigned ElementBase = N->getMaskElt(0); // FIXME: Handle UNDEF elements too! if (ElementBase >= 16) return false; // Check that the indices are consecutive, in the case of a multi-byte element // splatted with a v16i8 mask. for (unsigned i = 1; i != EltSize; ++i) if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) return false; for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { if (N->getMaskElt(i) < 0) continue; for (unsigned j = 0; j != EltSize; ++j) if (N->getMaskElt(i+j) != N->getMaskElt(j)) return false; } return true; } /// Check that the mask is shuffling N byte elements. 
Within each N byte /// element of the mask, the indices could be either in increasing or /// decreasing order as long as they are consecutive. /// \param[in] N the shuffle vector SD Node to analyze /// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ /// Word/DoubleWord/QuadWord). /// \param[in] StepLen the delta indices number among the N byte element, if /// the mask is in increasing/decreasing order then it is 1/-1. /// \return true iff the mask is shuffling N byte elements. static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, int StepLen) { assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && "Unexpected element width."); assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); unsigned NumOfElem = 16 / Width; unsigned MaskVal[16]; // Width is never greater than 16 for (unsigned i = 0; i < NumOfElem; ++i) { MaskVal[0] = N->getMaskElt(i * Width); if ((StepLen == 1) && (MaskVal[0] % Width)) { return false; } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { return false; } for (unsigned int j = 1; j < Width; ++j) { MaskVal[j] = N->getMaskElt(i * Width + j); if (MaskVal[j] != MaskVal[j-1] + StepLen) { return false; } } } return true; } bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE) { if (!isNByteElemShuffleMask(N, 4, 1)) return false; // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; unsigned M2 = N->getMaskElt(8) / 4; unsigned M3 = N->getMaskElt(12) / 4; unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; // Below, let H and L be arbitrary elements of the shuffle mask // where H is in the range [4,7] and L is in the range [0,3]. // H, 1, 2, 3 or L, 5, 6, 7 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; InsertAtByte = IsLE ? 12 : 0; Swap = M0 < 4; return true; } // 0, H, 2, 3 or 4, L, 6, 7 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; InsertAtByte = IsLE ? 8 : 4; Swap = M1 < 4; return true; } // 0, 1, H, 3 or 4, 5, L, 7 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; InsertAtByte = IsLE ? 4 : 8; Swap = M2 < 4; return true; } // 0, 1, 2, H or 4, 5, 6, L if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; InsertAtByte = IsLE ? 0 : 12; Swap = M3 < 4; return true; } // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { ShiftElts = 0; Swap = true; unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { InsertAtByte = IsLE ? 12 : 0; return true; } if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { InsertAtByte = IsLE ? 8 : 4; return true; } if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { InsertAtByte = IsLE ? 4 : 8; return true; } if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { InsertAtByte = IsLE ? 
0 : 12; return true; } } return false; } bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the word is consecutive. if (!isNByteElemShuffleMask(N, 4, 1)) return false; // Now we look at mask elements 0,4,8,12, which are the beginning of words. unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; unsigned M2 = N->getMaskElt(8) / 4; unsigned M3 = N->getMaskElt(12) / 4; // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { assert(M0 < 4 && "Indexing into an undef vector?"); if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) return false; ShiftElts = IsLE ? (4 - M0) % 4 : M0; Swap = false; return true; } // Ensure each word index of the ShuffleVector Mask is consecutive. if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) return false; if (IsLE) { if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { // Input vectors don't need to be swapped if the leading element // of the result is one of the 3 left elements of the second vector // (or if there is no shift to be done at all). Swap = false; ShiftElts = (8 - M0) % 8; } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { // Input vectors need to be swapped if the leading element // of the result is one of the 3 left elements of the first vector // (or if we're shifting by 4 - thereby simply swapping the vectors). Swap = true; ShiftElts = (4 - M0) % 4; } return true; } else { // BE if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { // Input vectors don't need to be swapped if the leading element // of the result is one of the 4 elements of the first vector. Swap = false; ShiftElts = M0; } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { // Input vectors need to be swapped if the leading element // of the result is one of the 4 elements of the right vector. Swap = true; ShiftElts = M0 - 4; } return true; } } bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); if (!isNByteElemShuffleMask(N, Width, -1)) return false; for (int i = 0; i < 16; i += Width) if (N->getMaskElt(i) != i + Width - 1) return false; return true; } bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 2); } bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 4); } bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 8); } bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { return isXXBRShuffleMaskHelper(N, 16); } /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap /// if the inputs to the instruction should be swapped and set \p DM to the /// value for the immediate. /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI /// AND element 0 of the result comes from the first input (LE) or second input /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. /// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle /// mask. bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the double word is consecutive. 
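  // (Concretely, mask bytes 0-7 and 8-15 must each form one contiguous
  // doubleword. The code below reduces the mask to the two doubleword indices
  // M0 and M1 (0-3 across both inputs) and folds them into the 2-bit xxpermdi
  // DM immediate, swapping the inputs where required.)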
if (!isNByteElemShuffleMask(N, 8, 1)) return false; unsigned M0 = N->getMaskElt(0) / 8; unsigned M1 = N->getMaskElt(8) / 8; assert(((M0 | M1) < 4) && "A mask element out of bounds?"); // If both vector operands for the shuffle are the same vector, the mask will // contain only elements from the first one and the second one will be undef. if (N->getOperand(1).isUndef()) { if ((M0 | M1) < 2) { DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); Swap = false; return true; } else return false; } if (IsLE) { if (M0 > 1 && M1 < 2) { Swap = false; } else if (M0 < 2 && M1 > 1) { M0 = (M0 + 2) % 4; M1 = (M1 + 2) % 4; Swap = true; } else return false; // Note: if control flow comes here that means Swap is already set above DM = (((~M1) & 1) << 1) + ((~M0) & 1); return true; } else { // BE if (M0 < 2 && M1 > 1) { Swap = false; } else if (M0 > 1 && M1 < 2) { M0 = (M0 + 2) % 4; M1 = (M1 + 2) % 4; Swap = true; } else return false; // Note: if control flow comes here that means Swap is already set above DM = (M0 << 1) + (M1 & 1); return true; } } /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; } /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed /// by using a vspltis[bhw] instruction of the specified element size, return /// the constant being splatted. The ByteSize field indicates the number of /// bytes of each element [124] -> [bhw]. SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { SDValue OpVal(nullptr, 0); // If ByteSize of the splat is bigger than the element size of the // build_vector, then we have a case where we are checking for a splat where // multiple elements of the buildvector are folded together into a single // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). unsigned EltSize = 16/N->getNumOperands(); if (EltSize < ByteSize) { unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. SDValue UniquedVals[4]; assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); // See if all of the elements in the buildvector agree across. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).isUndef()) continue; // If the element isn't a constant, bail fully out. if (!isa(N->getOperand(i))) return SDValue(); if (!UniquedVals[i&(Multiple-1)].getNode()) UniquedVals[i&(Multiple-1)] = N->getOperand(i); else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) return SDValue(); // no match. } // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains // either constant or undef values that are identical for each chunk. See // if these chunks can form into a larger vspltis*. // Check to see if all of the leading entries are either 0 or -1. If // neither, then this won't fit into the immediate field. bool LeadingZero = true; bool LeadingOnes = true; for (unsigned i = 0; i != Multiple-1; ++i) { if (!UniquedVals[i].getNode()) continue; // Must have been undefs. LeadingZero &= isNullConstant(UniquedVals[i]); LeadingOnes &= isAllOnesConstant(UniquedVals[i]); } // Finally, check the least significant entry. 
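    // (Only the least significant chunk carries the payload: a vspltis[bhw]
    // immediate is sign-extended across the wider element, so the leading
    // chunks can only be all zeros or all ones, as verified above.)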
if (LeadingZero) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef int Val = cast(UniquedVals[Multiple-1])->getZExtValue(); if (Val < 16) // 0,0,0,4 -> vspltisw(4) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } if (LeadingOnes) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef int Val =cast(UniquedVals[Multiple-1])->getSExtValue(); if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } return SDValue(); } // Check to see if this buildvec has a single non-undef value in its elements. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).isUndef()) continue; if (!OpVal.getNode()) OpVal = N->getOperand(i); else if (OpVal != N->getOperand(i)) return SDValue(); } if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. unsigned ValSizeInBytes = EltSize; uint64_t Value = 0; if (ConstantSDNode *CN = dyn_cast(OpVal)) { Value = CN->getZExtValue(); } else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) { assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); Value = FloatToBits(CN->getValueAPF().convertToFloat()); } // If the splat value is larger than the element value, then we can never do // this splat. The only case that we could fit the replicated bits into our // immediate field for would be zero, and we prefer to use vxor for it. if (ValSizeInBytes < ByteSize) return SDValue(); // If the element value is larger than the splat value, check if it consists // of a repeated bit pattern of size ByteSize. if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) return SDValue(); // Properly sign extend the value. int MaskVal = SignExtend32(Value, ByteSize * 8); // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. if (MaskVal == 0) return SDValue(); // Finally, if this value fits in a 5 bit sext field, return it if (SignExtend32<5>(MaskVal) == MaskVal) return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); return SDValue(); } /// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift /// amount, otherwise return -1. int PPC::isQVALIGNIShuffleMask(SDNode *N) { EVT VT = N->getValueType(0); if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) return -1; ShuffleVectorSDNode *SVOp = cast(N); // Find the first non-undef value in the shuffle mask. unsigned i; for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) /*search*/; if (i == 4) return -1; // all undef. // Otherwise, check to see if the rest of the elements are consecutively // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; ShiftAmt -= i; // Check the rest of the elements to see if they are consecutive. for (++i; i != 4; ++i) if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) return -1; return ShiftAmt; } //===----------------------------------------------------------------------===// // Addressing Mode Selection //===----------------------------------------------------------------------===// /// isIntS16Immediate - This method tests to see if the node is either a 32-bit /// or 64-bit immediate, and if the value can be accurately represented as a /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. 
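/// For example, an i64 constant of -32768 (0xFFFFFFFFFFFF8000) is accepted
/// with Imm = -32768, while 32768 (0x8000) is rejected because it does not
/// survive the round trip through int16_t.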
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { if (!isa(N)) return false; Imm = (int16_t)cast(N)->getZExtValue(); if (N->getValueType(0) == MVT::i32) return Imm == (int32_t)cast(N)->getZExtValue(); else return Imm == (int64_t)cast(N)->getZExtValue(); } bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented with [r+imm]. bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i if (N.getOperand(1).getOpcode() == PPCISD::Lo) return false; // r+i Base = N.getOperand(0); Index = N.getOperand(1); return true; } else if (N.getOpcode() == ISD::OR) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i can fold it if we can. // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably // disjoint. KnownBits LHSKnown, RHSKnown; DAG.computeKnownBits(N.getOperand(0), LHSKnown); if (LHSKnown.Zero.getBoolValue()) { DAG.computeKnownBits(N.getOperand(1), RHSKnown); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { Base = N.getOperand(0); Index = N.getOperand(1); return true; } } } return false; } // If we happen to be doing an i64 load or store into a stack slot that has // less than a 4-byte alignment, then the frame-index elimination may need to // use an indexed load or store instruction (because the offset may not be a // multiple of 4). The extra register needed to hold the offset comes from the // register scavenger, and it is possible that the scavenger will need to use // an emergency spill slot. As a result, we need to make sure that a spill slot // is allocated when doing an i64 load/store into a less-than-4-byte-aligned // stack slot. static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { // FIXME: This does not handle the LWA case. if (VT != MVT::i64) return; // NOTE: We'll exclude negative FIs here, which come from argument // lowering, because there are no known test cases triggering this problem // using packed structures (or similar). We can remove this exclusion if // we find such a test case. The reason why this is so test-case driven is // because this entire 'fixup' is only to prevent crashes (from the // register scavenger) on not-really-valid inputs. For example, if we have: // %a = alloca i1 // %b = bitcast i1* %a to i64* // store i64* a, i64 b // then the store should really be marked as 'align 1', but is not. If it // were marked as 'align 1' then the indexed form would have been // instruction-selected initially, and the problem this 'fixup' is preventing // won't happen regardless. if (FrameIdx < 0) return; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FrameIdx); if (Align >= 4) return; PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setHasNonRISpills(); } /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better /// represented as reg+reg. 
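/// (A match here lets the access use a D-form encoding such as lwz with a
/// 16-bit signed displacement; otherwise the indexed X-form chosen by
/// SelectAddressRegReg is used.)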
If \p Alignment is non-zero, only accept /// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. if (SelectAddressRegReg(N, Disp, Base, DAG)) return false; if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). assert(!cast(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. assert(Disp.getOpcode() == ISD::TargetGlobalAddress || Disp.getOpcode() == ISD::TargetGlobalTLSAddress || Disp.getOpcode() == ISD::TargetConstantPool || Disp.getOpcode() == ISD::TargetJumpTable); Base = N.getOperand(0); return true; // [&g+r] } } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. KnownBits LHSKnown; DAG.computeKnownBits(N.getOperand(0), LHSKnown); if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (FrameIndexSDNode *FI = dyn_cast(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else { Base = N.getOperand(0); } Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); return true; } } } else if (ConstantSDNode *CN = dyn_cast(N)) { // Loading from a constant address. // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; } // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, MVT::i32); unsigned Opc = CN->getValueType(0) == MVT::i32 ? 
PPC::LIS : PPC::LIS8; Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); return true; } } Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); if (FrameIndexSDNode *FI = dyn_cast(N)) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); } else Base = N; return true; // [r+0] } /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { // Check to see if we can easily represent this as an [r+r] address. This // will fail if it thinks that the address is more profitably represented as // reg+imm, e.g. where imm = 0. if (SelectAddressRegReg(N, Base, Index, DAG)) return true; // If the address is the result of an add, we will utilize the fact that the // address calculation includes an implicit add. However, we can reduce // register pressure if we do not materialize a constant just for use as the // index register. We only get rid of the add if it is not an add of a // value and a 16-bit signed constant and both have a single use. int16_t imm = 0; if (N.getOpcode() == ISD::ADD && (!isIntS16Immediate(N.getOperand(1), imm) || !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { Base = N.getOperand(0); Index = N.getOperand(1); return true; } // Otherwise, do it the hard way, using R0 as the base register. Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true; } /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { if (DisablePPCPreinc) return false; bool isLoad = true; SDValue Ptr; EVT VT; unsigned Alignment; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); Alignment = LD->getAlignment(); } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); Alignment = ST->getAlignment(); isLoad = false; } else return false; // PowerPC doesn't have preinc load/store instructions for vectors (except // for QPX, which does have preinc r+r forms). if (VT.isVector()) { if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { return false; } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { AM = ISD::PRE_INC; return true; } } if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer // is a frame index, or if N is a store and the base pointer is either // the same as or a predecessor of the value being stored. Check for // those situations here, and try with swapped Base/Offset instead. bool Swap = false; if (isa(Base) || isa(Base)) Swap = true; else if (!isLoad) { SDValue Val = cast(N)->getValue(); if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) Swap = true; } if (Swap) std::swap(Base, Offset); AM = ISD::PRE_INC; return true; } // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. 
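    // (The 64-bit update forms are DS-form instructions: the displacement
    // field has its low two bits implied as zero, so only multiples of 4 can
    // be encoded, hence the Alignment argument of 4 passed below.)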
if (Alignment < 4) return false; if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } if (LoadSDNode *LD = dyn_cast(N)) { // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of // sext i32 to i64 when addr mode is r+i. if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && LD->getExtensionType() == ISD::SEXTLOAD && isa(Offset)) return false; } AM = ISD::PRE_INC; return true; } //===----------------------------------------------------------------------===// // LowerOperation implementation //===----------------------------------------------------------------------===// /// Return true if we should reference labels using a PICBase, set the HiOpFlags /// and LoOpFlags to the target MO flags. static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget, unsigned &HiOpFlags, unsigned &LoOpFlags, const GlobalValue *GV = nullptr) { HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; // Don't use the pic base if not in PIC relocation model. if (IsPIC) { HiOpFlags |= PPCII::MO_PIC_FLAG; LoOpFlags |= PPCII::MO_PIC_FLAG; } // If this is a reference to a global value that requires a non-lazy-ptr, make // sure that instruction lowering adds it. if (GV && Subtarget.hasLazyResolverStub(GV)) { HiOpFlags |= PPCII::MO_NLP_FLAG; LoOpFlags |= PPCII::MO_NLP_FLAG; if (GV->hasHiddenVisibility()) { HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; } } } static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, SelectionDAG &DAG) { SDLoc DL(HiPart); EVT PtrVT = HiPart.getValueType(); SDValue Zero = DAG.getConstant(0, DL, PtrVT); SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); // With PIC, the first instruction is actually "GR+hi(&G)". if (isPIC) Hi = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); // Generate non-pic code that has direct accesses to the constant pool. // The address of the global is just (hi(&g)+lo(&g)). return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); } static void setUsesTOCBasePtr(MachineFunction &MF) { PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setUsesTOCBasePtr(); } static void setUsesTOCBasePtr(SelectionDAG &DAG) { setUsesTOCBasePtr(DAG.getMachineFunction()); } static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, SDValue GA) { EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, MachineMemOperand::MOLoad); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); ConstantPoolSDNode *CP = cast(Op); const Constant *C = CP->getConstVal(); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
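  // (The PPCISD::TOC_ENTRY node built by getTOCEntry above loads the address
  // through the TOC pointer in X2; under the medium code model this typically
  // becomes an addis/ld pair using @toc@ha and @toc@l relocations.)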
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return getTOCEntry(DAG, SDLoc(CP), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, SDLoc(CP), false, GA); } SDValue CPIHi = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); SDValue CPILo = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG); } // For 64-bit PowerPC, prefer the more compact relative encodings. // This trades 32 bits per jump table entry for one or two instructions // on the jump site. unsigned PPCTargetLowering::getJumpTableEncoding() const { if (isJumpTableRelative()) return MachineJumpTableInfo::EK_LabelDifference32; return TargetLowering::getJumpTableEncoding(); } bool PPCTargetLowering::isJumpTableRelative() const { if (Subtarget.isPPC64()) return true; return TargetLowering::isJumpTableRelative(); } SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget.isPPC64()) return TargetLowering::getPICJumpTableRelocBase(Table, DAG); switch (getTargetMachine().getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBase(Table, DAG); default: return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); } } const MCExpr * PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { if (!Subtarget.isPPC64()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); switch (getTargetMachine().getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); default: return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } } SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast(Op); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return getTOCEntry(DAG, SDLoc(JT), true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, SDLoc(GA), false, GA); } SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); BlockAddressSDNode *BASDN = cast(Op); const BlockAddress *BA = BASDN->getBlockAddress(); // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. 
if (Subtarget.isSVR4ABI() && isPositionIndependent()) { if (Subtarget.isPPC64()) setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG); } SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // FIXME: TLS addresses currently use medium model code sequences, // which is the most useful form. Eventually support for small and // large models could be added if users need it, at the cost of // additional complexity. GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); if (Model == TLSModel::LocalExec) { SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) : DAG.getRegister(PPC::R2, MVT::i32); SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } if (Model == TLSModel::InitialExec) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLS); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); } else GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr); return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); } if (Model == TLSModel::GeneralDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, GOTReg, TGA); } else { if (picLevel == PICLevel::SmallPIC) GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, GOTPtr, TGA, TGA); } if (Model == TLSModel::LocalDynamic) { SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { setUsesTOCBasePtr(DAG); SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, GOTReg, TGA); } else { if (picLevel == PICLevel::SmallPIC) GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); else GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); } SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, PtrVT, GOTPtr, TGA, TGA); SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, TLSAddr, TGA); return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); } llvm_unreachable("Unknown TLS model!"); } SDValue 
PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); GlobalAddressSDNode *GSDN = cast(Op); SDLoc DL(GSDN); const GlobalValue *GV = GSDN->getGlobal(); // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return getTOCEntry(DAG, DL, true, GA); } unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV); if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), PPCII::MO_PIC_FLAG); return getTOCEntry(DAG, DL, false, GA); } SDValue GAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); SDValue GALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG); // If the global reference is actually to a non-lazy-pointer, we have to do an // extra load to get the address of the global. if (MOHiFlag & PPCII::MO_NLP_FLAG) Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); return Ptr; } SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); if (Op.getValueType() == MVT::v2i64) { // When the operands themselves are v2i64 values, we need to do something // special because VSX has no underlying comparison operations for these. if (Op.getOperand(0).getValueType() == MVT::v2i64) { // Equality can be handled by casting to the legal type for Altivec // comparisons, everything else needs to be expanded. if (CC == ISD::SETEQ || CC == ISD::SETNE) { return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getSetCC(dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), CC)); } return SDValue(); } // We handle most of these in the usual way. return Op; } // If we're comparing for equality to zero, expose the fact that this is // implemented as a ctlz/srl pair on ppc, so that the dag combiner can // fold the new nodes. if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG)) return V; if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { // Leave comparisons against 0 and -1 alone for now, since they're usually // optimized. FIXME: revisit this when we can custom lower all setcc // optimizations. if (C->isAllOnesValue() || C->isNullValue()) return SDValue(); } // If we have an integer seteq/setne, turn it into a compare against zero // by xor'ing the rhs with the lhs, which is faster than setting a // condition register, reading it back out, and masking the correct bit. The // normal approach here uses sub to do this instead of xor. Using xor exposes // the result to other bit-twiddling opportunities. 
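  // (For example, (seteq i32 %x, %y) becomes (seteq (xor %x, %y), 0), and the
  // comparison against zero can then be folded by the cntlzw/srwi lowering
  // mentioned above or combined with other bit operations.)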
EVT LHSVT = Op.getOperand(0).getValueType(); if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { EVT VT = Op.getValueType(); SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), Op.getOperand(1)); return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); } return SDValue(); } SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); EVT VT = Node->getValueType(0); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); const Value *SV = cast(Node->getOperand(2))->getValue(); SDLoc dl(Node); assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); // gpr_index SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, VAListPtr, MachinePointerInfo(SV), MVT::i8); InChain = GprIndex.getValue(1); if (VT == MVT::i64) { // Check if GprIndex is even SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, DAG.getConstant(1, dl, MVT::i32)); SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, DAG.getConstant(1, dl, MVT::i32)); // Align GprIndex to be even if it isn't GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, GprIndex); } // fpr index is 1 byte after gpr SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(1, dl, MVT::i32)); // fpr SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, FprPtr, MachinePointerInfo(SV), MVT::i8); InChain = FprIndex.getValue(1); SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(8, dl, MVT::i32)); SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, DAG.getConstant(4, dl, MVT::i32)); // areas SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo()); InChain = OverflowArea.getValue(1); SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo()); InChain = RegSaveArea.getValue(1); // select overflow_area if index > 8 SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); // adjustment constant gpr_index * 4/8 SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(VT.isInteger() ? 4 : 8, dl, MVT::i32)); // OurReg = RegSaveArea + RegConstant SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, RegConstant); // Floating types are 32 bytes into RegSaveArea if (VT.isFloatingPoint()) OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, DAG.getConstant(32, dl, MVT::i32)); // increase {f,g}pr_index by 1 (or 2 if VT is i64) SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, MVT::i32)); InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, VT.isInteger() ? VAListPtr : FprPtr, MachinePointerInfo(SV), MVT::i8); // determine if we should load from reg_save_area or overflow_area SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); // increase overflow_area by 4/8 if gpr/fpr > 8 SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, DAG.getConstant(VT.isInteger() ? 
4 : 8, dl, MVT::i32)); OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea, OverflowAreaPlusN); InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr, MachinePointerInfo(), MVT::i32); return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); // We have to copy the entire va_list struct: // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true, false, MachinePointerInfo(), MachinePointerInfo()); } SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { return Op.getOperand(0); } SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Ty = IntPtrTy; Entry.Node = Trmp; Args.push_back(Entry); // TrampSize == (isPPC64 ? 48 : 40); Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, isPPC64 ? MVT::i64 : MVT::i32); Args.push_back(Entry); Entry.Node = FPtr; Args.push_back(Entry); Entry.Node = Nest; Args.push_back(Entry); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( CallingConv::C, Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.second; } SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); PPCFunctionInfo *FuncInfo = MF.getInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); SDLoc dl(Op); if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. // We suppose the given va_list is already allocated. // // typedef struct { // char gpr; /* index into the array of 8 GPRs // * stored in the register save area // * gpr=0 corresponds to r3, // * gpr=1 to r4, etc. // */ // char fpr; /* index into the array of 8 FPRs // * stored in the register save area // * fpr=0 corresponds to f1, // * fpr=1 to f2, etc. 
// */ // char *overflow_arg_area; // /* location on stack that holds // * the next overflow argument // */ // char *reg_save_area; // /* where r3:r10 and f1:f8 (if saved) // * are stored // */ // } va_list[1]; SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), PtrVT); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); uint64_t FrameOffset = PtrVT.getSizeInBits()/8; SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT); uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT); uint64_t FPROffset = 1; SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); // Store first byte : number of int regs SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1), MachinePointerInfo(SV), MVT::i8); uint64_t nextOffset = FPROffset; SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), ConstFPROffset); // Store second byte : number of float regs SDValue secondStore = DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, MachinePointerInfo(SV, nextOffset), MVT::i8); nextOffset += StackOffset; nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); // Store second word : arguments given on stack SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, MachinePointerInfo(SV, nextOffset)); nextOffset += FrameOffset; nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset); // Store third word : arguments given in registers return DAG.getStore(thirdStore, dl, FR, nextPtr, MachinePointerInfo(SV, nextOffset)); } #include "PPCGenCallingConv.inc" // Function whose sole purpose is to kill compiler warnings // stemming from unused functions included from PPCGenCallingConv.inc. CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; } bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { return true; } bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); // Skip one register if the first unallocated register has an even register // number and there are still argument registers available which have not been // allocated yet. RegNum is actually an index into ArgRegs, which means we // need to skip a register if RegNum is odd. if (RegNum != NumArgRegs && RegNum % 2 == 1) { State.AllocateReg(ArgRegs[RegNum]); } // Always return false here, as this function only makes sure that the first // unallocated register has an odd register number and does not actually // allocate a register for the current argument. 
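  // Worked example (illustrative): if r3 already holds an earlier argument,
  // RegNum is 1 and ArgRegs[1] (r4) is skipped above, so a following 64-bit
  // argument lands in the aligned pair r5:r6 instead of being split across
  // r4:r5.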
return false; } bool llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); int RegsLeft = NumArgRegs - RegNum; // Skip if there is not enough registers left for long double type (4 gpr regs // in soft float mode) and put long double argument on the stack. if (RegNum != NumArgRegs && RegsLeft < 4) { for (int i = 0; i < RegsLeft; i++) { State.AllocateReg(ArgRegs[RegNum + i]); } } return false; } bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; const unsigned NumArgRegs = array_lengthof(ArgRegs); unsigned RegNum = State.getFirstUnallocated(ArgRegs); // If there is only one Floating-point register left we need to put both f64 // values of a split ppc_fp128 value on the stack. if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { State.AllocateReg(ArgRegs[RegNum]); } // Always return false here, as this function only makes sure that the two f64 // values a ppc_fp128 value is split into are both passed in registers or both // passed on the stack and does not actually allocate a register for the // current argument. return false; } /// FPR - The set of FP registers that should be allocated for arguments, /// on Darwin. static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; /// QFPR - The set of QPX registers that should be allocated for arguments. static const MCPhysReg QFPR[] = { PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned ArgSize = ArgVT.getStoreSize(); if (Flags.isByVal()) ArgSize = Flags.getByValSize(); // Round up to multiples of the pointer size, except for array members, // which are always packed. if (!Flags.isInConsecutiveRegs()) ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; return ArgSize; } /// CalculateStackSlotAlignment - Calculates the alignment of this argument /// on the stack. static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize) { unsigned Align = PtrByteSize; // Altivec parameters are padded to a 16 byte boundary. if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Align = 16; // QPX vector types stored in double-precision are padded to a 32 byte // boundary. else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) Align = 32; // ByVal parameters are aligned as requested. 
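  // Worked example (illustrative): a byval aggregate carrying a requested
  // 16-byte alignment on a 64-bit target (PtrByteSize == 8) takes the
  // BVAlign > PtrByteSize branch below and ends up with Align == 16.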
if (Flags.isByVal()) { unsigned BVAlign = Flags.getByValAlign(); if (BVAlign > PtrByteSize) { if (BVAlign % PtrByteSize != 0) llvm_unreachable( "ByVal alignment is not a multiple of the pointer size"); Align = BVAlign; } } // Array members are always packed to their original alignment. if (Flags.isInConsecutiveRegs()) { // If the array member was split into multiple registers, the first // needs to be aligned to the size of the full type. (Except for // ppcf128, which is only aligned as its f64 components.) if (Flags.isSplit() && OrigVT != MVT::ppcf128) Align = OrigVT.getStoreSize(); else Align = ArgVT.getStoreSize(); } return Align; } /// CalculateStackSlotUsed - Return whether this argument will use its /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, unsigned PtrByteSize, unsigned LinkageSize, unsigned ParamAreaSize, unsigned &ArgOffset, unsigned &AvailableFPRs, unsigned &AvailableVRs, bool HasQPX) { bool UseMemory = false; // Respect alignment of argument on the stack. unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; // If there's no space left in the argument save area, we must // use memory (this check also catches zero-sized arguments). if (ArgOffset >= LinkageSize + ParamAreaSize) UseMemory = true; // Allocate argument on the stack. ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // If we overran the argument save area, we must use memory // (this check catches arguments passed partially in memory) if (ArgOffset > LinkageSize + ParamAreaSize) UseMemory = true; // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || // QPX registers overlap with the scalar FP registers. (HasQPX && (ArgVT == MVT::v4f32 || ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1))) if (AvailableFPRs > 0) { --AvailableFPRs; return false; } if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) if (AvailableVRs > 0) { --AvailableVRs; return false; } } return UseMemory; } /// EnsureStackAlignment - Round stack frame size up from NumBytes to /// ensure minimum alignment required for target. 
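// Worked example (illustrative): with a 16-byte target stack alignment the
// mask below is 15, so NumBytes == 52 is rounded up to (52 + 15) & ~15 == 64.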
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                     unsigned NumBytes) {
  unsigned TargetAlign = Lowering->getStackAlignment();
  unsigned AlignMask = TargetAlign - 1;
  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
  return NumBytes;
}

SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isSVR4ABI()) {
    if (Subtarget.isPPC64())
      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl,
                                         DAG, InVals);
    else
      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl,
                                         DAG, InVals);
  } else {
    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl,
                                       DAG, InVals);
  }
}

SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                    *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
  if (useSoftFloat() || hasSPE())
    CCInfo.PreAnalyzeFormalArguments(Ins);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
  CCInfo.clearWasPPCF128();

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
if (VA.isRegLoc()) { const TargetRegisterClass *RC; EVT ValVT = VA.getValVT(); switch (ValVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("ValVT not supported by formal arguments Lowering"); case MVT::i1: case MVT::i32: RC = &PPC::GPRCRegClass; break; case MVT::f32: if (Subtarget.hasP8Vector()) RC = &PPC::VSSRCRegClass; else if (Subtarget.hasSPE()) RC = &PPC::SPE4RCRegClass; else RC = &PPC::F4RCRegClass; break; case MVT::f64: if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else if (Subtarget.hasSPE()) RC = &PPC::SPERCRegClass; else RC = &PPC::F8RCRegClass; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: RC = &PPC::VRRCRegClass; break; case MVT::v4f32: RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VRRCRegClass; break; case MVT::v4f64: RC = &PPC::QFRCRegClass; break; case MVT::v4i1: RC = &PPC::QBRCRegClass; break; } // Transform the arguments stored in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT == MVT::i1 ? MVT::i32 : ValVT); if (ValVT == MVT::i1) ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); InVals.push_back(ArgValue); } else { // Argument stored in memory. assert(VA.isMemLoc()); unsigned ArgSize = VA.getLocVT().getStoreSize(); int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(), isImmutable); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back( DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); } } // Assign locations to all of the incoming aggregate by value arguments. // Aggregates passed by value are stored in the local variable space of the // caller's stack frame, right above the parameter list area. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); MinReservedArea = std::max(MinReservedArea, LinkageSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); SmallVector MemOps; // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { static const MCPhysReg GPArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); static const MCPhysReg FPArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; unsigned NumFPArgRegs = array_lengthof(FPArgRegs); if (useSoftFloat() || hasSPE()) NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); // Make room for NumGPArgRegs and NumFPArgRegs. 
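    // Worked example (illustrative): with all eight GPRs (r3-r10) and eight
    // FPRs (f1-f8) available on 32-bit SVR4, the register save area below is
    // Depth = 8*4 + 8*8 = 96 bytes (only 32 bytes when NumFPArgRegs is forced
    // to 0 for soft-float/SPE).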
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; FuncInfo->setVarArgsStackOffset( MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, CCInfo.getNextStackOffset(), true)); FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // The fixed integer arguments of a variadic function are stored to the // VarArgsFrameIndex on the stack so that they may be loaded by // dereferencing the result of va_next. for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { // Get an existing live-in vreg, or add a new one. unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); if (!VReg) VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 // is set. // The double arguments are stored to the VarArgsFrameIndex // on the stack. for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { // Get an existing live-in vreg, or add a new one. unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); if (!VReg) VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by eight for the next argument to store SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, SDValue ArgVal, const SDLoc &dl) const { if (Flags.isSExt()) ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); else if (Flags.isZExt()) ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, DAG.getValueType(ObjectVT)); return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); } SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. 
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof(VR); const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area // on its stack frame. In the ELFv1 ABI, this is always the case; // in the ELFv2 ABI, it is true if this is a vararg function or if // any parameter is located in a stack slot. bool HasParameterArea = !isELFv2ABI || isVarArg; unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = Num_FPR_Regs; unsigned AvailableVRs = Num_VR_Regs; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (Ins[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; } // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; EVT OrigVT = Ins[ArgNo].ArgVT; unsigned ObjSize = ObjectVT.getStoreSize(); unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Ins[ArgNo].isOrigArg()) { std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[ArgNo].getOrigArgIndex(); } // We re-align the argument offset for each argument, except when using the // fast calling convention, when we need to make sure we do that only when // we'll actually use a stack slot. unsigned CurArgOffset, Align; auto ComputeArgOffset = [&]() { /* Respect alignment of argument on the stack. */ Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; CurArgOffset = ArgOffset; }; if (CallConv != CallingConv::Fast) { ComputeArgOffset(); /* Compute GPR index associated with argument offset. */ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx = std::min(GPR_idx, Num_GPR_Regs); } // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); if (CallConv == CallingConv::Fast) ComputeArgOffset(); // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // Empty aggregate parameters do not take up registers. Examples: // struct { } a; // union { } b; // int c[0]; // etc. 
However, we have to provide a place-holder in InVals, so // pretend we have an 8-byte item at the current address for that // purpose. if (!ObjSize) { int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); continue; } // Create a stack object covering all stack doublewords occupied // by the argument. If the argument is (fully or partially) on // the stack, or if the argument is fully in registers but the // caller has allocated the parameter save anyway, we can refer // directly to the caller's stack frame. Otherwise, create a // local copy in our own frame. int FI; if (HasParameterArea || ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true); else FI = MFI.CreateStackObject(ArgSize, Align, false); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); // Handle aggregates smaller than 8 bytes. if (ObjSize < PtrByteSize) { // The value of the object is its address, which differs from the // address of the enclosing doubleword on big-endian systems. SDValue Arg = FIN; if (!isLittleEndian) { SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); } InVals.push_back(Arg); if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; if (ObjSize==1 || ObjSize==2 || ObjSize==4) { EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, MachinePointerInfo(&*FuncArg), ObjType); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area // slot. Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg)); } MemOps.push_back(Store); } // Whether we copied from a register or not, advance the offset // into the parameter save area by a full doubleword. ArgOffset += PtrByteSize; continue; } // The value of the object is its address, which is the address of // its first stack doubleword. InVals.push_back(FIN); // Store whatever pieces of the object are in registers to memory. for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { if (GPR_idx == Num_GPR_Regs) break; unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; if (j) { SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, MachinePointerInfo(&*FuncArg, j)); MemOps.push_back(Store); ++GPR_idx; } ArgOffset += ArgSize; continue; } switch (ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: case MVT::i64: if (Flags.isNest()) { // The 'nest' parameter, if any, is passed in R11. unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); break; } // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. 
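      // Illustrative example (assuming typical clang lowering): a small
      // homogeneous struct coerced to [2 x i64] reaches this point as two
      // separate i64 values, each consuming one GPR (or one doubleword of
      // the parameter save area) in turn.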
if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; ArgSize = PtrByteSize; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += 8; break; case MVT::f32: case MVT::f64: // These can be scalar arguments or elements of a float array type // passed directly. The latter are used to implement ELFv2 homogenous // float aggregates. if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasP8Vector() ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass); else VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 // once we support fp <-> gpr moves. // This can only ever happen in the presence of f32 array types, // since otherwise we never run out of FPRs before running out // of GPRs. unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, DAG.getConstant(32, dl, MVT::i32)); ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); } ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. if (CallConv != CallingConv::Fast || needsLoad) { ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; ArgOffset += ArgSize; if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: case MVT::f128: if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. if (VR_idx != Num_VR_Regs) { unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += 16; break; } // not QPX assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); /* fall through */ case MVT::v4f64: case MVT::v4i1: // QPX vectors are treated like their scalar floating-point subregisters // (except that they're larger). unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; if (QFPR_idx != Num_QFPR_Regs) { const TargetRegisterClass *RC; switch (ObjectVT.getSimpleVT().SimpleTy) { case MVT::v4f64: RC = &PPC::QFRCRegClass; break; case MVT::v4f32: RC = &PPC::QSRCRegClass; break; default: RC = &PPC::QBRCRegClass; break; } unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++QFPR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) ArgOffset += Sz; break; } // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type. if (needsLoad) { if (ObjSize < ArgSize && !isLittleEndian) CurArgOffset += ArgSize - ObjSize; int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); } InVals.push_back(ArgVal); } // Area that is at least reserved in the caller of this function. unsigned MinReservedArea; if (HasParameterArea) MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); else MinReservedArea = LinkageSize; // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( MFI.CreateFixedObject(PtrByteSize, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx < Num_GPR_Regs; ++GPR_idx) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } SDValue PPCTargetLowering::LowerFormalArguments_Darwin( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 
8 : 4; unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned Num_GPR_Regs = array_lengthof(GPR_32); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof( VR); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; // In 32-bit non-varargs functions, the stack space for vectors is after the // stack space for non-vectors. We do not use this space unless we have // too many vectors to fit in registers, something that only occurs in // constructed examples:), but we have to walk the arglist to figure // that out...for the pathological case, compute VecArgOffset as the // start of the vector parameter area. Computing VecArgOffset is the // entire point of the following loop. unsigned VecArgOffset = ArgOffset; if (!isVarArg && !isPPC64) { for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { EVT ObjectVT = Ins[ArgNo].VT; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Flags.isByVal()) { // ObjSize is the true size, ArgSize rounded up to multiple of regs. unsigned ObjSize = Flags.getByValSize(); unsigned ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; VecArgOffset += ArgSize; continue; } switch(ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: case MVT::f32: VecArgOffset += 4; break; case MVT::i64: // PPC64 case MVT::f64: // FIXME: We are guaranteed to be !isPPC64 at this point. // Does MVT::i64 apply? VecArgOffset += 8; break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: // Nothing to do, we're only looking at Nonvector args here. break; } } } // We've found where the vector parameter area in memory is. Skip the // first 12 parameters; these don't use that memory. VecArgOffset = ((VecArgOffset+15)/16)*16; VecArgOffset += 12*16; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. SmallVector MemOps; unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; if (Ins[ArgNo].isOrigArg()) { std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[ArgNo].getOrigArgIndex(); } unsigned CurArgOffset = ArgOffset; // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. 
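    // Worked example (illustrative): with MinReservedArea currently at 40
    // bytes, the rounding below gives ((40 + 15) / 16) * 16 == 48 before the
    // vector's own slot size is added.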
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { if (isVarArg || isPPC64) { MinReservedArea = ((MinReservedArea+15)/16)*16; MinReservedArea += CalculateStackSlotSize(ObjectVT, Flags, PtrByteSize); } else nAltivecParamsAtEnd++; } else // Calculate min reserved area. MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, Flags, PtrByteSize); // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. if (Flags.isByVal()) { assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); // ObjSize is the true size, ArgSize rounded up to multiple of registers. ObjSize = Flags.getByValSize(); ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; // Objects of size 1 and 2 are right justified, everything else is // left justified. This means the memory address is adjusted forwards. if (ObjSize==1 || ObjSize==2) { CurArgOffset = CurArgOffset + (4 - ObjSize); } // The value of the object is its address. int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); if (ObjSize==1 || ObjSize==2) { if (GPR_idx != Num_GPR_Regs) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg), ObjType); MemOps.push_back(Store); ++GPR_idx; } ArgOffset += PtrByteSize; continue; } for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { // Store whatever pieces of the object are in registers // to memory. ArgOffset will be the address of the beginning // of the object. if (GPR_idx != Num_GPR_Regs) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(&*FuncArg, j)); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; } else { ArgOffset += ArgSize - (ArgOffset-CurArgOffset); break; } } continue; } switch (ObjectVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unhandled argument type!"); case MVT::i1: case MVT::i32: if (!isPPC64) { if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); if (ObjectVT == MVT::i1) ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; } // All int arguments reserve stack space in the Darwin ABI. ArgOffset += PtrByteSize; break; } LLVM_FALLTHROUGH; case MVT::i64: // PPC64 if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; } // All int arguments reserve stack space in the Darwin ABI. 
ArgOffset += 8; break; case MVT::f32: case MVT::f64: // Every 4 bytes of argument space consumes one of the GPRs available for // argument passing. if (GPR_idx != Num_GPR_Regs) { ++GPR_idx; if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) ++GPR_idx; } if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++FPR_idx; } else { needsLoad = true; } // All FP arguments reserve stack space in the Darwin ABI. ArgOffset += isPPC64 ? 8 : ObjSize; break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: // Note that vector arguments in registers don't reserve stack space, // except in varargs functions. if (VR_idx != Num_VR_Regs) { unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); if (isVarArg) { while ((ArgOffset % 16) != 0) { ArgOffset += PtrByteSize; if (GPR_idx != Num_GPR_Regs) GPR_idx++; } ArgOffset += 16; GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? } ++VR_idx; } else { if (!isVarArg && !isPPC64) { // Vectors go after all the nonvectors. CurArgOffset = VecArgOffset; VecArgOffset += 16; } else { // Vectors are aligned. ArgOffset = ((ArgOffset+15)/16)*16; CurArgOffset = ArgOffset; ArgOffset += 16; } needsLoad = true; } break; } // We need to load the argument to a virtual register if we determined above // that we ran out of physical registers of the appropriate type. if (needsLoad) { int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset + (ArgSize - ObjSize), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); } InVals.push_back(ArgVal); } // Allow for Altivec parameters at the end, if needed. if (nAltivecParamsAtEnd) { MinReservedArea = ((MinReservedArea+15)/16)*16; MinReservedArea += 16*nAltivecParamsAtEnd; } // Area that is at least reserved in the caller of this function. MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. MinReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { int Depth = ArgOffset; FuncInfo->setVarArgsFrameIndex( MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, Depth, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. 
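    // Worked example (illustrative): if the fixed arguments consumed r3-r5
    // (GPR_idx == 3), the loop below spills r6-r10 (x6-x10 on ppc64) into the
    // vararg save area so they can be reloaded by dereferencing the result of
    // va_next.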
for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { unsigned VReg; if (isPPC64) VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); else VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); MemOps.push_back(Store); // Increment the address by four for the next argument to store SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); } } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be /// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, unsigned ParamSize) { if (!isTailCall) return 0; PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo(); unsigned CallerMinReservedArea = FI->getMinReservedArea(); int SPDiff = (int)CallerMinReservedArea - (int)ParamSize; // Remember only if the new adjustment is bigger. if (SPDiff < FI->getTailCallSPDelta()) FI->setTailCallSPDelta(SPDiff); return SPDiff; } static bool isFunctionGlobalAddress(SDValue Callee); static bool callsShareTOCBase(const Function *Caller, SDValue Callee, const TargetMachine &TM) { // If !G, Callee can be an external symbol. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G) return false; // The medium and large code models are expected to provide a sufficiently // large TOC to provide all data addressing needs of a module with a // single TOC. Since each module will be addressed with a single TOC then we // only need to check that caller and callee don't cross dso boundaries. if (CodeModel::Medium == TM.getCodeModel() || CodeModel::Large == TM.getCodeModel()) return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); // Otherwise we need to ensure callee and caller are in the same section, // since the linker may allocate multiple TOCs, and we don't know which // sections will belong to the same TOC base. const GlobalValue *GV = G->getGlobal(); if (!GV->isStrongDefinitionForLinker()) return false; // Any explicitly-specified sections and section prefixes must also match. // Also, if we're using -ffunction-sections, then each function is always in // a different section (the same is true for COMDAT functions). if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() || GV->getSection() != Caller->getSection()) return false; if (const auto *F = dyn_cast(GV)) { if (F->getSectionPrefix() != Caller->getSectionPrefix()) return false; } // If the callee might be interposed, then we can't assume the ultimate call // target will be in the same section. Even in cases where we can assume that // interposition won't happen, in any case where the linker might insert a // stub to allow for interposition, we must generate code as though // interposition might occur. To understand why this matters, consider a // situation where: a -> b -> c where the arrows indicate calls. b and c are // in the same section, but a is in a different module (i.e. has a different // TOC base pointer). If the linker allows for interposition between b and c, // then it will generate a stub for the call edge between b and c which will // save the TOC pointer into the designated stack slot allocated by b. 
If we // return true here, and therefore allow a tail call between b and c, that // stack slot won't exist and the b -> c stub will end up saving b'c TOC base // pointer into the stack slot allocated by a (where the a -> b stub saved // a's TOC base pointer). If we're not considering a tail call, but rather, // whether a nop is needed after the call instruction in b, because the linker // will insert a stub, it might complain about a missing nop if we omit it // (although many don't complain in this case). if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV)) return false; return true; } static bool needStackSlotPassParameters(const PPCSubtarget &Subtarget, const SmallVectorImpl &Outs) { assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); const unsigned PtrByteSize = 8; const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const unsigned ParamAreaSize = NumGPRs * PtrByteSize; unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = NumFPRs; unsigned AvailableVRs = NumVRs; for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) return true; } return false; } static bool hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { if (CS.arg_size() != CallerFn->arg_size()) return false; ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin(); ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end(); Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { const Value* CalleeArg = *CalleeArgIter; const Value* CallerArg = &(*CallerArgIter); if (CalleeArg == CallerArg) continue; // e.g. @caller([4 x i64] %a, [4 x i64] %b) { // tail call @callee([4 x i64] undef, [4 x i64] %b) // } // 1st argument of callee is undef and has the same type as caller. if (CalleeArg->getType() == CallerArg->getType() && isa(CalleeArg)) continue; return false; } return true; } // Returns true if TCO is possible between the callers and callees // calling conventions. static bool areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC, CallingConv::ID CalleeCC) { // Tail calls are possible with fastcc and ccc. auto isTailCallableCC = [] (CallingConv::ID CC){ return CC == CallingConv::C || CC == CallingConv::Fast; }; if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC)) return false; // We can safely tail call both fastcc and ccc callees from a c calling // convention caller. If the caller is fastcc, we may have less stack space // than a non-fastcc caller with the same signature so disable tail-calls in // that case. 
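  // Illustrative truth table for the check below:
  //   caller C,    callee C    -> eligible
  //   caller C,    callee Fast -> eligible
  //   caller Fast, callee Fast -> eligible
  //   caller Fast, callee C    -> rejected (caller may have less stack space)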
return CallerCC == CallingConv::C || CallerCC == CalleeCC; } bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; if (DisableSCO && !TailCallOpt) return false; // Variadic argument functions are not supported. if (isVarArg) return false; auto &Caller = DAG.getMachineFunction().getFunction(); // Check that the calling conventions are compatible for tco. if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC)) return false; // Caller contains any byval parameter is not supported. if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); })) return false; // Callee contains any byval parameter is not supported, too. // Note: This is a quick work around, because in some cases, e.g. // caller's stack size > callee's stack size, we are still able to apply // sibling call optimization. For example, gcc is able to do SCO for caller1 // in the following example, but not for caller2. // struct test { // long int a; // char ary[56]; // } gTest; // __attribute__((noinline)) int callee(struct test v, struct test *b) { // b->a = v.a; // return 0; // } // void caller1(struct test a, struct test c, struct test *b) { // callee(gTest, b); } // void caller2(struct test *b) { callee(gTest, b); } if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); })) return false; // If callee and caller use different calling conventions, we cannot pass // parameters on stack since offsets for the parameter area may be different. if (Caller.getCallingConv() != CalleeCC && needStackSlotPassParameters(Subtarget, Outs)) return false; // No TCO/SCO on indirect call because Caller have to restore its TOC if (!isFunctionGlobalAddress(Callee) && !isa(Callee)) return false; // If the caller and callee potentially have different TOC bases then we // cannot tail call since we need to restore the TOC pointer after the call. // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 if (!callsShareTOCBase(&Caller, Callee, getTargetMachine())) return false; // TCO allows altering callee ABI, so we don't have to check further. if (CalleeCC == CallingConv::Fast && TailCallOpt) return true; if (DisableSCO) return false; // If callee use the same argument list that caller is using, then we can // apply SCO on this case. If it is not, then we need to check if callee needs // stack for passing arguments. if (!hasSameArgumentList(&Caller, CS) && needStackSlotPassParameters(Subtarget, Outs)) { return false; } return true; } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { if (!getTargetMachine().Options.GuaranteedTailCallOpt) return false; // Variable argument functions are not supported. if (isVarArg) return false; MachineFunction &MF = DAG.getMachineFunction(); CallingConv::ID CallerCC = MF.getFunction().getCallingConv(); if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { // Functions containing by val parameters are not supported. 
for (unsigned i = 0; i != Ins.size(); i++) { ISD::ArgFlagsTy Flags = Ins[i].Flags; if (Flags.isByVal()) return false; } // Non-PIC/GOT tail calls are supported. if (getTargetMachine().getRelocationModel() != Reloc::PIC_) return true; // At the moment we can only do local tail calls (in same module, hidden // or protected) if we are generating PIC. if (GlobalAddressSDNode *G = dyn_cast(Callee)) return G->getGlobal()->hasHiddenVisibility() || G->getGlobal()->hasProtectedVisibility(); } return false; } /// isCallCompatibleAddress - Return the immediate to use if the specified /// 32-bit value is representable in the immediate field of a BxA instruction. static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { ConstantSDNode *C = dyn_cast(Op); if (!C) return nullptr; int Addr = C->getZExtValue(); if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. SignExtend32<26>(Addr) != Addr) return nullptr; // Top 6 bits have to be sext of immediate. return DAG .getConstant( (int)C->getZExtValue() >> 2, SDLoc(Op), DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())) .getNode(); } namespace { struct TailCallArgumentInfo { SDValue Arg; SDValue FrameIdxOp; int FrameIdx = 0; TailCallArgumentInfo() = default; }; } // end anonymous namespace /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. static void StoreTailCallArgumentsToStackSlot( SelectionDAG &DAG, SDValue Chain, const SmallVectorImpl &TailCallArgs, SmallVectorImpl &MemOpChains, const SDLoc &dl) { for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { SDValue Arg = TailCallArgs[i].Arg; SDValue FIN = TailCallArgs[i].FrameIdxOp; int FI = TailCallArgs[i].FrameIdx; // Store relative to framepointer. MemOpChains.push_back(DAG.getStore( Chain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } /// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to /// the appropriate stack slot for the tail call optimized function call. static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, SDValue OldRetAddr, SDValue OldFP, int SPDiff, const SDLoc &dl) { if (SPDiff) { // Calculate the new stack slot for the return address. MachineFunction &MF = DAG.getMachineFunction(); const PPCSubtarget &Subtarget = MF.getSubtarget(); const PPCFrameLowering *FL = Subtarget.getFrameLowering(); bool isPPC64 = Subtarget.isPPC64(); int SlotSize = isPPC64 ? 8 : 4; int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(MF, NewRetAddr)); // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. if (Subtarget.isDarwinABI()) { int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewFPIdx)); } } return Chain; } /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate /// the position of the argument. 
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
                         SDValue Arg, int SPDiff, unsigned ArgOffset,
                         SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  int Offset = ArgOffset + SPDiff;
  uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
  SDValue FIN = DAG.getFrameIndex(FI, VT);
  TailCallArgumentInfo Info;
  Info.Arg = Arg;
  Info.FrameIdxOp = FIN;
  Info.FrameIdx = FI;
  TailCallArguments.push_back(Info);
}

/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
/// stack slot. Returns the chain as result and the loaded frame pointers in
/// LROpOut/FPOpout. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
    SDValue &FPOpOut, const SDLoc &dl) const {
  if (SPDiff) {
    // Load the LR and FP stack slot for later adjusting.
    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    LROpOut = getReturnAddrFrameIndex(DAG);
    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
    Chain = SDValue(LROpOut.getNode(), 1);

    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
    // slot as the FP is never overwritten.
    if (Subtarget.isDarwinABI()) {
      FPOpOut = getFramePointerFrameIndex(DAG);
      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
      Chain = SDValue(FPOpOut.getNode(), 1);
    }
  }
  return Chain;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" of size "Size". Alignment information is
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       false, false, false, MachinePointerInfo(),
                       MachinePointerInfo());
}

/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
/// tail calls.
static void LowerMemOpCallTo(
    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  if (!isTailCall) {
    if (isVector) {
      SDValue StackPtr;
      if (isPPC64)
        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
      else
        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                           DAG.getConstant(ArgOffset, dl, PtrVT));
    }
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
    // Calculate and remember argument location.
  } else
    CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
                             TailCallArguments);
}

static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
                SDValue FPOp,
                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
  // might overwrite each other in case of tail call optimization.
SmallVector MemOpChains2; // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, MemOpChains2, dl); if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl); // Emit callseq_end just before tailcall node. Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Is this global address that of a function that can be called by name? (as // opposed to something that must hold a descriptor for an indirect call). static bool isFunctionGlobalAddress(SDValue Callee) { if (GlobalAddressSDNode *G = dyn_cast(Callee)) { if (Callee.getOpcode() == ISD::GlobalTLSAddress || Callee.getOpcode() == ISD::TargetGlobalTLSAddress) return false; return G->getGlobal()->getValueType()->isFunctionTy(); } return false; } static unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall, bool isPatchPoint, bool hasNest, SmallVectorImpl> &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, ImmutableCallSite CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); NodeTys.push_back(MVT::Other); // Returns a chain NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. unsigned CallOpc = PPCISD::CALL; bool needIndirectCall = true; if (!isSVR4ABI || !isPPC64) if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { // If this is an absolute destination address, use the munged value. Callee = SDValue(Dest, 0); needIndirectCall = false; } // PC-relative references to external symbols should go through $stub, unless // we're building with the leopard linker or later, which automatically // synthesizes these stubs. const TargetMachine &TM = DAG.getTarget(); const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); const GlobalValue *GV = nullptr; if (auto *G = dyn_cast(Callee)) GV = G->getGlobal(); bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; if (isFunctionGlobalAddress(Callee)) { GlobalAddressSDNode *G = cast(Callee); // A call to a TLS address is actually an indirect call to a // thread-specific pointer. unsigned OpFlags = 0; if (UsePlt) OpFlags = PPCII::MO_PLT; // If the callee is a GlobalAddress/ExternalSymbol node (quite common, // every direct call is) turn it into a TargetGlobalAddress / // TargetExternalSymbol node so that legalize doesn't hack it. Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, Callee.getValueType(), 0, OpFlags); needIndirectCall = false; } if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; if (UsePlt) OpFlags = PPCII::MO_PLT; Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), OpFlags); needIndirectCall = false; } if (isPatchPoint) { // We'll form an invalid direct call when lowering a patchpoint; the full // sequence for an indirect call is complicated, and many of the // instructions introduced might have side effects (and, thus, can't be // removed later). 
The call itself will be removed as soon as the // argument/return lowering is complete, so the fact that it has the wrong // kind of operands should not really matter. needIndirectCall = false; } if (needIndirectCall) { // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; if (isSVR4ABI && isPPC64 && !isELFv2ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). // The function descriptor is a three doubleword structure with the // following fields: function entry point, TOC base address and // environment pointer. // Thus for a call through a function pointer, the following actions need // to be performed: // 1. Save the TOC of the caller in the TOC save area of its stack // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). // 2. Load the address of the function entry point from the function // descriptor. // 3. Load the TOC of the callee from the function descriptor into r2. // 4. Load the environment pointer from the function descriptor into // r11. // 5. Branch to the function entry point address. // 6. On return of the callee, the TOC of the caller needs to be // restored (this is done in FinishCall()). // // The loads are scheduled at the beginning of the call sequence, and the // register copies are flagged together to ensure that no other // operations can be scheduled in between. E.g. without flagging the // copies together, a TOC access in the caller could be scheduled between // the assignment of the callee TOC and the branch to the callee, which // results in the TOC access going through the TOC of the callee instead // of going through the TOC of the caller, which leads to incorrect code. // Load the address of the function entry point from the function // descriptor. SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); if (LDChain.getValueType() == MVT::Glue) LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() ? (MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant) : MachineMemOperand::MONone; MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, /* Alignment = */ 8, MMOFlags); // Load environment pointer into r11. SDValue PtrOff = DAG.getIntPtrConstant(16, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); SDValue LoadEnvPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), /* Alignment = */ 8, MMOFlags); SDValue TOCOff = DAG.getIntPtrConstant(8, dl); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); SDValue TOCPtr = DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), /* Alignment = */ 8, MMOFlags); setUsesTOCBasePtr(DAG); SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, InFlag); Chain = TOCVal.getValue(0); InFlag = TOCVal.getValue(1); // If the function call has an explicit 'nest' parameter, it takes the // place of the environment pointer. 
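      // For reference, the descriptor layout assumed by the loads above is
      // roughly:
      //   +0  : function entry point   (moved into CTR via LoadFuncPtr)
      //   +8  : TOC base               (copied into r2)
      //   +16 : environment pointer    (copied into r11 unless 'nest' is used)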
      if (!hasNest) {
        SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
                                          InFlag);
        Chain = EnvVal.getValue(0);
        InFlag = EnvVal.getValue(1);
      }

      MTCTROps[0] = Chain;
      MTCTROps[1] = LoadFuncPtr;
      MTCTROps[2] = InFlag;
    }

    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
                        makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
    InFlag = Chain.getValue(1);

    NodeTys.clear();
    NodeTys.push_back(MVT::Other);
    NodeTys.push_back(MVT::Glue);
    Ops.push_back(Chain);
    CallOpc = PPCISD::BCTRL;
    Callee.setNode(nullptr);
    // Add use of X11 (holding environment pointer)
    if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest)
      Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
    // Add CTR register as callee so a bctr can be emitted later.
    if (isTailCall)
      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
  }

  // If this is a direct call, pass the chain and the callee.
  if (Callee.getNode()) {
    Ops.push_back(Chain);
    Ops.push_back(Callee);
  }
  // If this is a tail call add stack pointer delta.
  if (isTailCall)
    Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
  // into the call.
  if (isSVR4ABI && isPPC64 && !isPatchPoint) {
    setUsesTOCBasePtr(DAG);
    Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
  }

  return CallOpc;
}

SDValue PPCTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext());

  CCRetInfo.AnalyzeCallResult(
      Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
               ? RetCC_PPC_Cold
               : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val = DAG.getCopyFromReg(Chain, dl,
                                     VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

SDValue PPCTargetLowering::FinishCall(
    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
  std::vector<EVT> NodeTys;
  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
                                 SPDiff, isTailCall, isPatchPoint, hasNest,
                                 RegsToPass, Ops, NodeTys, CS, Subtarget);

  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops = (CallConv == CallingConv::Fast &&
                         getTargetMachine().Options.GuaranteedTailCallOpt)
                            ? NumBytes
                            : 0;

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *Mask =
      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  // Emit tail call.
  if (isTailCall) {
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee)) &&
           "Expecting a global address, external symbol, absolute value or "
           "register");

    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
  }

  // Add a NOP immediately after the branch instruction when using the 64-bit
  // SVR4 ABI. At link time, if caller and callee are in a different module and
  // thus have a different TOC, the call will be replaced with a call to a stub
  // function which saves the current TOC, loads the TOC of the callee and
  // branches to the callee. The NOP will be replaced with a load instruction
  // which restores the TOC of the caller from the TOC save slot of the current
  // stack frame. If caller and callee belong to the same module (and have the
  // same TOC), the NOP will remain unchanged.
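  // A cross-module call therefore typically ends up looking roughly like
  // (offsets assume the usual ELFv1/ELFv2 TOC save slots):
  //   bl callee     # possibly redirected to a linker-generated stub
  //   nop           # rewritten by the linker to: ld r2, 40(r1) (ELFv1)
  //                 #                         or: ld r2, 24(r1) (ELFv2)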
MachineFunction &MF = DAG.getMachineFunction(); if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && !isPatchPoint) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. // We are using a target-specific load with r2 hard coded, because the // result of a target-independent load would never go directly into r2, // since r2 is a reserved register (which prevents the register allocator // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); // The address needs to go after the chain input but before the flag (or // any other variadic arguments). Ops.insert(std::next(Ops.begin()), AddTOC); } else if (CallOpc == PPCISD::CALL && !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { // Otherwise insert NOP for non-local calls. CallOpc = PPCISD::CALL_NOP; } } Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(BytesCalleePops, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals); } SDValue PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; ImmutableCallSite CS = CLI.CS; if (isTailCall) { if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) isTailCall = false; else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) isTailCall = IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, isVarArg, Outs, Ins, DAG); else isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); if (isTailCall) { ++NumTailCalls; if (!getTargetMachine().Options.GuaranteedTailCallOpt) ++NumSiblingCalls; assert(isa(Callee) && "Callee should be an llvm::Function object."); LLVM_DEBUG( const GlobalValue *GV = cast(Callee)->getGlobal(); const unsigned Width = 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0"); dbgs() << "TCO caller: " << left_justify(DAG.getMachineFunction().getName(), Width) << ", callee linkage: " << GV->getVisibility() << ", " << GV->getLinkage() << "\n"); } } if (!isTailCall && CS && CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // When long calls (i.e. indirect calls) are always used, calls are always // made via function pointer. If we have a function name, first translate it // into a pointer. 
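  // (Subtarget.useLongCalls() is typically enabled via the 'longcall' target
  // feature or function attribute, i.e. -mlongcall-style code generation.)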
if (Subtarget.useLongCalls() && isa(Callee) && !isTailCall) Callee = LowerGlobalAddress(Callee, DAG); if (Subtarget.isSVR4ABI()) { if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); else return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); } return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, isTailCall, isPatchPoint, Outs, OutVals, Ins, dl, DAG, InVals, CS); } SDValue PPCTargetLowering::LowerCall_32SVR4( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold || CallConv == CallingConv::Fast) && "Unknown calling convention!"); unsigned PtrByteSize = 4; MachineFunction &MF = DAG.getMachineFunction(); // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage // area, parameter list area and the part of the local variable space which // contains copies of aggregates which are passed by value. // Assign locations to all of the outgoing arguments. SmallVector ArgLocs; PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), PtrByteSize); if (useSoftFloat()) CCInfo.PreAnalyzeCallOperands(Outs); if (isVarArg) { // Handle fixed and variable vector arguments differently. // Fixed vector arguments go into registers as long as registers are // available. Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; bool Result; if (Outs[i].IsFixed) { Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } else { Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); } if (Result) { #ifndef NDEBUG errs() << "Call operand #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"; #endif llvm_unreachable(nullptr); } } } else { // All arguments are treated the same. CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); } CCInfo.clearWasPPCF128(); // Assign locations to all of the outgoing aggregate by value arguments. SmallVector ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. 
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); // Size of the linkage area, parameter list area and the part of the local // space variable where copies of aggregates which are passed by value are // stored. unsigned NumBytes = CCByValInfo.getNextStackOffset(); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; bool seenFloatArg = false; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isByVal()) { // Argument is an aggregate which is passed by value, thus we need to // create a copy of it in the local variable space of the current stack // frame (which is the stack frame of the caller) and pass the address of // this copy to the callee. assert((j < ByValArgLocs.size()) && "Index out of bounds!"); CCValAssign &ByValVA = ByValArgLocs[j++]; assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); // Memory reserved in the local variable space of the callers stack frame. unsigned LocMemOffset = ByValVA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); // Create a copy of the argument in the local area of the current // stack frame. SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; // Pass the address of the aggregate copy on the stack either in a // physical register or in the parameter list area of the current stack // frame to the callee. Arg = PtrOff; } if (VA.isRegLoc()) { if (Arg.getValueType() == MVT::i1) Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { // Put argument in the parameter list area of the current stack frame. assert(VA.isMemLoc()); unsigned LocMemOffset = VA.getLocMemOffset(); if (!isTailCall) { SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); MemOpChains.push_back( DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); } else { // Calculate and remember argument location. 
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, TailCallArguments); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // Set CR bit 6 to true if this is a vararg call with floating args passed in // registers. if (isVarArg) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, InFlag }; Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); InFlag = Chain.getValue(1); } if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } // Copy an argument into memory, being careful to do this outside the // call sequence for the call to which the argument belongs. SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) const { SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; } SDValue PPCTargetLowering::LowerCall_64SVR4( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); bool hasNest = false; bool IsSibCall = false; EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) IsSibCall = true; // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage // area is 32 bytes reserved space for [SP][CR][LR][TOC]. 
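  // Laid out as doublewords from the stack pointer, the linkage area is
  // roughly:
  //   ELFv1: SP@0, CR@8, LR@16, reserved@24..39, TOC@40   (48 bytes total)
  //   ELFv2: SP@0, CR@8, LR@16, TOC@24                    (32 bytes total)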
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. // For the fast calling convention, there is another check below. // Note: We should keep consistent with LowerFormalArguments_64SVR4() bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; if (!HasParameterArea) { unsigned ParamAreaSize = NumGPRs * PtrByteSize; unsigned AvailableFPRs = NumFPRs; unsigned AvailableVRs = NumVRs; unsigned NumBytesTmp = NumBytes; for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytesTmp, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; } } // When using the fast calling convention, we don't provide backing for // arguments that will be in registers. unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; // Avoid allocating parameter area for fastcc functions if all the arguments // can be passed in the registers. if (CallConv == CallingConv::Fast) HasParameterArea = false; // Add up all the space actually used. for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; if (Flags.isNest()) continue; if (CallConv == CallingConv::Fast) { if (Flags.isByVal()) { NumGPRsUsed += (Flags.getByValSize()+7)/8; if (NumGPRsUsed > NumGPRs) HasParameterArea = true; } else { switch (ArgVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected ValueType for argument!"); case MVT::i1: case MVT::i32: case MVT::i64: if (++NumGPRsUsed <= NumGPRs) continue; break; case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: case MVT::f128: if (++NumVRsUsed <= NumVRs) continue; break; case MVT::v4f32: // When using QPX, this is handled like a FP register, otherwise, it // is an Altivec register. if (Subtarget.hasQPX()) { if (++NumFPRsUsed <= NumFPRs) continue; } else { if (++NumVRsUsed <= NumVRs) continue; } break; case MVT::f32: case MVT::f64: case MVT::v4f64: // QPX case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; } HasParameterArea = true; } } /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); NumBytes = ((NumBytes + Align - 1) / Align) * Align; NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); if (Flags.isInConsecutiveRegsLast()) NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } unsigned NumBytesActuallyUsed = NumBytes; // In the old ELFv1 ABI, // the prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. 
// Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. // In the ELFv2 ABI, we allocate the parameter area iff a callee // really requires memory operands, e.g. a vararg function. if (HasParameterArea) NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); else NumBytes = LinkageSize; // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); int SPDiff = 0; // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. if (!IsSibCall) SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. if (isTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; // We re-align the argument offset for each argument, except when using the // fast calling convention, when we need to make sure we do that only when // we'll actually use a stack slot. auto ComputePtrOff = [&]() { /* Respect alignment of argument on the stack. */ unsigned Align = CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); }; if (CallConv != CallingConv::Fast) { ComputePtrOff(); /* Compute GPR index associated with argument offset. */ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx = std::min(GPR_idx, NumGPRs); } // Promote integers to 64-bit values. if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); } // FIXME memcpy is used way more than necessary. Correctness first. // Note: "by value" is code for passing a structure by value, not // basic types. 
if (Flags.isByVal()) { // Note: Size includes alignment padding, so // struct x { short a; char b; } // will have Size = 4. With #pragma pack(1), it will have Size = 3. // These are the proper values we need for right-justifying the // aggregate in a parameter register. unsigned Size = Flags.getByValSize(); // An empty aggregate parameter takes up no storage and no // registers. if (Size == 0) continue; if (CallConv == CallingConv::Fast) ComputePtrOff(); // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; continue; } } if (GPR_idx == NumGPRs && Size < 8) { SDValue AddPtr = PtrOff; if (!isLittleEndian) { SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, PtrOff.getValueType()); AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); ArgOffset += PtrByteSize; continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) // FIXME: The above statement is likely due to a misunderstanding of the // documents. All arguments must be copied into the parameter area BY // THE CALLEE in the event that the callee takes the address of any // formal argument. That has not yet been implemented. However, it is // reasonable to use the stack area as a staging area for the register // load. // Skip this for small aggregates, as we will use the same slot for a // right-justified copy, below. if (Size >= 8) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); // When a register is available, pass a small aggregate right-justified. if (Size < 8 && GPR_idx != NumGPRs) { // The easiest way to get this right-justified in a register // is to copy the structure into the rightmost portion of a // local variable slot, then load the whole slot into the // register. // FIXME: The memcpy seems to produce pretty awful code for // small aggregates, particularly for packed ones. // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. SDValue AddPtr = PtrOff; if (!isLittleEndian) { SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); // Load the slot into the register. SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); // Done with this argument. ArgOffset += PtrByteSize; continue; } // For aggregates larger than PtrByteSize, copy the pieces of the // object that fit into registers from the parameter save area. for (unsigned j=0; j gpr moves. // In the non-vararg case, this can only ever happen in the // presence of f32 array types, since otherwise we never run // out of FPRs before running out of GPRs. SDValue ArgVal; // Double values are always passed in a single GPR. 
if (Arg.getValueType() != MVT::f32) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); // Non-array float values are extended and passed in a GPR. } else if (!Flags.isInConsecutiveRegs()) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); // If we have an array of floats, we collect every odd element // together with its predecessor into one GPR. } else if (ArgOffset % PtrByteSize != 0) { SDValue Lo, Hi; Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); if (!isLittleEndian) std::swap(Lo, Hi); ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); // The final element, if even, goes into the first half of a GPR. } else if (Flags.isInConsecutiveRegsLast()) { ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); if (!isLittleEndian) ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, DAG.getConstant(32, dl, MVT::i32)); // Non-final even elements are skipped; they will be handled // together the with subsequent argument on the next go-around. } else ArgVal = SDValue(); if (ArgVal.getNode()) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. if (Arg.getValueType() == MVT::f32 && !isLittleEndian && !Flags.isInConsecutiveRegs()) { SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); NeededLoad = true; } // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. if (CallConv != CallingConv::Fast || NeededLoad) { ArgOffset += (Arg.getValueType() == MVT::f32 && Flags.isInConsecutiveRegs()) ? 4 : 8; if (Flags.isInConsecutiveRegsLast()) ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } break; } case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: case MVT::v1i128: case MVT::f128: if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. // For a varargs call, named arguments go into VRs or on the stack as // usual; unnamed arguments always go to the stack or the corresponding // GPRs when within range. For now, we always put the value in both // locations (or even all three). if (isVarArg) { assert(HasParameterArea && "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. 
SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); if (VR_idx != NumVRs) { SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); } ArgOffset += 16; for (unsigned i=0; i<16; i+=PtrByteSize) { if (GPR_idx == NumGPRs) break; SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, DAG.getConstant(i, dl, PtrVT)); SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); } break; } // Non-varargs Altivec params go into VRs or on the stack. if (VR_idx != NumVRs) { RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); if (CallConv == CallingConv::Fast) ArgOffset += 16; } if (CallConv != CallingConv::Fast) ArgOffset += 16; break; } // not QPX assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && "Invalid QPX parameter type"); /* fall through */ case MVT::v4f64: case MVT::v4i1: { bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; if (isVarArg) { assert(HasParameterArea && "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); if (QFPR_idx != NumQFPRs) { SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); } ArgOffset += (IsF32 ? 16 : 32); for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { if (GPR_idx == NumGPRs) break; SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, DAG.getConstant(i, dl, PtrVT)); SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); } break; } // Non-varargs QPX params go into registers or on the stack. if (QFPR_idx != NumQFPRs) { RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); } else { if (CallConv == CallingConv::Fast) ComputePtrOff(); assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); if (CallConv == CallingConv::Fast) ArgOffset += (IsF32 ? 16 : 32); } if (CallConv != CallingConv::Fast) ArgOffset += (IsF32 ? 16 : 32); break; } } } assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && "mismatch in size of parameter area"); (void)NumBytesActuallyUsed; if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Check if this is an indirect call (MTCTR/BCTRL). // See PrepareCall() for more information about calls through function // pointers in the 64-bit SVR4 ABI. if (!isTailCall && !isPatchPoint && !isFunctionGlobalAddress(Callee) && !isa(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. 
setUsesTOCBasePtr(DAG); SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore( Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset)); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. if (isELFv2ABI && !isPatchPoint) RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (isTailCall && !IsSibCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } SDValue PPCTargetLowering::LowerCall_Darwin( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; unsigned PtrByteSize = isPPC64 ? 8 : 4; MachineFunction &MF = DAG.getMachineFunction(); // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) MF.getInfo()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; // Add up all the space actually used. // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually // they all go in registers, but we must reserve stack space for them for // possible use by the caller. In varargs or 64-bit calls, parameters are // assigned stack space in order, with padding so Altivec parameters are // 16-byte aligned. unsigned nAltivecParamsAtEnd = 0; for (unsigned i = 0; i != NumOps; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; EVT ArgVT = Outs[i].VT; // Varargs Altivec parameters are padded to a 16 byte boundary. 
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { if (!isVarArg && !isPPC64) { // Non-varargs Altivec parameters go after all the non-Altivec // parameters; handle those later so we know how much padding we need. nAltivecParamsAtEnd++; continue; } // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. NumBytes = ((NumBytes+15)/16)*16; } NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); } // Allow for Altivec parameters at the end, if needed. if (nAltivecParamsAtEnd) { NumBytes = ((NumBytes+15)/16)*16; NumBytes += 16*nAltivecParamsAtEnd; } // The prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. if (isTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else // later. SDValue LROp, FPOp; Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); // Set up a copy of the stack pointer for use loading and storing any // arguments that may not fit in the registers available for argument // passing. SDValue StackPtr; if (isPPC64) StackPtr = DAG.getRegister(PPC::X1, MVT::i64); else StackPtr = DAG.getRegister(PPC::R1, MVT::i32); // Figure out which arguments are going to go in registers, and which in // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; const unsigned NumGPRs = array_lengthof(GPR_32); const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); const MCPhysReg *GPR = isPPC64 ? 
GPR_64 : GPR_32; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; SmallVector MemOpChains; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); // On PPC64, promote integers to 64-bit values. if (isPPC64 && Arg.getValueType() == MVT::i32) { // FIXME: Should this use ANY_EXTEND if neither sext nor zext? unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); } // FIXME memcpy is used way more than necessary. Correctness first. // Note: "by value" is code for passing a structure by value, not // basic types. if (Flags.isByVal()) { unsigned Size = Flags.getByValSize(); // Very small objects are passed right-justified. Everything else is // passed left-justified. if (Size==1 || Size==2) { EVT VT = (Size==1) ? MVT::i8 : MVT::i16; if (GPR_idx != NumGPRs) { SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, MachinePointerInfo(), VT); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); ArgOffset += PtrByteSize; } else { SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, PtrOff.getValueType()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); ArgOffset += PtrByteSize; } continue; } // Copy entire object into memory. There are cases where gcc-generated // code assumes it is there, even if it could be put entirely into // registers. (This is not what the doc says.) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); // For small aggregates (Darwin only) and aggregates >= PtrByteSize, // copy the pieces of the object that fit into registers from the // parameter save area. for (unsigned j=0; j NumVRs) { unsigned j = 0; // Offset is aligned; skip 1st 12 params which go in V registers. ArgOffset = ((ArgOffset+15)/16)*16; ArgOffset += 12*16; for (unsigned i = 0; i != NumOps; ++i) { SDValue Arg = OutVals[i]; EVT ArgType = Outs[i].VT; if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { if (++j > NumVRs) { SDValue PtrOff; // We are emitting Altivec params in order. LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, isPPC64, isTailCall, true, MemOpChains, TailCallArguments, dl); ArgOffset += 16; } } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // On Darwin, R12 must contain the address of an indirect callee. This does // not mean the MTCTR instruction must use R12; it's easier to model this as // an extra parameter, so do that. if (!isTailCall && !isFunctionGlobalAddress(Callee) && !isa(Callee) && !isBLACompatibleAddress(Callee, DAG)) RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : PPC::R12), Callee)); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
                    /* unused except on PPC64 ELFv1 */ false, DAG,
                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
                    NumBytes, Ins, InVals, CS);
}

bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(
      Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                ? RetCC_PPC_Cold
                : RetCC_PPC);
}

SDValue
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs,
                       (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                           ? RetCC_PPC_Cold
                           : RetCC_PPC);

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[i];

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (PPC::G8RCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (PPC::F8RCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else if (PPC::CRRCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i1));
      else if (PPC::VRRCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::Other));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}

SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(IntVT);
  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}

SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. bool isPPC64 = Subtarget.isPPC64(); unsigned SP = isPPC64 ? PPC::X1 : PPC::R1; SDValue StackPtr = DAG.getRegister(SP, PtrVT); // Get the operands for the STACKRESTORE. SDValue Chain = Op.getOperand(0); SDValue SaveSP = Op.getOperand(1); // Load the old link SP. SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo()); // Restore the stack pointer. Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP); // Store the old link SP. return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo()); } SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. PPCFunctionInfo *FI = MF.getInfo(); int RASI = FI->getReturnAddrSaveIndex(); // If the frame pointer save index hasn't been defined yet. if (!RASI) { // Find out what the fix offset of the frame pointer save area. int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset(); // Allocate the frame index for frame pointer save area. RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false); // Save the result. FI->setReturnAddrSaveIndex(RASI); } return DAG.getFrameIndex(RASI, PtrVT); } SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. PPCFunctionInfo *FI = MF.getInfo(); int FPSI = FI->getFramePointerSaveIndex(); // If the frame pointer save index hasn't been defined yet. if (!FPSI) { // Find out what the fix offset of the frame pointer save area. int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset(); // Allocate the frame index for frame pointer save area. FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); // Save the result. FI->setFramePointerSaveIndex(FPSI); } return DAG.getFrameIndex(FPSI, PtrVT); } SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); SDLoc dl(Op); // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Negate the size. SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, DAG.getConstant(0, dl, PtrVT), Size); // Construct a node for the frame pointer save index. SDValue FPSIdx = getFramePointerFrameIndex(DAG); // Build a DYNALLOC node. SDValue Ops[3] = { Chain, NegSize, FPSIdx }; SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); } SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 
8 : 4, 0, false); return DAG.getFrameIndex(FI, PtrVT); } SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorLoad(Op, DAG); assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 loads"); // First, load 8 bits into 32 bits, then truncate to 1 bit. SDLoc dl(Op); LoadSDNode *LD = cast(Op); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); MachineMemOperand *MMO = LD->getMemOperand(); SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, BasePtr, MVT::i8, MMO); SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; return DAG.getMergeValues(Ops, dl); } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(1).getValueType().isVector()) return LowerVectorStore(Op, DAG); assert(Op.getOperand(1).getValueType() == MVT::i1 && "Custom lowering only for i1 stores"); // First, zero extend to 32 bits, then use a truncating store to 8 bits. SDLoc dl(Op); StoreSDNode *ST = cast(Op); SDValue Chain = ST->getChain(); SDValue BasePtr = ST->getBasePtr(); SDValue Value = ST->getValue(); MachineMemOperand *MMO = ST->getMemOperand(); Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), Value); return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); } // FIXME: Remove this once the ANDI glue bug is fixed: SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 results"); SDLoc DL(Op); return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, Op.getOperand(0)); } /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Not FP? Not a fsel. if (!Op.getOperand(0).getValueType().isFloatingPoint() || !Op.getOperand(2).getValueType().isFloatingPoint()) return Op; // We might be able to do better than this under some circumstances, but in // general, fsel-based lowering of select is a finite-math-only optimization. // For more information, see section F.3 of the 2.06 ISA specification. if (!DAG.getTarget().Options.NoInfsFPMath || !DAG.getTarget().Options.NoNaNsFPMath) return Op; // TODO: Propagate flags from the select rather than global settings. SDNodeFlags Flags; Flags.setNoInfs(true); Flags.setNoNaNs(true); ISD::CondCode CC = cast(Op.getOperand(4))->get(); EVT ResVT = Op.getValueType(); EVT CmpVT = Op.getOperand(0).getValueType(); SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); SDLoc dl(Op); // If the RHS of the comparison is a 0.0, we don't need to do the // subtraction at all. SDValue Sel1; if (isFloatingPointZero(RHS)) switch (CC) { default: break; // SETUO etc aren't handled by fsel. 
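    // Each case below rearranges the operands so that the sign of a single
    // value decides the result: PPCISD::FSEL(a, t, f) yields t when a >= 0.0
    // and f otherwise, and a NaN condition selects f (one reason this path
    // is restricted to no-NaNs/no-infs math).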
case ISD::SETNE: std::swap(TV, FV); LLVM_FALLTHROUGH; case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt LLVM_FALLTHROUGH; case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); } SDValue Cmp; switch (CC) { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); LLVM_FALLTHROUGH; case ISD::SETEQ: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); return DAG.getNode(PPCISD::FSEL, dl, ResVT, DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); } return Op; } void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode( Op.getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIWZ : (Subtarget.hasFPCVT() ? 
PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, MVT::f64, Src); break; } // Convert the FP value to an int value through memory. bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. SDValue Chain; if (i32Stack) { MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); } else Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); // Result is a load from the stack slot. If loading 4 bytes, make sure to // add in a bias on big endian. if (Op.getValueType() == MVT::i32 && !i32Stack) { FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, DAG.getConstant(4, dl, FIPtr.getValueType())); MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); } RLI.Chain = Chain; RLI.Ptr = FIPtr; RLI.MPI = MPI; } /// Custom lowers floating point to integer conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); SDValue Src = Op.getOperand(0); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); SDValue Tmp; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode( Op.getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIWZ : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); break; case MVT::i64: assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, MVT::f64, Src); Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); break; } return Tmp; } SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { // FP to INT conversions are legal for f128. if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128)) return Op; // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). if (Op.getOperand(0).getValueType() == MVT::ppcf128) { if (Op.getValueType() == MVT::i32) { if (Op.getOpcode() == ISD::FP_TO_SINT) { SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(0), DAG.getIntPtrConstant(1, dl)); // Add the two halves of the long double in round-to-zero mode. SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); // Now use a smaller FP_TO_SINT. 
return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res); } if (Op.getOpcode() == ISD::FP_TO_UINT) { const uint64_t TwoE31[] = {0x41e0000000000000LL, 0}; APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31)); SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X // FIXME: generated code sucks. // TODO: Are there fast-math-flags to propagate to this FSUB? SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Op.getOperand(0), Tmp); True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, DAG.getConstant(0x80000000, dl, MVT::i32)); SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op.getOperand(0)); return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False, ISD::SETGE); } } return SDValue(); } if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) return LowerFP_TO_INTDirectMove(Op, DAG, dl); ReuseLoadInfo RLI; LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); } // We're trying to insert a regular store, S, and then a load, L. If the // incoming value, O, is a load, we might just be able to have our load use the // address used by O. However, we don't know if anything else will store to // that address before we can load from it. To prevent this situation, we need // to insert our load, L, into the chain as a peer of O. To do this, we give L // the same chain operand as O, we create a token factor from the chain results // of O and L, and we replace all uses of O's chain result with that token // factor (see spliceIntoChain below for this last part). bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, SelectionDAG &DAG, ISD::LoadExtType ET) const { SDLoc dl(Op); if (ET == ISD::NON_EXTLOAD && (Op.getOpcode() == ISD::FP_TO_UINT || Op.getOpcode() == ISD::FP_TO_SINT) && isOperationLegalOrCustom(Op.getOpcode(), Op.getOperand(0).getValueType())) { LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); return true; } LoadSDNode *LD = dyn_cast(Op); if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || LD->isNonTemporal()) return false; if (LD->getMemoryVT() != MemVT) return false; RLI.Ptr = LD->getBasePtr(); if (LD->isIndexed() && !LD->getOffset().isUndef()) { assert(LD->getAddressingMode() == ISD::PRE_INC && "Non-pre-inc AM on PPC?"); RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, LD->getOffset()); } RLI.Chain = LD->getChain(); RLI.MPI = LD->getPointerInfo(); RLI.IsDereferenceable = LD->isDereferenceable(); RLI.IsInvariant = LD->isInvariant(); RLI.Alignment = LD->getAlignment(); RLI.AAInfo = LD->getAAInfo(); RLI.Ranges = LD->getRanges(); RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); return true; } // Given the head of the old chain, ResChain, insert a token factor containing // it and NewResChain, and make users of ResChain now be users of that token // factor. // TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead. 
void PPCTargetLowering::spliceIntoChain(SDValue ResChain, SDValue NewResChain, SelectionDAG &DAG) const { if (!ResChain) return; SDLoc dl(NewResChain); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, NewResChain, DAG.getUNDEF(MVT::Other)); assert(TF.getNode() != NewResChain.getNode() && "A new TF really is required here"); DAG.ReplaceAllUsesOfValueWith(ResChain, TF); DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); } /// Analyze profitability of direct move /// prefer float load to int load plus direct move /// when there is no integer use of int load bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { SDNode *Origin = Op.getOperand(0).getNode(); if (Origin->getOpcode() != ISD::LOAD) return true; // If there is no LXSIBZX/LXSIHZX, like Power8, // prefer direct move if the memory size is 1 or 2 bytes. MachineMemOperand *MMO = cast(Origin)->getMemOperand(); if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2) return true; for (SDNode::use_iterator UI = Origin->use_begin(), UE = Origin->use_end(); UI != UE; ++UI) { // Only look at the users of the loaded value. if (UI.getUse().get().getResNo() != 0) continue; if (UI->getOpcode() != ISD::SINT_TO_FP && UI->getOpcode() != ISD::UINT_TO_FP) return true; } return false; } /// Custom lowers integer to floating point conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Invalid floating point type as target of conversion"); assert(Subtarget.hasFPCVT() && "Int to FP conversions with direct moves require FPCVT"); SDValue FP; SDValue Src = Op.getOperand(0); bool SinglePrec = Op.getValueType() == MVT::f32; bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); if (WordInt) { FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, dl, MVT::f64, Src); FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); } else { FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); } return FP; } SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Conversions to f128 are legal. if (EnableQuadPrecision && (Op.getValueType() == MVT::f128)) return Op; if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) return SDValue(); SDValue Value = Op.getOperand(0); // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); if (Op.getValueType() != MVT::v4f64) Value = DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(), Value, DAG.getIntPtrConstant(1, dl)); return Value; } // Don't handle ppc_fp128 here; let it be lowered to a libcall. 
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); if (Op.getOperand(0).getValueType() == MVT::i1) return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), DAG.getConstantFP(1.0, dl, Op.getValueType()), DAG.getConstantFP(0.0, dl, Op.getValueType())); // If we have direct moves, we can do all the conversion, skip the store/load // however, without FPCVT we can't do most conversions. if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && Subtarget.isPPC64() && Subtarget.hasFPCVT()) return LowerINT_TO_FPDirectMove(Op, DAG, dl); assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { SDValue SINT = Op.getOperand(0); // When converting to single-precision, we actually need to convert // to double-precision first and then round to single-precision. // To avoid double-rounding effects during that operation, we have // to prepare the input operand. Bits that might be truncated when // converting to double-precision are replaced by a bit that won't // be lost at this stage, but is below the single-precision rounding // position. // // However, if -enable-unsafe-fp-math is in effect, accept double // rounding to avoid the extra overhead. if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && !DAG.getTarget().Options.UnsafeFPMath) { // Twiddle input to make sure the low 11 bits are zero. (If this // is the case, we are guaranteed the value will fit into the 53 bit // mantissa of an IEEE double-precision value without rounding.) // If any of those low 11 bits were not zero originally, make sure // bit 12 (value 2048) is set instead, so that the final rounding // to single-precision gets the correct result. SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, SINT, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::ADD, dl, MVT::i64, Round, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); Round = DAG.getNode(ISD::AND, dl, MVT::i64, Round, DAG.getConstant(-2048, dl, MVT::i64)); // However, we cannot use that value unconditionally: if the magnitude // of the input value is small, the bit-twiddling we did above might // end up visibly changing the output. Fortunately, in that case, we // don't need to twiddle bits since the original input will convert // exactly to double-precision floating-point already. Therefore, // construct a conditional to use the original value if the top 11 // bits are all sign-bit copies, and use the rounded value computed // above otherwise. 
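      // (SINT >> 53) + 1 is 0 or 1 exactly when bits 63:53 are all copies of
      // the sign bit, i.e. when the value fits in 54 bits and therefore
      // converts to double exactly; only in that case is the original SINT
      // kept by the select below.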
SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, SINT, DAG.getConstant(53, dl, MVT::i32)); Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, Cond, DAG.getConstant(1, dl, MVT::i64)); Cond = DAG.getSetCC(dl, MVT::i32, Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); } ReuseLoadInfo RLI; SDValue Bits; MachineFunction &MF = DAG.getMachineFunction(); if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (Subtarget.hasLFIWAX() && canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (Subtarget.hasFPCVT() && canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); } else if (((Subtarget.hasLFIWAX() && SINT.getOpcode() == ISD::SIGN_EXTEND) || (Subtarget.hasFPCVT() && SINT.getOpcode() == ISD::ZERO_EXTEND)) && SINT.getOperand(0).getValueType() == MVT::i32) { MachineFrameInfo &MFI = MF.getFrameInfo(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIdx = MFI.CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); } else Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); return FP; } assert(Op.getOperand(0).getValueType() == MVT::i32 && "Unhandled INT_TO_FP type in custom expander!"); // Since we only generate this in 64-bit mode, we can take advantage of // 64-bit registers. In particular, sign extend the input value into the // 64-bit register with extsw, store the WHOLE 64-bit value into the stack // then lfd it and fcfid it. 
MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); EVT PtrVT = getPointerTy(MF.getDataLayout()); SDValue Ld; if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { ReuseLoadInfo RLI; bool ReusingLoad; if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, DAG))) { int FrameIdx = MFI.CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; } MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); if (ReusingLoad) spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); } else { assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); int FrameIdx = MFI.CreateStackObject(8, 8, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Op.getOperand(0)); // STD the extended value into the stack slot. SDValue Store = DAG.getStore( DAG.getEntryNode(), dl, Ext64, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); // Load the value as a double. Ld = DAG.getLoad( MVT::f64, dl, Store, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); } // FCFID it and return it. SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); return FP; } SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); /* The rounding mode is in bits 30:31 of FPSR, and has the following settings: 00 Round to nearest 01 Round to 0 10 Round to +inf 11 Round to -inf FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) */ MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); EVT PtrVT = getPointerTy(MF.getDataLayout()); // Save FP Control Word to register EVT NodeTys[] = { MVT::f64, // return register MVT::Glue // unused in this context }; SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); // Save FP register to stack slot int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, MachinePointerInfo()); // Load FP Control Word from low 32 bits of stack slot. 
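  // The bit manipulation below computes ((FPSCR & 3) ^ ((~FPSCR & 3) >> 1)),
  // mapping the RN field to FLT_ROUNDS values: 00 -> 1 (nearest),
  // 01 -> 0 (toward zero), 10 -> 2 (toward +inf), 11 -> 3 (toward -inf).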
SDValue Four = DAG.getConstant(4, dl, PtrVT); SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::AND, dl, MVT::i32, CWD, DAG.getConstant(3, dl, MVT::i32)); SDValue CWD2 = DAG.getNode(ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::AND, dl, MVT::i32, DAG.getNode(ISD::XOR, dl, MVT::i32, CWD, DAG.getConstant(3, dl, MVT::i32)), DAG.getConstant(3, dl, MVT::i32)), DAG.getConstant(1, dl, MVT::i32)); SDValue RetVal = DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); } SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); SDLoc dl(Op); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SHL!"); // Expand into a bunch of logical ops. Note that these ops // depend on the PPC behavior for oversized shift amounts. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRL!"); // Expand into a bunch of logical ops. Note that these ops // depend on the PPC behavior for oversized shift amounts. SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRA!"); // Expand into a bunch of logical ops, followed by a select_cc. 
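  // Concretely: OutHi = Hi >>s Amt, and OutLo is (Lo >>u Amt) |
  // (Hi << (BitWidth - Amt)) when Amt <= BitWidth, otherwise
  // Hi >>s (Amt - BitWidth); the select_cc on Amt - BitWidth picks between
  // the two, relying on the PPC oversized-shift behavior for the edge cases.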
SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Amt = Op.getOperand(2); EVT AmtVT = Amt.getValueType(); SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Amt); SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, DAG.getConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), Tmp4, Tmp6, ISD::SETLE); SDValue OutOps[] = { OutLo, OutHi }; return DAG.getMergeValues(OutOps, dl); } //===----------------------------------------------------------------------===// // Vector related lowering. // /// BuildSplatI - Build a canonical splati of Val with an element size of /// SplatSize. Cast the result to VT. static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); static const MVT VTys[] = { // canonical VT to use for each size. MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 }; EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. if (Val == -1) SplatSize = 1; EVT CanonicalVT = VTys[SplatSize-1]; // Build a canonical splat for this value. return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT)); } /// BuildIntrinsicOp - Return a unary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), Op); } /// BuildIntrinsicOp - Return a binary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = LHS.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); } /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the /// specified intrinsic ID. static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, SDValue Op2, SelectionDAG &DAG, const SDLoc &dl, EVT DestVT = MVT::Other) { if (DestVT == MVT::Other) DestVT = Op0.getValueType(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); } /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { // Force LHS/RHS to be the right type. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); int Ops[16]; for (unsigned i = 0; i != 16; ++i) Ops[i] = i + Amt; SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, T); } /// Do we have an efficient pattern in a .td file for this node? /// /// \param V - pointer to the BuildVectorSDNode being matched /// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves? 
/// /// There are some patterns where it is beneficial to keep a BUILD_VECTOR /// node as a BUILD_VECTOR node rather than expanding it. The patterns where /// the opposite is true (expansion is beneficial) are: /// - The node builds a vector out of integers that are not 32 or 64-bits /// - The node builds a vector out of constants /// - The node is a "load-and-splat" /// In all other cases, we will choose to keep the BUILD_VECTOR. static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, bool HasDirectMove, bool HasP8Vector) { EVT VecVT = V->getValueType(0); bool RightType = VecVT == MVT::v2f64 || (HasP8Vector && VecVT == MVT::v4f32) || (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); if (!RightType) return false; bool IsSplat = true; bool IsLoad = false; SDValue Op0 = V->getOperand(0); // This function is called in a block that confirms the node is not a constant // splat. So a constant BUILD_VECTOR here means the vector is built out of // different constants. if (V->isConstant()) return false; for (int i = 0, e = V->getNumOperands(); i < e; ++i) { if (V->getOperand(i).isUndef()) return false; // We want to expand nodes that represent load-and-splat even if the // loaded value is a floating point truncation or conversion to int. if (V->getOperand(i).getOpcode() == ISD::LOAD || (V->getOperand(i).getOpcode() == ISD::FP_ROUND && V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT && V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT && V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD)) IsLoad = true; // If the operands are different or the input is not a load and has more // uses than just this BV node, then it isn't a splat. if (V->getOperand(i) != Op0 || (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode()))) IsSplat = false; } return !(IsSplat && IsLoad); } // Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128. SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Op0 = Op->getOperand(0); if (!EnableQuadPrecision || (Op.getValueType() != MVT::f128 ) || (Op0.getOpcode() != ISD::BUILD_PAIR) || (Op0.getOperand(0).getValueType() != MVT::i64) || (Op0.getOperand(1).getValueType() != MVT::i64)) return SDValue(); return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0), Op0.getOperand(1)); } // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen // this case more efficiently than a constant pool load, lower it to the // sequence of ops that should be used. SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { // We first build an i32 vector, load it into a QPX register, // then convert it to a floating-point vector and compare it // to a zero vector to get the boolean result. 
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); assert(BVN->getNumOperands() == 4 && "BUILD_VECTOR for v4i1 does not have 4 operands"); bool IsConst = true; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) continue; if (!isa(BVN->getOperand(i))) { IsConst = false; break; } } if (IsConst) { Constant *One = ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); Constant *NegOne = ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); Constant *CV[4]; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); else if (isNullConstant(BVN->getOperand(i))) CV[i] = NegOne; else CV[i] = One; } Constant *CP = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), 16 /* alignment */); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); return DAG.getMemIntrinsicNode( PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SmallVector Stores; for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).isUndef()) continue; unsigned Offset = 4*i; SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); if (StoreSize > 4) { Stores.push_back( DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, PtrInfo.getWithOffset(Offset), MVT::i32)); } else { SDValue StoreValue = BVN->getOperand(i); if (StoreSize < 4) StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, PtrInfo.getWithOffset(Offset))); } } SDValue StoreChain; if (!Stores.empty()) StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); else StoreChain = DAG.getEntryNode(); // Now load from v4i32 into the QPX register; this will extend it to // v4i64 but not yet convert it to a floating point. Nevertheless, this // is typed as v4f64 because the QPX register integer states are not // explicitly represented. SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), FIdx}; SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, VTs, Ops, MVT::v4i32, PtrInfo); LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), LoadedVect); SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); } // All other QPX vectors are handled by generic code. if (Subtarget.hasQPX()) return SDValue(); // Check if this is a splat of a constant value. APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) { // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. 
if (Subtarget.hasVSX() && haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), Subtarget.hasP8Vector())) return Op; return SDValue(); } unsigned SplatBits = APSplatBits.getZExtValue(); unsigned SplatUndef = APSplatUndef.getZExtValue(); unsigned SplatSize = SplatBitSize / 8; // First, handle single instruction cases. // All zeros? if (SplatBits == 0) { // Canonicalize all zero vectors to be v4i32. if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { SDValue Z = DAG.getConstant(0, dl, MVT::v4i32); Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); } return Op; } // We have XXSPLTIB for constant splats one byte wide if (Subtarget.hasP9Vector() && SplatSize == 1) { // This is a splat of 1-byte elements with some elements potentially undef. // Rather than trying to match undef in the SDAG patterns, ensure that all // elements are the same constant. if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) { SmallVector Ops(16, DAG.getConstant(SplatBits, dl, MVT::i32)); SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops); if (Op.getValueType() != MVT::v16i8) return DAG.getBitcast(Op.getValueType(), NewBV); return NewBV; } // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll // detect that constant splats like v8i16: 0xABAB are really just splats // of a 1-byte constant. In this case, we need to convert the node to a // splat of v16i8 and a bitcast. if (Op.getValueType() != MVT::v16i8) return DAG.getBitcast(Op.getValueType(), DAG.getConstant(SplatBits, dl, MVT::v16i8)); return Op; } // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> (32-SplatBitSize)); if (SextVal >= -16 && SextVal <= 15) return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) // If this value is in the range [17,31] and is odd, use: // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) // If this value is in the range [-31,-17] and is odd, use: // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) // Note the last two are three-instruction sequences. if (SextVal >= -32 && SextVal <= 31) { // To avoid having these optimizations undone by constant folding, // we convert to a pseudo that will be expanded later into one of // the above forms. SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32); EVT VT = (SplatSize == 1 ? MVT::v16i8 : (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32); SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); if (VT == Op.getValueType()) return RetVal; else return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important // for fneg/fabs. if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { // Make -1 and vspltisw -1: SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); // Make the VSLW intrinsic, computing 0x8000_0000. SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, OnesV, DAG, dl); // xor by OnesV to invert it. Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // Check to see if this is a wide variety of vsplti*, binop self cases. 
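  // The "binop self" cases below rely on the AltiVec shift/rotate
  // instructions using only the low log2(element size) bits of each element
  // of the shift-amount vector, so combining vsplti(i) with itself
  // shifts/rotates i by (i modulo the element width in bits).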
static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 }; for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { // Indirect through the SplatCsts array so that we favor 'vsplti -1' for // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' int i = SplatCsts[idx]; // Figure out what shift amount will be used by altivec if shifted by i in // this splat size. unsigned TypeShiftAmt = i & (SplatBitSize-1); // vsplti + shl self. if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, Intrinsic::ppc_altivec_vslw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + srl self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, Intrinsic::ppc_altivec_vsrw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + sra self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, Intrinsic::ppc_altivec_vsraw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // vsplti + rol self. if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, Intrinsic::ppc_altivec_vrlw }; Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } // t = vsplti c, result = vsldoi t, t, 1 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } } return SDValue(); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VMRGHW, OP_VMRGLW, OP_VSPLTISW0, OP_VSPLTISW1, OP_VSPLTISW2, OP_VSPLTISW3, OP_VSLDOI4, OP_VSLDOI8, OP_VSLDOI12 }; if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); int ShufIdxs[16]; switch (OpNum) { default: llvm_unreachable("Unknown i32 permute!"); case OP_VMRGHW: ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; break; case OP_VMRGLW: ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; break; case OP_VSPLTISW0: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+0; break; case OP_VSPLTISW1: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+4; break; case OP_VSPLTISW2: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+8; break; case OP_VSPLTISW3: for (unsigned i = 0; i != 16; ++i) ShufIdxs[i] = (i&3)+12; break; case OP_VSLDOI4: return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); case OP_VSLDOI8: return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); case OP_VSLDOI12: return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); } EVT VT = OpLHS.getValueType(); OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs); return DAG.getNode(ISD::BITCAST, dl, VT, T); } /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled /// by the VINSERTB instruction introduced in ISA 3.0, else just return default /// SDValue. SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const { const unsigned BytesInVector = 16; bool IsLE = Subtarget.isLittleEndian(); SDLoc dl(N); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned ShiftElts = 0, InsertAtByte = 0; bool Swap = false; // Shifts required to get the byte we want at element 7. unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9}; unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8}; ArrayRef Mask = N->getMask(); int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; // For each mask element, find out if we're just inserting something // from V2 into V1 or vice versa. // Possible permutations inserting an element from V2 into V1: // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // ... 
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X // Inserting from V1 into V2 will be similar, except mask range will be // [16,31]. bool FoundCandidate = false; // If both vector operands for the shuffle are the same vector, the mask // will contain only elements from the first one and the second one will be // undef. unsigned VINSERTBSrcElem = IsLE ? 8 : 7; // Go through the mask of half-words to find an element that's being moved // from one vector to the other. for (unsigned i = 0; i < BytesInVector; ++i) { unsigned CurrentElement = Mask[i]; // If 2nd operand is undefined, we should only look for element 7 in the // Mask. if (V2.isUndef() && CurrentElement != VINSERTBSrcElem) continue; bool OtherElementsInOrder = true; // Examine the other elements in the Mask to see if they're in original // order. for (unsigned j = 0; j < BytesInVector; ++j) { if (j == i) continue; // If CurrentElement is from V1 [0,15], then we the rest of the Mask to be // from V2 [16,31] and vice versa. Unless the 2nd operand is undefined, // in which we always assume we're always picking from the 1st operand. int MaskOffset = (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0; if (Mask[j] != OriginalOrder[j] + MaskOffset) { OtherElementsInOrder = false; break; } } // If other elements are in original order, we record the number of shifts // we need to get the element we want into element 7. Also record which byte // in the vector we should insert into. if (OtherElementsInOrder) { // If 2nd operand is undefined, we assume no shifts and no swapping. if (V2.isUndef()) { ShiftElts = 0; Swap = false; } else { // Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4. ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF] : BigEndianShifts[CurrentElement & 0xF]; Swap = CurrentElement < BytesInVector; } InsertAtByte = IsLE ? BytesInVector - (i + 1) : i; FoundCandidate = true; break; } } if (!FoundCandidate) return SDValue(); // Candidate found, construct the proper SDAG sequence with VINSERTB, // optionally with VECSHL if shift is required. if (Swap) std::swap(V1, V2); if (V2.isUndef()) V2 = V1; if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, DAG.getConstant(ShiftElts, dl, MVT::i32)); return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); } return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); } /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled /// by the VINSERTH instruction introduced in ISA 3.0, else just return default /// SDValue. SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const { const unsigned NumHalfWords = 8; const unsigned BytesInVector = NumHalfWords * 2; // Check that the shuffle is on half-words. if (!isNByteElemShuffleMask(N, 2, 1)) return SDValue(); bool IsLE = Subtarget.isLittleEndian(); SDLoc dl(N); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned ShiftElts = 0, InsertAtByte = 0; bool Swap = false; // Shifts required to get the half-word we want at element 3. unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5}; unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4}; uint32_t Mask = 0; uint32_t OriginalOrderLow = 0x1234567; uint32_t OriginalOrderHigh = 0x89ABCDEF; // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a // 32-bit space, only need 4-bit nibbles per element. 
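  // For example, the identity half-word order 0,1,...,7 packs to 0x01234567
  // (OriginalOrderLow), and half-words taken from V2 (8..15) pack to
  // 0x89ABCDEF (OriginalOrderHigh).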
for (unsigned i = 0; i < NumHalfWords; ++i) { unsigned MaskShift = (NumHalfWords - 1 - i) * 4; Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift); } // For each mask element, find out if we're just inserting something // from V2 into V1 or vice versa. Possible permutations inserting an element // from V2 into V1: // X, 1, 2, 3, 4, 5, 6, 7 // 0, X, 2, 3, 4, 5, 6, 7 // 0, 1, X, 3, 4, 5, 6, 7 // 0, 1, 2, X, 4, 5, 6, 7 // 0, 1, 2, 3, X, 5, 6, 7 // 0, 1, 2, 3, 4, X, 6, 7 // 0, 1, 2, 3, 4, 5, X, 7 // 0, 1, 2, 3, 4, 5, 6, X // Inserting from V1 into V2 will be similar, except mask range will be [8,15]. bool FoundCandidate = false; // Go through the mask of half-words to find an element that's being moved // from one vector to the other. for (unsigned i = 0; i < NumHalfWords; ++i) { unsigned MaskShift = (NumHalfWords - 1 - i) * 4; uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF; uint32_t MaskOtherElts = ~(0xF << MaskShift); uint32_t TargetOrder = 0x0; // If both vector operands for the shuffle are the same vector, the mask // will contain only elements from the first one and the second one will be // undef. if (V2.isUndef()) { ShiftElts = 0; unsigned VINSERTHSrcElem = IsLE ? 4 : 3; TargetOrder = OriginalOrderLow; Swap = false; // Skip if not the correct element or mask of other elements don't equal // to our expected order. if (MaskOneElt == VINSERTHSrcElem && (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; FoundCandidate = true; break; } } else { // If both operands are defined. // Target order is [8,15] if the current mask is between [0,7]. TargetOrder = (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow; // Skip if mask of other elements don't equal our expected order. if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { // We only need the last 3 bits for the number of shifts. ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7] : BigEndianShifts[MaskOneElt & 0x7]; InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; Swap = MaskOneElt < NumHalfWords; FoundCandidate = true; break; } } } if (!FoundCandidate) return SDValue(); // Candidate found, construct the proper SDAG sequence with VINSERTH, // optionally with VECSHL if shift is required. if (Swap) std::swap(V1, V2); if (V2.isUndef()) V2 = V1; SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); if (ShiftElts) { // Double ShiftElts because we're left shifting on v16i8 type. SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, DAG.getConstant(2 * ShiftElts, dl, MVT::i32)); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl); SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this /// is a shuffle we can handle in a single instruction, return it. Otherwise, /// return the code it can be lowered into. Worst case, it can always be /// lowered into a vperm. 
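/// The implementation below tries the single-instruction VSX/ISA 3.0 forms
/// first (XXINSERTW, VINSERTH/VINSERTB, XXSLDWI, XXPERMDI, the XXBR*
/// reversals and word splats), then the QPX lowerings, then the fixed
/// Altivec shuffle immediates (left as VECTOR_SHUFFLE for the selector),
/// then the big-endian perfect-shuffle table, and finally falls back to
/// VPERM with a constant-pool permute mask.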
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast(Op); EVT VT = Op.getValueType(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ShiftElts, InsertAtByte; bool Swap = false; if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2); if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } if (Subtarget.hasP9Altivec()) { SDValue NewISDNode; if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) return NewISDNode; if ((NewISDNode = lowerToVINSERTB(SVOp, DAG))) return NewISDNode; } if (Subtarget.hasVSX() && PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); } if (Subtarget.hasVSX() && PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { if (Swap) std::swap(V1, V2); SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2); SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); } if (Subtarget.hasP9Vector()) { if (PPC::isXXBRHShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); } else if (PPC::isXXBRWShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); } else if (PPC::isXXBRDShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); } else if (PPC::isXXBRQShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); } } if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); // If the source for the shuffle is a scalar_to_vector that came from a // 32-bit load, it will have used LXVWSX so we don't need to splat again. 
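// (lxvwsx already replicates the loaded word into every lane of the
//  register, so when that pattern is matched below the shuffle can simply
//  return V1 unchanged instead of emitting another splat.)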
if (Subtarget.hasP9Vector() && ((isLittleEndian && SplatIdx == 3) || (!isLittleEndian && SplatIdx == 0))) { SDValue Src = V1.getOperand(0); if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR && Src.getOperand(0).getOpcode() == ISD::LOAD && Src.getOperand(0).hasOneUse()) return V1; } SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, DAG.getConstant(SplatIdx, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat); } // Left shifts of 8 bytes are actually swaps. Convert accordingly. if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); } } if (Subtarget.hasQPX()) { if (VT.getVectorNumElements() != 4) return SDValue(); if (V2.isUndef()) V2 = V1; int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); if (AlignIdx != -1) { return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, DAG.getConstant(AlignIdx, dl, MVT::i32)); } else if (SVOp->isSplat()) { int SplatIdx = SVOp->getSplatIndex(); if (SplatIdx >= 4) { std::swap(V1, V2); SplatIdx -= 4; } return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, DAG.getConstant(SplatIdx, dl, MVT::i32)); } // Lower this into a qvgpci/qvfperm pair. // Compute the qvgpci literal unsigned idx = 0; for (unsigned i = 0; i < 4; ++i) { int m = SVOp->getMaskElt(i); unsigned mm = m >= 0 ? (unsigned) m : i; idx |= mm << (3-i)*3; } SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, DAG.getConstant(idx, dl, MVT::i32)); return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); } // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. if (V2.isUndef()) { if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || (Subtarget.hasP8Altivec() && ( PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { return Op; } } // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. unsigned int ShuffleKind = isLittleEndian ? 
2 : 0; if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || (Subtarget.hasP8Altivec() && ( PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our // perfect shuffle table to emit an optimal matching sequence. ArrayRef PermMask = SVOp->getMask(); unsigned PFIndexes[4]; bool isFourElementShuffle = true; for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number unsigned EltNo = 8; // Start out undef. for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. if (PermMask[i*4+j] < 0) continue; // Undef, ignore it. unsigned ByteSource = PermMask[i*4+j]; if ((ByteSource & 3) != j) { isFourElementShuffle = false; break; } if (EltNo == 8) { EltNo = ByteSource/4; } else if (EltNo != ByteSource/4) { isFourElementShuffle = false; break; } } PFIndexes[i] = EltNo; } // If this shuffle can be expressed as a shuffle of 4-byte elements, use the // perfect shuffle vector to determine if it is cost effective to do this as // discrete instructions, or whether we should use a vperm. // For now, we skip this for little endian until such time as we have a // little-endian perfect shuffle table. if (isFourElementShuffle && !isLittleEndian) { // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); // Determining when to avoid vperm is tricky. Many things affect the cost // of vperm, particularly how many times the perm mask needs to be computed. // For example, if the perm mask can be hoisted out of a loop or is already // used (perhaps because there are multiple permutes with the same shuffle // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of // the loop requires an extra register. // // As a compromise, we only emit discrete instructions if the shuffle can be // generated in 3 or fewer operations. When we have loop information // available, if this block is within a loop, we should avoid using vperm // for 3-operation perms and use a constant pool load instead. if (Cost < 3) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant // vector that will get spilled to the constant pool. if (V2.isUndef()) V2 = V1; // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. // For little endian, the order of the input vectors is reversed, and // the permutation mask is complemented with respect to 31. This is // necessary to produce proper semantics with the big-endian-biased vperm // instruction. 
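// For example, a v4i32 shuffle element PermMask[i] == 5 (the second word of
// the second input) selects bytes 20..23 on big-endian targets; on little
// endian the loop below emits 31-20 .. 31-23, i.e. bytes 11..8, and the two
// inputs are passed to VPERM in swapped order (V2, V1).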
EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; SmallVector ResultMask; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), dl, MVT::i32)); else ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, MVT::i32)); } SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); if (isLittleEndian) return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V2, V1, VPermMask); else return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); } /// getVectorCompareInfo - Given an intrinsic, return false if it is not a /// vector comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; isDot = false; switch (IntrinsicID) { default: return false; // Comparison predicates. case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = true; break; case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = true; break; case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 199; isDot = true; } else return false; break; case Intrinsic::ppc_altivec_vcmpneb_p: case Intrinsic::ppc_altivec_vcmpneh_p: case Intrinsic::ppc_altivec_vcmpnew_p: case Intrinsic::ppc_altivec_vcmpnezb_p: case Intrinsic::ppc_altivec_vcmpnezh_p: case Intrinsic::ppc_altivec_vcmpnezw_p: if (Subtarget.hasP9Altivec()) { switch (IntrinsicID) { default: llvm_unreachable("Unknown comparison intrinsic."); case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break; case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break; case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break; case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break; case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break; case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break; } isDot = true; } else return false; break; case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 967; isDot = true; } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = true; break; case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 711; isDot = true; } else return false; break; // VSX predicate comparisons use the same infrastructure case Intrinsic::ppc_vsx_xvcmpeqdp_p: case Intrinsic::ppc_vsx_xvcmpgedp_p: case 
Intrinsic::ppc_vsx_xvcmpgtdp_p: case Intrinsic::ppc_vsx_xvcmpeqsp_p: case Intrinsic::ppc_vsx_xvcmpgesp_p: case Intrinsic::ppc_vsx_xvcmpgtsp_p: if (Subtarget.hasVSX()) { switch (IntrinsicID) { case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; } isDot = true; } else return false; break; // Normal Comparisons. case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; break; case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; break; case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; break; case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; break; case Intrinsic::ppc_altivec_vcmpequd: if (Subtarget.hasP8Altivec()) CompareOpc = 199; else return false; break; case Intrinsic::ppc_altivec_vcmpneb: case Intrinsic::ppc_altivec_vcmpneh: case Intrinsic::ppc_altivec_vcmpnew: case Intrinsic::ppc_altivec_vcmpnezb: case Intrinsic::ppc_altivec_vcmpnezh: case Intrinsic::ppc_altivec_vcmpnezw: if (Subtarget.hasP9Altivec()) switch (IntrinsicID) { default: llvm_unreachable("Unknown comparison intrinsic."); case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break; case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break; case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break; case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break; case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break; case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break; } else return false; break; case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; break; case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; break; case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; break; case Intrinsic::ppc_altivec_vcmpgtsd: if (Subtarget.hasP8Altivec()) CompareOpc = 967; else return false; break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; break; case Intrinsic::ppc_altivec_vcmpgtud: if (Subtarget.hasP8Altivec()) CompareOpc = 711; else return false; break; } return true; } /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom /// lower, do it, otherwise return null. SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. if (Subtarget.isPPC64()) return DAG.getRegister(PPC::X13, MVT::i64); return DAG.getRegister(PPC::R2, MVT::i32); } // We are looking for absolute values here. 
// The idea is to try to fit one of two patterns: // max (a, (0-a)) OR max ((0-a), a) if (Subtarget.hasP9Vector() && (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { SDValue V1 = Op.getOperand(1); SDValue V2 = Op.getOperand(2); if (V1.getSimpleValueType() == V2.getSimpleValueType() && (V1.getSimpleValueType() == MVT::v4i32 || V1.getSimpleValueType() == MVT::v8i16 || V1.getSimpleValueType() == MVT::v16i8)) { if ( V1.getOpcode() == ISD::SUB && ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && V1.getOperand(1) == V2 ) { // Generate the abs instruction with the operands return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); } if ( V2.getOpcode() == ISD::SUB && ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && V2.getOperand(1) == V1 ) { // Generate the abs instruction with the operands return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); } } } // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. int CompareOpc; bool isDot; if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. if (!isDot) { SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), Op.getOperand(1), Op.getOperand(2), DAG.getConstant(CompareOpc, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); } // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { Op.getOperand(2), // LHS Op.getOperand(3), // RHS DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Now that we have the comparison, emit a copy from the CR to a GPR. // This is flagged to the above dot comparison. SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, DAG.getRegister(PPC::CR6, MVT::i32), CompNode.getValue(1)); // Unpack the result based on how the target uses it. unsigned BitNo; // Bit # of CR6. bool InvertBit; // Invert result? switch (cast(Op.getOperand(1))->getZExtValue()) { default: // Can't happen, don't crash on invalid number though. case 0: // Return the value of the EQ bit of CR6. BitNo = 0; InvertBit = false; break; case 1: // Return the inverted value of the EQ bit of CR6. BitNo = 0; InvertBit = true; break; case 2: // Return the value of the LT bit of CR6. BitNo = 2; InvertBit = false; break; case 3: // Return the inverted value of the LT bit of CR6. BitNo = 2; InvertBit = true; break; } // Shift the bit into the low position. Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32)); // Isolate the bit. Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, DAG.getConstant(1, dl, MVT::i32)); // If we are supposed to, toggle the bit. if (InvertBit) Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags, DAG.getConstant(1, dl, MVT::i32)); return Flags; } SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to // the beginning of the argument list. int ArgStart = isa(Op.getOperand(0)) ? 
0 : 1; SDLoc DL(Op); switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { case Intrinsic::ppc_cfence: { assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op.getOperand(ArgStart + 1)), Op.getOperand(0)), 0); } default: break; } return SDValue(); } SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { // Check for a DIV with the same operands as this REM. for (auto UI : Op.getOperand(1)->uses()) { if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) || (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV)) if (UI->getOperand(0) == Op.getOperand(0) && UI->getOperand(1) == Op.getOperand(1)) return SDValue(); } return Op; } // Lower scalar BSWAP64 to xxbrd. SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // MTVSRDD Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), Op.getOperand(0)); // XXBRD Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); // MFVSRD int VectorIndex = 0; if (Subtarget.isLittleEndian()) VectorIndex = 1; Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, DAG.getTargetConstant(VectorIndex, dl, MVT::i32)); return Op; } // ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be // compared to a value that is atomically loaded (atomic loads zero-extend). SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP && "Expecting an atomic compare-and-swap here."); SDLoc dl(Op); auto *AtomicNode = cast<AtomicSDNode>(Op.getNode()); EVT MemVT = AtomicNode->getMemoryVT(); if (MemVT.getSizeInBits() >= 32) return Op; SDValue CmpOp = Op.getOperand(2); // If this is already correctly zero-extended, leave it alone. auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits()); if (DAG.MaskedValueIsZero(CmpOp, HighBits)) return Op; // Clear the high bits of the compare operand. unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1; SDValue NewCmpOp = DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp, DAG.getConstant(MaskVal, dl, MVT::i32)); // Replace the existing compare operand with the properly zero-extended one. SmallVector<SDValue, 4> Ops; for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++) Ops.push_back(AtomicNode->getOperand(i)); Ops[2] = NewCmpOp; MachineMemOperand *MMO = AtomicNode->getMemOperand(); SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other); auto NodeTy = (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16; return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO); } SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int // instructions), but for smaller types, we need to first extend up to v2i32 // before going any farther.
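// For example, a SIGN_EXTEND_INREG of v2i64 from v2i8 becomes: bitcast to
// v4i32, SIGN_EXTEND_INREG from v4i8 (sign extending the low 8 bits of each
// word), bitcast back to v2i64, then SIGN_EXTEND_INREG from v2i32.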
if (Op.getValueType() == MVT::v2i64) { EVT ExtVT = cast(Op.getOperand(1))->getVT(); if (ExtVT != MVT::v2i32) { Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)); Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op, DAG.getValueType(EVT::getVectorVT(*DAG.getContext(), ExtVT.getVectorElementType(), 4))); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op); Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op, DAG.getValueType(MVT::v2i32)); } return Op; } return SDValue(); } SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); // Store the input value into Value#0 of the stack slot. SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, MachinePointerInfo()); // Load it out. return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Should only be called for ISD::INSERT_VECTOR_ELT"); ConstantSDNode *C = dyn_cast(Op.getOperand(2)); // We have legal lowering for constant indices but not for variable ones. if (!C) return SDValue(); EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. if (VT == MVT::v8i16 || VT == MVT::v16i8) { SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8; unsigned InsertAtElement = C->getZExtValue(); unsigned InsertAtByte = InsertAtElement * BytesInEachElement; if (Subtarget.isLittleEndian()) { InsertAtByte = (16 - BytesInEachElement) - InsertAtByte; } return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz, DAG.getConstant(InsertAtByte, dl, MVT::i32)); } return Op; } SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDNode *N = Op.getNode(); assert(N->getOperand(0).getValueType() == MVT::v4i1 && "Unknown extract_vector_elt type"); SDValue Value = N->getOperand(0); // The first part of this is like the store lowering except that we don't // need to track the chain. // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to // understand how to form the extending load. SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
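// (qvfctiwu converts each lane to an unsigned 32-bit integer while the value
//  is still in a QPX register; the qvstfiw store and the scalar load below
//  then move the requested lane to a GPR through a 16-byte stack slot.)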
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), Value); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue StoreChain = DAG.getEntryNode(); SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), Value, FIdx}; SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, VTs, Ops, MVT::v4i32, PtrInfo); // Extract the value requested. unsigned Offset = 4*cast(N->getOperand(1))->getZExtValue(); SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); SDValue IntVal = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); if (!Subtarget.useCRBits()) return IntVal; return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); } /// Lowering for QPX v4i1 loads SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); LoadSDNode *LN = cast(Op.getNode()); SDValue LoadChain = LN->getChain(); SDValue BasePtr = LN->getBasePtr(); if (Op.getValueType() == MVT::v4f64 || Op.getValueType() == MVT::v4f32) { EVT MemVT = LN->getMemoryVT(); unsigned Alignment = LN->getAlignment(); // If this load is properly aligned, then it is legal. if (Alignment >= MemVT.getStoreSize()) return Op; EVT ScalarVT = Op.getValueType().getScalarType(), ScalarMemVT = MemVT.getScalarType(); unsigned Stride = ScalarMemVT.getStoreSize(); SDValue Vals[4], LoadChains[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { SDValue Load; if (ScalarVT != ScalarMemVT) Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, BasePtr, LN->getPointerInfo().getWithOffset(Idx * Stride), ScalarMemVT, MinAlign(Alignment, Idx * Stride), LN->getMemOperand()->getFlags(), LN->getAAInfo()); else Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, LN->getPointerInfo().getWithOffset(Idx * Stride), MinAlign(Alignment, Idx * Stride), LN->getMemOperand()->getFlags(), LN->getAAInfo()); if (Idx == 0 && LN->isIndexed()) { assert(LN->getAddressingMode() == ISD::PRE_INC && "Unknown addressing mode on vector load"); Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), LN->getAddressingMode()); } Vals[Idx] = Load; LoadChains[Idx] = Load.getValue(1); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, dl, BasePtr.getValueType())); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); if (LN->isIndexed()) { SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; return DAG.getMergeValues(RetOps, dl); } SDValue RetOps[] = { Value, TF }; return DAG.getMergeValues(RetOps, dl); } assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); // To lower v4i1 from a byte array, we load the byte elements of the // vector and then reuse the BUILD_VECTOR logic. 
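// Each of the four bytes is any-extended to i32 with a 1-byte-aligned
// EXTLOAD at offsets 0..3; the four load chains are merged with a
// TokenFactor and the loaded values feed a v4i1 BUILD_VECTOR.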
SDValue VectElmts[4], VectElmtChains[4]; for (unsigned i = 0; i < 4; ++i) { SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); VectElmts[i] = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, LN->getPointerInfo().getWithOffset(i), MVT::i8, /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); VectElmtChains[i] = VectElmts[i].getValue(1); } LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); SDValue RVals[] = { Value, LoadChain }; return DAG.getMergeValues(RVals, dl); } /// Lowering for QPX v4i1 stores SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); StoreSDNode *SN = cast(Op.getNode()); SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); if (Value.getValueType() == MVT::v4f64 || Value.getValueType() == MVT::v4f32) { EVT MemVT = SN->getMemoryVT(); unsigned Alignment = SN->getAlignment(); // If this store is properly aligned, then it is legal. if (Alignment >= MemVT.getStoreSize()) return Op; EVT ScalarVT = Value.getValueType().getScalarType(), ScalarMemVT = MemVT.getScalarType(); unsigned Stride = ScalarMemVT.getStoreSize(); SDValue Stores[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { SDValue Ex = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); SDValue Store; if (ScalarVT != ScalarMemVT) Store = DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, SN->getPointerInfo().getWithOffset(Idx * Stride), ScalarMemVT, MinAlign(Alignment, Idx * Stride), SN->getMemOperand()->getFlags(), SN->getAAInfo()); else Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, SN->getPointerInfo().getWithOffset(Idx * Stride), MinAlign(Alignment, Idx * Stride), SN->getMemOperand()->getFlags(), SN->getAAInfo()); if (Idx == 0 && SN->isIndexed()) { assert(SN->getAddressingMode() == ISD::PRE_INC && "Unknown addressing mode on vector store"); Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), SN->getAddressingMode()); } BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, dl, BasePtr.getValueType())); Stores[Idx] = Store; } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); if (SN->isIndexed()) { SDValue RetOps[] = { TF, Stores[0].getValue(1) }; return DAG.getMergeValues(RetOps, dl); } return TF; } assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); // The values are now known to be -1 (false) or 1 (true). To convert this // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to // understand how to form the extending load. SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
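// (This is the same qvfctiwu plus 16-byte stack slot sequence used by
//  LowerEXTRACT_VECTOR_ELT above, except that all four converted words are
//  reloaded and then truncated to single bytes of the destination.)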
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), Value); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ops[] = {StoreChain, DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), Value, FIdx}; SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, VTs, Ops, MVT::v4i32, PtrInfo); // Move data into the byte array. SDValue Loads[4], LoadChains[4]; for (unsigned i = 0; i < 4; ++i) { unsigned Offset = 4*i; SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); LoadChains[i] = Loads[i].getValue(1); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Stores[4]; for (unsigned i = 0; i < 4; ++i) { SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); Stores[i] = DAG.getTruncStore( StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), SN->getAAInfo()); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); return StoreChain; } SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. SDValue RHSSwap = // = vrlw RHS, 16 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); // Shrinkify inputs to v8i16. LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS); RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS); RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap); // Low parts multiplied together, generating 32-bit results (we ignore the // top parts). SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, LHS, RHS, DAG, dl, MVT::v4i32); SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); // Shift the high parts up 16 bits. HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG, dl); return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); } else if (Op.getValueType() == MVT::v8i16) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); bool isLittleEndian = Subtarget.isLittleEndian(); // Multiply the even 8-bit parts, producing 16-bit sums. SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, LHS, RHS, DAG, dl, MVT::v8i16); EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts); // Multiply the odd 8-bit parts, producing 16-bit sums. 
SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, LHS, RHS, DAG, dl, MVT::v8i16); OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); // Merge the results together. Because vmuleub and vmuloub are // instructions with a big-endian bias, we must reverse the // element numbering and reverse the meaning of "odd" and "even" // when generating little endian code. int Ops[16]; for (unsigned i = 0; i != 8; ++i) { if (isLittleEndian) { Ops[i*2 ] = 2*i; Ops[i*2+1] = 2*i+16; } else { Ops[i*2 ] = 2*i+1; Ops[i*2+1] = 2*i+1+16; } } if (isLittleEndian) return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); else return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); } else { llvm_unreachable("Unknown mul to lower!"); } } /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Wasn't expecting to be able to lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); // Variable argument lowering. case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); // Exception handling lowering. case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op)); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); // Lower 64-bit shifts. case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); // Vector-related lowering. case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); // Frame & Return address. 
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::SREM: case ISD::UREM: return LowerREM(Op, DAG); case ISD::BSWAP: return LowerBSWAP(Op, DAG); case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); } } void PPCTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { SDLoc dl(N); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::READCYCLECOUNTER: { SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); Results.push_back(RTB); Results.push_back(RTB.getValue(1)); Results.push_back(RTB.getValue(2)); break; } case ISD::INTRINSIC_W_CHAIN: { if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != Intrinsic::ppc_is_decremented_ctr_nonzero) break; assert(N->getValueType(0) == MVT::i1 && "Unexpected result type for CTR decrement intrinsic"); EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), N->getOperand(1)); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt)); Results.push_back(NewInt.getValue(1)); break; } case ISD::VAARG: { if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) return; EVT VT = N->getValueType(0); if (VT == MVT::i64) { SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); Results.push_back(NewNode); Results.push_back(NewNode.getValue(1)); } return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: // LowerFP_TO_INT() can only handle f32 and f64. if (N->getOperand(0).getValueType() == MVT::ppcf128) return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; } } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *Func = Intrinsic::getDeclaration(M, Id); return Builder.CreateCall(Func, {}); } // The mappings for emitLeading/TrailingFence are taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); return nullptr; } Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) return Builder.CreateCall( Intrinsic::getDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), Intrinsic::ppc_cfence, {Inst->getType()}), {Inst}); // FIXME: Can use isync for rmw operation.
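// (All other acquire-or-stronger cases, such as loads on 32-bit targets and
//  the results of read-modify-write operations, fall through to the plain
//  lwsync below.)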
return callIntrinsic(Builder, Intrinsic::ppc_lwsync); } return nullptr; } MachineBasicBlock * PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, unsigned AtomicSize, unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); auto LoadMnemonic = PPC::LDARX; auto StoreMnemonic = PPC::STDCX; switch (AtomicSize) { default: llvm_unreachable("Unexpected size of atomic entity"); case 1: LoadMnemonic = PPC::LBARX; StoreMnemonic = PPC::STBCX; assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); break; case 2: LoadMnemonic = PPC::LHARX; StoreMnemonic = PPC::STHCX; assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); break; case 4: LoadMnemonic = PPC::LWARX; StoreMnemonic = PPC::STWCX; break; case 8: LoadMnemonic = PPC::LDARX; StoreMnemonic = PPC::STDCX; break; } const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); if (CmpOpcode) F->insert(It, loop2MBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned TmpReg = (!BinOpcode) ? incr : RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loopMBB); // loopMBB: // l[wd]arx dest, ptr // add r0, dest, incr // st[wd]cx. r0, ptr // bne- loopMBB // fallthrough --> exitMBB // For max/min... // loopMBB: // l[wd]arx dest, ptr // cmpl?[wd] incr, dest // bgt exitMBB // loop2MBB: // st[wd]cx. dest, ptr // bne- loopMBB // fallthrough --> exitMBB BB = loopMBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest) .addReg(ptrA).addReg(ptrB); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); if (CmpOpcode) { // Signed comparisons of byte or halfword values must be sign-extended. if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), ExtReg).addReg(dest); BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) .addReg(incr).addReg(ExtReg); } else BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) .addReg(incr).addReg(dest); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); BB = loop2MBB; } BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... 
BB = exitMBB; return BB; } MachineBasicBlock * PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, bool is8bit, // operation unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { // If we support part-word atomic mnemonics, just use them if (Subtarget.hasPartwordAtomics()) return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode, CmpPred); // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. bool is64bit = Subtarget.isPPC64(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loopMBB); if (CmpOpcode) F->insert(It, loop2MBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned PtrReg = RegInfo.createVirtualRegister(RC); unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); unsigned ShiftReg = isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); unsigned MaskReg = RegInfo.createVirtualRegister(RC); unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loopMBB); // The 4-byte load must be aligned, while a char or short may be // anywhere in the word. Hence all this nasty bookkeeping code. // add ptr1, ptrA, ptrB [copy if ptrA==0] // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] // xori shift, shift1, 24 [16] // rlwinm ptr, ptr1, 0, 0, 29 // slw incr2, incr, shift // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] // slw mask, mask2, shift // loopMBB: // lwarx tmpDest, ptr // add tmp, tmpDest, incr2 // andc tmp2, tmpDest, mask // and tmp3, tmp, mask // or tmp4, tmp3, tmp2 // stwcx. tmp4, ptr // bne- loopMBB // fallthrough --> exitMBB // srw dest, tmpDest, shift if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); } else { Ptr1Reg = ptrB; } BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) .addImm(3).addImm(27).addImm(is8bit ? 
28 : 27); if (!isLittleEndian) BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) .addReg(incr).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) .addReg(Mask2Reg).addReg(ShiftReg); BB = loopMBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) .addReg(Incr2Reg).addReg(TmpDestReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) .addReg(TmpReg).addReg(MaskReg); if (CmpOpcode) { // For unsigned comparisons, we can directly compare the shifted values. // For signed comparisons we shift and sign extend. unsigned SReg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg) .addReg(TmpDestReg).addReg(MaskReg); unsigned ValueReg = SReg; unsigned CmpReg = Incr2Reg; if (CmpOpcode == PPC::CMPW) { ValueReg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) .addReg(SReg).addReg(ShiftReg); unsigned ValueSReg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg) .addReg(ValueReg); ValueReg = ValueSReg; CmpReg = incr; } BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) .addReg(CmpReg).addReg(ValueReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); BB = loop2MBB; } BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) .addReg(Tmp3Reg).addReg(Tmp2Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)) .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // exitMBB: // ... 
BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) .addReg(ShiftReg); return BB; } llvm::MachineBasicBlock * PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // SjLjSetup mainMBB // bl mainMBB // v_restore = 1 // b sinkMBB // // mainMBB: // buf[LabelOffset] = LR // v_main = 0 // // sinkMBB: // v = phi(main, restore) // MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // Note that the structure of the jmp_buf used here is not compatible // with that used by libc, and is not designed to be. Specifically, it // stores only those 'reserved' registers that LLVM does not otherwise // understand how to spill. Also, by convention, by the time this // intrinsic is called, Clang has already stored the frame address in the // first slot of the buffer and stack address in the third. Following the // X86 target code, we'll store the jump address in the second slot. We also // need to save the TOC pointer (R2) to handle jumps between shared // libraries, and that will be stored in the fourth slot. The thread // identifier (R13) is not affected. // thisMBB: const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); // Prepare IP either in reg. const TargetRegisterClass *PtrRC = getRegClassFor(PVT); unsigned LabelReg = MRI.createVirtualRegister(PtrRC); unsigned BufReg = MI.getOperand(1).getReg(); if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); } // Naked functions never have a base pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned BaseReg; if (MF->getFunction().hasFnAttribute(Attribute::Naked)) BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; MIB = BuildMI(*thisMBB, MI, DL, TII->get(Subtarget.isPPC64() ? 
PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) .addMBB(mainMBB); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); // mainMBB: // mainDstReg = 0 MIB = BuildMI(mainMBB, DL, TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP if (Subtarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) .addReg(LabelReg) .addImm(LabelOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(PPC::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(thisMBB); MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; unsigned BP = (PVT == MVT::i64) ? PPC::X30 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29 : PPC::R30); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); unsigned BufReg = MI.getOperand(0).getReg(); // Reload FP (the jumped-to function may not have had a // frame pointer, and if so, then its r31 will be restored // as necessary). 
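  // The reloads below mirror the jmp_buf layout established on the setjmp
  // side: slot 0 holds the frame address and slot 2 (SPOffset) the stack
  // address, both written by the front end; slot 1 (LabelOffset) holds the
  // jump address, slot 3 (TOCOffset) the TOC pointer (r2), and slot 4
  // (BPOffset) the base pointer, all written by emitEHSjLjSetJmp above.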
if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) .addImm(0) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) .addImm(0) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) .addImm(LabelOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) .addImm(LabelOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) .addImm(SPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) .addImm(SPOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload BP if (PVT == MVT::i64) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) .addImm(BPOffset) .addReg(BufReg); } else { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) .addImm(BPOffset) .addReg(BufReg); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); MIB.setMemRefs(MMOBegin, MMOEnd); } // Jump BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); MI.eraseFromParent(); return MBB; } MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { if (MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && MI.getOpcode() == TargetOpcode::PATCHPOINT) { // Call lowering should have added an r2 operand to indicate a dependence // on the TOC base pointer value. It can't however, because there is no // way to mark the dependence as implicit there, and so the stackmap code // will confuse it with a regular operand. Instead, add the dependence // here. setUsesTOCBasePtr(*BB->getParent()); MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); } return emitPatchPoint(MI, BB); } if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { return emitEHSjLjSetJmp(MI, BB); } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { return emitEHSjLjLongJmp(MI, BB); } const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // To "insert" these instructions we actually have to insert their // control-flow patterns. 
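  // Each pseudo handled below is expanded by hand-building its control flow:
  // we create fresh blocks with CreateMachineBasicBlock, splice everything
  // after MI into a sink/exit block, let transferSuccessorsAndUpdatePHIs fix
  // up the CFG and PHIs, and then wire the new blocks' successors explicitly.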
const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { SmallVector Cond; if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8) Cond.push_back(MI.getOperand(4)); else Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); Cond.push_back(MI.getOperand(1)); DebugLoc dl = MI.getDebugLoc(); TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || MI.getOpcode() == PPC::SELECT_CC_QFRC || MI.getOpcode() == PPC::SELECT_CC_QSRC || MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || MI.getOpcode() == PPC::SELECT_CC_VSRC || MI.getOpcode() == PPC::SELECT_CC_SPE4 || MI.getOpcode() == PPC::SELECT_CC_SPE || MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_QFRC || MI.getOpcode() == PPC::SELECT_QSRC || MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || MI.getOpcode() == PPC::SELECT_VSRC) { // The incoming instruction knows the destination vreg to set, the // condition code register to branch on, the true/false values to // select between, and a branch opcode to use. // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); DebugLoc dl = MI.getDebugLoc(); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Next, add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_QFRC || MI.getOpcode() == PPC::SELECT_QSRC || MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || MI.getOpcode() == PPC::SELECT_VSRC) { BuildMI(BB, dl, TII->get(PPC::BC)) .addReg(MI.getOperand(1).getReg()) .addMBB(sinkMBB); } else { unsigned SelectPred = MI.getOperand(4).getImm(); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(SelectPred) .addReg(MI.getOperand(1).getReg()) .addMBB(sinkMBB); } // copy0MBB: // %FalseValue = ... 
// # fallthrough to sinkMBB BB = copy0MBB; // Update machine-CFG edges BB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) .addReg(MI.getOperand(3).getReg()) .addMBB(copy0MBB) .addReg(MI.getOperand(2).getReg()) .addMBB(thisMBB); } else if (MI.getOpcode() == PPC::ReadTB) { // To read the 64-bit time-base register on a 32-bit target, we read the // two halves. Should the counter have wrapped while it was being read, we // need to try again. // ... // readLoop: // mfspr Rx,TBU # load from TBU // mfspr Ry,TB # load from TB // mfspr Rz,TBU # load from TBU // cmpw crX,Rx,Rz # check if 'old'='new' // bne readLoop # branch if they're not equal // ... MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); DebugLoc dl = MI.getDebugLoc(); F->insert(It, readMBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(readMBB); BB = readMBB; MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); unsigned LoReg = MI.getOperand(0).getReg(); unsigned HiReg = MI.getOperand(1).getReg(); BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) .addReg(HiReg).addReg(ReadAgainReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); BB->addSuccessor(readMBB); BB->addSuccessor(sinkMBB); } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) BB = 
EmitAtomicBinary(MI, BB, 8, PPC::XOR8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) BB = EmitPartwordAtomicBinary(MI, BB, false, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) BB = EmitAtomicBinary(MI, BB, 4, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) BB = EmitAtomicBinary(MI, BB, 8, 0); else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || (Subtarget.hasPartwordAtomics() && MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || (Subtarget.hasPartwordAtomics() && MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; auto LoadMnemonic = PPC::LDARX; auto 
StoreMnemonic = PPC::STDCX; switch (MI.getOpcode()) { default: llvm_unreachable("Compare and swap of unknown size"); case PPC::ATOMIC_CMP_SWAP_I8: LoadMnemonic = PPC::LBARX; StoreMnemonic = PPC::STBCX; assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); break; case PPC::ATOMIC_CMP_SWAP_I16: LoadMnemonic = PPC::LHARX; StoreMnemonic = PPC::STHCX; assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); break; case PPC::ATOMIC_CMP_SWAP_I32: LoadMnemonic = PPC::LWARX; StoreMnemonic = PPC::STWCX; break; case PPC::ATOMIC_CMP_SWAP_I64: LoadMnemonic = PPC::LDARX; StoreMnemonic = PPC::STDCX; break; } unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned oldval = MI.getOperand(3).getReg(); unsigned newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loop1MBB); // loop1MBB: // l[bhwd]arx dest, ptr // cmp[wd] dest, oldval // bne- midMBB // loop2MBB: // st[bhwd]cx. newval, ptr // bne- loopMBB // b exitBB // midMBB: // st[bhwd]cx. dest, ptr // exitBB: BB = loop1MBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest) .addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) .addReg(oldval).addReg(dest); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(newval).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(dest).addReg(ptrA).addReg(ptrB); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { // We must use 64-bit registers for addresses when targeting 64-bit, // since we're actually doing arithmetic on them. Other registers // can be 32-bit. 
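  // This path handles i8/i16 compare-and-swap when the subtarget has no
  // lbarx/lharx (hasPartwordAtomics() is false): the exchange is emulated on
  // the containing aligned word with lwarx/stwcx., shifting and masking the
  // byte or halfword into position, as the pseudo-code comment below shows.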
bool is64bit = Subtarget.isPPC64(); bool isLittleEndian = Subtarget.isLittleEndian(); bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; unsigned dest = MI.getOperand(0).getReg(); unsigned ptrA = MI.getOperand(1).getReg(); unsigned ptrB = MI.getOperand(2).getReg(); unsigned oldval = MI.getOperand(3).getReg(); unsigned newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; unsigned PtrReg = RegInfo.createVirtualRegister(RC); unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); unsigned ShiftReg = isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); unsigned MaskReg = RegInfo.createVirtualRegister(RC); unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = RegInfo.createVirtualRegister(RC); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; // thisMBB: // ... // fallthrough --> loopMBB BB->addSuccessor(loop1MBB); // The 4-byte load must be aligned, while a char or short may be // anywhere in the word. Hence all this nasty bookkeeping code. // add ptr1, ptrA, ptrB [copy if ptrA==0] // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] // xori shift, shift1, 24 [16] // rlwinm ptr, ptr1, 0, 0, 29 // slw newval2, newval, shift // slw oldval2, oldval,shift // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] // slw mask, mask2, shift // and newval3, newval2, mask // and oldval3, oldval2, mask // loop1MBB: // lwarx tmpDest, ptr // and tmp, tmpDest, mask // cmpw tmp, oldval3 // bne- midMBB // loop2MBB: // andc tmp2, tmpDest, mask // or tmp4, tmp2, newval3 // stwcx. tmp4, ptr // bne- loop1MBB // b exitBB // midMBB: // stwcx. tmpDest, ptr // exitBB: // srw dest, tmpDest, shift if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); } else { Ptr1Reg = ptrB; } BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); if (!isLittleEndian) BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) .addReg(Shift1Reg).addImm(is8bit ? 
24 : 16); if (is64bit) BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(61); else BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) .addReg(newval).addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) .addReg(oldval).addReg(ShiftReg); if (is8bit) BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); else { BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) .addReg(Mask3Reg).addImm(65535); } BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) .addReg(Mask2Reg).addReg(ShiftReg); BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) .addReg(NewVal2Reg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) .addReg(OldVal2Reg).addReg(MaskReg); BB = loop1MBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) .addReg(TmpReg).addReg(OldVal3Reg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(midMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) .addReg(Tmp2Reg).addReg(NewVal3Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); BB = midMBB; BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) .addReg(ZeroReg).addReg(PtrReg); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) .addReg(ShiftReg); } else if (MI.getOpcode() == PPC::FADDrtz) { // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR // is not modeled at the SelectionDAG level. unsigned Dest = MI.getOperand(0).getReg(); unsigned Src1 = MI.getOperand(1).getReg(); unsigned Src2 = MI.getOperand(2).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); // Set rounding mode to round-to-zero. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); // Perform addition. BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); // Restore FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || MI.getOpcode() == PPC::ANDIo_1_GT_BIT || MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) ? PPC::ANDIo8 : PPC::ANDIo; bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? 
&PPC::GPRCRegClass : &PPC::G8RCRegClass); DebugLoc dl = MI.getDebugLoc(); BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) .addReg(MI.getOperand(1).getReg()) .addImm(1); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); } else if (MI.getOpcode() == PPC::TCHECK_RET) { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); return BB; } else { llvm_unreachable("Unexpected instr type to insert"); } MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } //===----------------------------------------------------------------------===// // Target Optimization Hooks //===----------------------------------------------------------------------===// static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { // For the estimates, convergence is quadratic, so we essentially double the // number of digits correct after every iteration. For both FRE and FRSQRTE, // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), // this is 2^-14. IEEE float has 23 digits and double has 52 digits. int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) RefinementSteps++; return RefinementSteps; } SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); UseOneConstNR = true; return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); } SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); } return SDValue(); } unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Note: This functionality is used only when unsafe-fp-math is enabled, and // on cores with reciprocal estimates (which are used when unsafe-fp-math is // enabled for division), this functionality is redundant with the default // combiner logic (once the division -> reciprocal/multiply transformation // has taken place). As a result, this matters more for older cores than for // newer ones. // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are two or more FDIVs (for embedded cores with only // one FP pipeline) for three or more FDIVs (for generic OOO cores). 
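  // The value returned here is the minimum number of FDIV users of a common
  // divisor required before the reciprocal transformation fires: 2 for the
  // single-FP-pipeline embedded cores listed below, 3 for everything else.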
switch (Subtarget.getDarwinDirective()) { default: return 3; case PPC::DIR_440: case PPC::DIR_A2: case PPC::DIR_E500: case PPC::DIR_E500mc: case PPC::DIR_E5500: return 2; } } // isConsecutiveLSLoc needs to work even if all adds have not yet been // collapsed, and so we need to look through chains of them. static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, int64_t& Offset, SelectionDAG &DAG) { if (DAG.isBaseWithConstantOffset(Loc)) { Base = Loc.getOperand(0); Offset += cast(Loc.getOperand(1))->getSExtValue(); // The base might itself be a base plus an offset, and if so, accumulate // that as well. getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); } } static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { if (VT.getSizeInBits() / 8 != Bytes) return false; SDValue BaseLoc = Base->getBasePtr(); if (Loc.getOpcode() == ISD::FrameIndex) { if (BaseLoc.getOpcode() != ISD::FrameIndex) return false; const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); int FI = cast(Loc)->getIndex(); int BFI = cast(BaseLoc)->getIndex(); int FS = MFI.getObjectSize(FI); int BFS = MFI.getObjectSize(BFI); if (FS != BFS || FS != (int)Bytes) return false; return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); } SDValue Base1 = Loc, Base2 = BaseLoc; int64_t Offset1 = 0, Offset2 = 0; getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; Offset1 = 0; Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) return Offset1 == (Offset2 + Dist*Bytes); return false; } // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does // not enforce equality of the chain operands. 
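// In addition to plain loads and stores (LSBaseSDNode), this also recognizes
// the QPX, Altivec and VSX load/store intrinsics (INTRINSIC_W_CHAIN and
// INTRINSIC_VOID), mapping each intrinsic to the memory type it touches
// before deferring to isConsecutiveLSLoc on its address operand.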
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, unsigned Bytes, int Dist, SelectionDAG &DAG) { if (LSBaseSDNode *LS = dyn_cast(N)) { EVT VT = LS->getMemoryVT(); SDValue Loc = LS->getBasePtr(); return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); } if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfs: case Intrinsic::ppc_qpx_qvlfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcd: case Intrinsic::ppc_qpx_qvlfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcs: case Intrinsic::ppc_qpx_qvlfcsa: VT = MVT::v2f32; break; case Intrinsic::ppc_qpx_qvlfiwa: case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_vsx_lxvw4x: case Intrinsic::ppc_vsx_lxvw4x_be: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_lxvd2x: case Intrinsic::ppc_vsx_lxvd2x_be: VT = MVT::v2f64; break; case Intrinsic::ppc_altivec_lvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_lvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_lvewx: VT = MVT::i32; break; } return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); } if (N->getOpcode() == ISD::INTRINSIC_VOID) { EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; case Intrinsic::ppc_qpx_qvstfd: case Intrinsic::ppc_qpx_qvstfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfs: case Intrinsic::ppc_qpx_qvstfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcd: case Intrinsic::ppc_qpx_qvstfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcs: case Intrinsic::ppc_qpx_qvstfcsa: VT = MVT::v2f32; break; case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_qpx_qvstfiwa: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_vsx_stxvw4x: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_vsx_stxvw4x_be: VT = MVT::v4i32; break; case Intrinsic::ppc_vsx_stxvd2x_be: VT = MVT::v2f64; break; case Intrinsic::ppc_altivec_stvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_stvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_stvewx: VT = MVT::i32; break; } return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); } return false; } // Return true is there is a nearyby consecutive load to the one provided // (regardless of alignment). We search up and down the chain, looking though // token factors and other loads (but nothing else). As a result, a true result // indicates that it is safe to create a new consecutive load adjacent to the // load provided. static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); SmallSet LoadRoots; SmallVector Queue(1, Chain.getNode()); SmallSet Visited; // First, search up the chain, branching to follow all token-factor operands. // If we find a consecutive load, then we're done, otherwise, record all // nodes just above the top-level loads and token factors. 
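  // Phase two below then walks back down the chain from the recorded
  // LoadRoots, following only chain uses of loads and token factors, so both
  // directions around LD are covered while still looking through nothing but
  // loads and token factors.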
while (!Queue.empty()) { SDNode *ChainNext = Queue.pop_back_val(); if (!Visited.insert(ChainNext).second) continue; if (MemSDNode *ChainLD = dyn_cast(ChainNext)) { if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; if (!Visited.count(ChainLD->getChain().getNode())) Queue.push_back(ChainLD->getChain().getNode()); } else if (ChainNext->getOpcode() == ISD::TokenFactor) { for (const SDUse &O : ChainNext->ops()) if (!Visited.count(O.getNode())) Queue.push_back(O.getNode()); } else LoadRoots.insert(ChainNext); } // Second, search down the chain, starting from the top-level nodes recorded // in the first phase. These top-level nodes are the nodes just above all // loads and token factors. Starting with their uses, recursively look though // all loads (just the chain uses) and token factors to find a consecutive // load. Visited.clear(); Queue.clear(); for (SmallSet::iterator I = LoadRoots.begin(), IE = LoadRoots.end(); I != IE; ++I) { Queue.push_back(*I); while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); if (!Visited.insert(LoadRoot).second) continue; if (MemSDNode *ChainLD = dyn_cast(LoadRoot)) if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; for (SDNode::use_iterator UI = LoadRoot->use_begin(), UE = LoadRoot->use_end(); UI != UE; ++UI) if (((isa(*UI) && cast(*UI)->getChain().getNode() == LoadRoot) || UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) Queue.push_back(*UI); } } return false; } /// This function is called when we have proved that a SETCC node can be replaced /// by subtraction (and other supporting instructions) so that the result of /// comparison is kept in a GPR instead of CR. This function is purely for /// codegen purposes and has some flags to guide the codegen process. static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, bool Swap, SDLoc &DL, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); // Zero extend the operands to the largest legal integer. Originally, they // must be of a strictly smaller size. auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0), DAG.getConstant(Size, DL, MVT::i32)); auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1), DAG.getConstant(Size, DL, MVT::i32)); // Swap if needed. Depends on the condition code. if (Swap) std::swap(Op0, Op1); // Subtract extended integers. auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1); // Move the sign bit to the least significant position and zero out the rest. // Now the least significant bit carries the result of original comparison. auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode, DAG.getConstant(Size - 1, DL, MVT::i32)); auto Final = Shifted; // Complement the result if needed. Based on the condition code. if (Complement) Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted, DAG.getConstant(1, DL, MVT::i64)); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final); } SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); // Size of integers being compared has a critical role in the following // analysis, so we prefer to do this when all types are legal. 
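  // When the checks below succeed, the SETCC is rewritten via
  // generateEquivalentSub.  Roughly, with Size the widest legal integer
  // width (64 on PPC64) and both operands strictly narrower than that:
  //   (setult %a, %b) --> (trunc (srl (sub (zext %a), (zext %b)), Size - 1))
  // The other unsigned predicates add an operand swap and/or a final
  // xor with 1 on top of the same pattern.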
if (!DCI.isAfterLegalizeDAG()) return SDValue(); // If all users of SETCC extend its value to a legal integer type // then we replace SETCC with a subtraction for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); } ISD::CondCode CC = cast(N->getOperand(2))->get(); auto OpSize = N->getOperand(0).getValueSizeInBits(); unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits(); if (OpSize < Size) { switch (CC) { default: break; case ISD::SETULT: return generateEquivalentSub(N, Size, false, false, DL, DAG); case ISD::SETULE: return generateEquivalentSub(N, Size, true, true, DL, DAG); case ISD::SETUGT: return generateEquivalentSub(N, Size, false, true, DL, DAG); case ISD::SETUGE: return generateEquivalentSub(N, Size, true, false, DL, DAG); } } return SDValue(); } SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) // or // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) // such that we're unnecessarily moving things into GPRs when it would be // better to keep them in CR bits. // Note that trunc here can be an actual i1 trunc, or can be the effective // truncation that comes from a setcc or select_cc. if (N->getOpcode() == ISD::TRUNCATE && N->getValueType(0) != MVT::i1) return SDValue(); if (N->getOperand(0).getValueType() != MVT::i32 && N->getOperand(0).getValueType() != MVT::i64) return SDValue(); if (N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) { // If we're looking at a comparison, then we need to make sure that the // high bits (all except for the first) don't matter the result. ISD::CondCode CC = cast(N->getOperand( N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); unsigned OpBits = N->getOperand(0).getValueSizeInBits(); if (ISD::isSignedIntSetCC(CC)) { if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) return SDValue(); } else if (ISD::isUnsignedIntSetCC(CC)) { if (!DAG.MaskedValueIsZero(N->getOperand(0), APInt::getHighBitsSet(OpBits, OpBits-1)) || !DAG.MaskedValueIsZero(N->getOperand(1), APInt::getHighBitsSet(OpBits, OpBits-1))) return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI) : SDValue()); } else { // This is neither a signed nor an unsigned comparison, just make sure // that the high bits are equal. KnownBits Op1Known, Op2Known; DAG.computeKnownBits(N->getOperand(0), Op1Known); DAG.computeKnownBits(N->getOperand(1), Op2Known); // We don't really care about what is known about the first bit (if // anything), so clear it in all masks prior to comparing them. Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) return SDValue(); } } // We now know that the higher-order bits are irrelevant, we just need to // make sure that all of the intermediate operations are bit operations, and // all inputs are extensions. 
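  // The opcode checks that follow restrict the feeding expressions to the
  // bitwise ops (AND/OR/XOR), selects, truncations and extensions collected
  // into PromOps/Inputs below; any other producer aborts the combine, since
  // the promotion to i1 only knows how to rewrite those nodes.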
if (N->getOperand(0).getOpcode() != ISD::AND && N->getOperand(0).getOpcode() != ISD::OR && N->getOperand(0).getOpcode() != ISD::XOR && N->getOperand(0).getOpcode() != ISD::SELECT && N->getOperand(0).getOpcode() != ISD::SELECT_CC && N->getOperand(0).getOpcode() != ISD::TRUNCATE && N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) return SDValue(); if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && N->getOperand(1).getOpcode() != ISD::AND && N->getOperand(1).getOpcode() != ISD::OR && N->getOperand(1).getOpcode() != ISD::XOR && N->getOperand(1).getOpcode() != ISD::SELECT && N->getOperand(1).getOpcode() != ISD::SELECT_CC && N->getOperand(1).getOpcode() != ISD::TRUNCATE && N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) return SDValue(); SmallVector Inputs; SmallVector BinOps, PromOps; SmallPtrSet Visited; for (unsigned i = 0; i < 2; ++i) { if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa(N->getOperand(i))) Inputs.push_back(N->getOperand(i)); else BinOps.push_back(N->getOperand(i)); if (N->getOpcode() == ISD::TRUNCATE) break; } // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by extensions. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); if (!Visited.insert(BinOp.getNode()).second) continue; PromOps.push_back(BinOp); for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { // The condition of the select is not promoted. if (BinOp.getOpcode() == ISD::SELECT && i == 0) continue; if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) continue; if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa(BinOp.getOperand(i))) { Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || BinOp.getOperand(i).getOpcode() == ISD::SELECT || BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { BinOps.push_back(BinOp.getOperand(i)); } else { // We have an input that is not an extension or another binary // operation; we'll abort this transformation. return SDValue(); } } } // Make sure that this is a self-contained cluster of operations (which // is not quite the same thing as saying that everything has only one // use). for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), UE = Inputs[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // Make sure that we're not going to promote the non-output-value // operand(s) or SELECT or SELECT_CC. 
// FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == Inputs[i]) return SDValue(); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == Inputs[i] || User->getOperand(1) == Inputs[i]) return SDValue(); } } } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), UE = PromOps[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // Make sure that we're not going to promote the non-output-value // operand(s) or SELECT or SELECT_CC. // FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == PromOps[i]) return SDValue(); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == PromOps[i] || User->getOperand(1) == PromOps[i]) return SDValue(); } } } // Replace all inputs with the extension operand. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { // Constants may have users outside the cluster of to-be-promoted nodes, // and so we need to replace those as we do the promotions. if (isa(Inputs[i])) continue; else DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); } std::list PromOpHandles; for (auto &PromOp : PromOps) PromOpHandles.emplace_back(PromOp); // Replace all operations (these are all the same, but have a different // (i1) return type). DAG.getNode will validate that the types of // a binary operator match, so go through the list in reverse so that // we've likely promoted both operands first. Any intermediate truncations or // extensions disappear. while (!PromOpHandles.empty()) { SDValue PromOp = PromOpHandles.back().getValue(); PromOpHandles.pop_back(); if (PromOp.getOpcode() == ISD::TRUNCATE || PromOp.getOpcode() == ISD::SIGN_EXTEND || PromOp.getOpcode() == ISD::ZERO_EXTEND || PromOp.getOpcode() == ISD::ANY_EXTEND) { if (!isa(PromOp.getOperand(0)) && PromOp.getOperand(0).getValueType() != MVT::i1) { // The operand is not yet ready (see comment below). PromOpHandles.emplace_front(PromOp); continue; } SDValue RepValue = PromOp.getOperand(0); if (isa(RepValue)) RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); continue; } unsigned C; switch (PromOp.getOpcode()) { default: C = 0; break; case ISD::SELECT: C = 1; break; case ISD::SELECT_CC: C = 2; break; } if ((!isa(PromOp.getOperand(C)) && PromOp.getOperand(C).getValueType() != MVT::i1) || (!isa(PromOp.getOperand(C+1)) && PromOp.getOperand(C+1).getValueType() != MVT::i1)) { // The to-be-promoted operands of this node have not yet been // promoted (this should be rare because we're going through the // list backward, but if one of the operands has several users in // this cluster of to-be-promoted nodes, it is possible). PromOpHandles.emplace_front(PromOp); continue; } SmallVector Ops(PromOp.getNode()->op_begin(), PromOp.getNode()->op_end()); // If there are any constant inputs, make sure they're replaced now. 
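  // (Constants were deliberately skipped when the inputs were replaced above,
  // because they may have users outside this cluster; instead each constant
  // operand gets its own i1 truncation here.)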
for (unsigned i = 0; i < 2; ++i) if (isa(Ops[C+i])) Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); DAG.ReplaceAllUsesOfValueWith(PromOp, DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); } // Now we're left with the initial truncation itself. if (N->getOpcode() == ISD::TRUNCATE) return N->getOperand(0); // Otherwise, this is a comparison. The operands to be compared have just // changed type (to i1), but everything else is the same. return SDValue(N, 0); } SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); // If we're tracking CR bits, we need to be careful that we don't have: // zext(binary-ops(trunc(x), trunc(y))) // or // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) // such that we're unnecessarily moving things into CR bits that can more // efficiently stay in GPRs. Note that if we're not certain that the high // bits are set as required by the final extension, we still may need to do // some masking to get the proper behavior. // This same functionality is important on PPC64 when dealing with // 32-to-64-bit extensions; these occur often when 32-bit values are used as // the return values of functions. Because it is so similar, it is handled // here as well. if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) return SDValue(); if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && N->getOperand(0).getOpcode() != ISD::OR && N->getOperand(0).getOpcode() != ISD::XOR && N->getOperand(0).getOpcode() != ISD::SELECT && N->getOperand(0).getOpcode() != ISD::SELECT_CC) return SDValue(); SmallVector Inputs; SmallVector BinOps(1, N->getOperand(0)), PromOps; SmallPtrSet Visited; // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by truncations. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); if (!Visited.insert(BinOp.getNode()).second) continue; PromOps.push_back(BinOp); for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { // The condition of the select is not promoted. if (BinOp.getOpcode() == ISD::SELECT && i == 0) continue; if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) continue; if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || isa(BinOp.getOperand(i))) { Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || BinOp.getOperand(i).getOpcode() == ISD::SELECT || BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { BinOps.push_back(BinOp.getOperand(i)); } else { // We have an input that is not a truncation or another binary // operation; we'll abort this transformation. return SDValue(); } } } // The operands of a select that must be truncated when the select is // promoted because the operand is actually part of the to-be-promoted set. DenseMap SelectTruncOp[2]; // Make sure that this is a self-contained cluster of operations (which // is not quite the same thing as saying that everything has only one // use). 
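  // "Self-contained" means every non-constant input and every intermediate
  // operation may only be used by N itself or by nodes already in Visited;
  // a user outside the cluster would be left seeing a value of a different
  // type once the cluster is rewritten, so any such use aborts the combine.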
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), UE = Inputs[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // If we're going to promote the non-output-value operand(s) or SELECT or // SELECT_CC, record them for truncation. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == Inputs[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == Inputs[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); if (User->getOperand(1) == Inputs[i]) SelectTruncOp[1].insert(std::make_pair(User, User->getOperand(1).getValueType())); } } } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), UE = PromOps[i].getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User != N && !Visited.count(User)) return SDValue(); // If we're going to promote the non-output-value operand(s) or SELECT or // SELECT_CC, record them for truncation. if (User->getOpcode() == ISD::SELECT) { if (User->getOperand(0) == PromOps[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); } else if (User->getOpcode() == ISD::SELECT_CC) { if (User->getOperand(0) == PromOps[i]) SelectTruncOp[0].insert(std::make_pair(User, User->getOperand(0).getValueType())); if (User->getOperand(1) == PromOps[i]) SelectTruncOp[1].insert(std::make_pair(User, User->getOperand(1).getValueType())); } } } unsigned PromBits = N->getOperand(0).getValueSizeInBits(); bool ReallyNeedsExt = false; if (N->getOpcode() != ISD::ANY_EXTEND) { // If all of the inputs are not already sign/zero extended, then // we'll still need to do that at the end. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { if (isa(Inputs[i])) continue; unsigned OpBits = Inputs[i].getOperand(0).getValueSizeInBits(); assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); if ((N->getOpcode() == ISD::ZERO_EXTEND && !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), APInt::getHighBitsSet(OpBits, OpBits-PromBits))) || (N->getOpcode() == ISD::SIGN_EXTEND && DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < (OpBits-(PromBits-1)))) { ReallyNeedsExt = true; break; } } } // Replace all inputs, either with the truncation operand, or a // truncation or extension to the final output type. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { // Constant inputs need to be replaced with the to-be-promoted nodes that // use them because they might have users outside of the cluster of // promoted nodes. if (isa(Inputs[i])) continue; SDValue InSrc = Inputs[i].getOperand(0); if (Inputs[i].getValueType() == N->getValueType(0)) DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); else if (N->getOpcode() == ISD::SIGN_EXTEND) DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); else if (N->getOpcode() == ISD::ZERO_EXTEND) DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); else DAG.ReplaceAllUsesOfValueWith(Inputs[i], DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); } std::list PromOpHandles; for (auto &PromOp : PromOps) PromOpHandles.emplace_back(PromOp); // Replace all operations (these are all the same, but have a different // (promoted) return type). 
DAG.getNode will validate that the types of // a binary operator match, so go through the list in reverse so that // we've likely promoted both operands first. while (!PromOpHandles.empty()) { SDValue PromOp = PromOpHandles.back().getValue(); PromOpHandles.pop_back(); unsigned C; switch (PromOp.getOpcode()) { default: C = 0; break; case ISD::SELECT: C = 1; break; case ISD::SELECT_CC: C = 2; break; } if ((!isa(PromOp.getOperand(C)) && PromOp.getOperand(C).getValueType() != N->getValueType(0)) || (!isa(PromOp.getOperand(C+1)) && PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { // The to-be-promoted operands of this node have not yet been // promoted (this should be rare because we're going through the // list backward, but if one of the operands has several users in // this cluster of to-be-promoted nodes, it is possible). PromOpHandles.emplace_front(PromOp); continue; } // For SELECT and SELECT_CC nodes, we do a similar check for any // to-be-promoted comparison inputs. if (PromOp.getOpcode() == ISD::SELECT || PromOp.getOpcode() == ISD::SELECT_CC) { if ((SelectTruncOp[0].count(PromOp.getNode()) && PromOp.getOperand(0).getValueType() != N->getValueType(0)) || (SelectTruncOp[1].count(PromOp.getNode()) && PromOp.getOperand(1).getValueType() != N->getValueType(0))) { PromOpHandles.emplace_front(PromOp); continue; } } SmallVector Ops(PromOp.getNode()->op_begin(), PromOp.getNode()->op_end()); // If this node has constant inputs, then they'll need to be promoted here. for (unsigned i = 0; i < 2; ++i) { if (!isa(Ops[C+i])) continue; if (Ops[C+i].getValueType() == N->getValueType(0)) continue; if (N->getOpcode() == ISD::SIGN_EXTEND) Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); else if (N->getOpcode() == ISD::ZERO_EXTEND) Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); else Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); } // If we've promoted the comparison inputs of a SELECT or SELECT_CC, // truncate them again to the original value type. if (PromOp.getOpcode() == ISD::SELECT || PromOp.getOpcode() == ISD::SELECT_CC) { auto SI0 = SelectTruncOp[0].find(PromOp.getNode()); if (SI0 != SelectTruncOp[0].end()) Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]); auto SI1 = SelectTruncOp[1].find(PromOp.getNode()); if (SI1 != SelectTruncOp[1].end()) Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]); } DAG.ReplaceAllUsesOfValueWith(PromOp, DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); } // Now we're left with the initial extension itself. if (!ReallyNeedsExt) return N->getOperand(0); // To zero extend, just mask off everything except for the first bit (in the // i1 case). if (N->getOpcode() == ISD::ZERO_EXTEND) return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0), DAG.getConstant(APInt::getLowBitsSet( N->getValueSizeInBits(0), PromBits), dl, N->getValueType(0))); assert(N->getOpcode() == ISD::SIGN_EXTEND && "Invalid extension type"); EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); return DAG.getNode( ISD::SRA, dl, N->getValueType(0), DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), ShiftCst); } // Is this an extending load from an f32 to an f64? 
static bool isFPExtLoad(SDValue Op) { if (LoadSDNode *LD = dyn_cast(Op.getNode())) return LD->getExtensionType() == ISD::EXTLOAD && Op.getValueType() == MVT::f64; return false; } /// Reduces the number of fp-to-int conversion when building a vector. /// /// If this vector is built out of floating to integer conversions, /// transform it to a vector built out of floating point values followed by a /// single floating to integer conversion of the vector. /// Namely (build_vector (fptosi $A), (fptosi $B), ...) /// becomes (fptosi (build_vector ($A, $B, ...))) SDValue PPCTargetLowering:: combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && "Should be called with a BUILD_VECTOR node"); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue FirstInput = N->getOperand(0); assert(FirstInput.getOpcode() == PPCISD::MFVSR && "The input operand must be an fp-to-int conversion."); // This combine happens after legalization so the fp_to_[su]i nodes are // already converted to PPCSISD nodes. unsigned FirstConversion = FirstInput.getOperand(0).getOpcode(); if (FirstConversion == PPCISD::FCTIDZ || FirstConversion == PPCISD::FCTIDUZ || FirstConversion == PPCISD::FCTIWZ || FirstConversion == PPCISD::FCTIWUZ) { bool IsSplat = true; bool Is32Bit = FirstConversion == PPCISD::FCTIWZ || FirstConversion == PPCISD::FCTIWUZ; EVT SrcVT = FirstInput.getOperand(0).getValueType(); SmallVector Ops; EVT TargetVT = N->getValueType(0); for (int i = 0, e = N->getNumOperands(); i < e; ++i) { SDValue NextOp = N->getOperand(i); if (NextOp.getOpcode() != PPCISD::MFVSR) return SDValue(); unsigned NextConversion = NextOp.getOperand(0).getOpcode(); if (NextConversion != FirstConversion) return SDValue(); // If we are converting to 32-bit integers, we need to add an FP_ROUND. // This is not valid if the input was originally double precision. It is // also not profitable to do unless this is an extending load in which // case doing this combine will allow us to combine consecutive loads. if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0))) return SDValue(); if (N->getOperand(i) != FirstInput) IsSplat = false; } // If this is a splat, we leave it as-is since there will be only a single // fp-to-int conversion followed by a splat of the integer. This is better // for 32-bit and smaller ints and neutral for 64-bit ints. if (IsSplat) return SDValue(); // Now that we know we have the right type of node, get its operands for (int i = 0, e = N->getNumOperands(); i < e; ++i) { SDValue In = N->getOperand(i).getOperand(0); if (Is32Bit) { // For 32-bit values, we need to add an FP_ROUND node (if we made it // here, we know that all inputs are extending loads so this is safe). if (In.isUndef()) Ops.push_back(DAG.getUNDEF(SrcVT)); else { SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, In.getOperand(0), DAG.getIntPtrConstant(1, dl)); Ops.push_back(Trunc); } } else Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0)); } unsigned Opcode; if (FirstConversion == PPCISD::FCTIDZ || FirstConversion == PPCISD::FCTIWZ) Opcode = ISD::FP_TO_SINT; else Opcode = ISD::FP_TO_UINT; EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32; SDValue BV = DAG.getBuildVector(NewVT, dl, Ops); return DAG.getNode(Opcode, dl, TargetVT, BV); } return SDValue(); } /// Reduce the number of loads when building a vector. /// /// Building a vector out of multiple loads can be converted to a load /// of the vector type if the loads are consecutive. 
If the loads are /// consecutive but in descending order, a shuffle is added at the end /// to reorder the vector. static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && "Should be called with a BUILD_VECTOR node"); SDLoc dl(N); bool InputsAreConsecutiveLoads = true; bool InputsAreReverseConsecutive = true; unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; SDValue FirstInput = N->getOperand(0); bool IsRoundOfExtLoad = false; if (FirstInput.getOpcode() == ISD::FP_ROUND && FirstInput.getOperand(0).getOpcode() == ISD::LOAD) { LoadSDNode *LD = dyn_cast(FirstInput.getOperand(0)); IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; } // Not a build vector of (possibly fp_rounded) loads. if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) return SDValue(); for (int i = 1, e = N->getNumOperands(); i < e; ++i) { // If any inputs are fp_round(extload), they all must be. if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND) return SDValue(); SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) : N->getOperand(i); if (NextInput.getOpcode() != ISD::LOAD) return SDValue(); SDValue PreviousInput = IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1); LoadSDNode *LD1 = dyn_cast(PreviousInput); LoadSDNode *LD2 = dyn_cast(NextInput); // If any inputs are fp_round(extload), they all must be. if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD) return SDValue(); if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG)) InputsAreConsecutiveLoads = false; if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG)) InputsAreReverseConsecutive = false; // Exit early if the loads are neither consecutive nor reverse consecutive. if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive) return SDValue(); } assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) && "The loads cannot be both consecutive and reverse consecutive."); SDValue FirstLoadOp = IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput; SDValue LastLoadOp = IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) : N->getOperand(N->getNumOperands()-1); LoadSDNode *LD1 = dyn_cast(FirstLoadOp); LoadSDNode *LDL = dyn_cast(LastLoadOp); if (InputsAreConsecutiveLoads) { assert(LD1 && "Input needs to be a LoadSDNode."); return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), LD1->getBasePtr(), LD1->getPointerInfo(), LD1->getAlignment()); } if (InputsAreReverseConsecutive) { assert(LDL && "Input needs to be a LoadSDNode."); SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), LDL->getBasePtr(), LDL->getPointerInfo(), LDL->getAlignment()); SmallVector Ops; for (int i = N->getNumOperands() - 1; i >= 0; i--) Ops.push_back(i); return DAG.getVectorShuffle(N->getValueType(0), dl, Load, DAG.getUNDEF(N->getValueType(0)), Ops); } return SDValue(); } // This function adds the required vector_shuffle needed to get // the elements of the vector extract in the correct position // as specified by the CorrectElems encoding. static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, SDValue Input, uint64_t Elems, uint64_t CorrectElems) { SDLoc dl(N); unsigned NumElems = Input.getValueType().getVectorNumElements(); SmallVector ShuffleMask(NumElems, -1); // Knowing the element indices being extracted from the original // vector and the order in which they're being inserted, just put // them at element indices required for the instruction. 
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (DAG.getDataLayout().isLittleEndian())
      ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
    else
      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
    CorrectElems = CorrectElems >> 8;
    Elems = Elems >> 8;
  }

  SDValue Shuffle =
      DAG.getVectorShuffle(Input.getValueType(), dl, Input,
                           DAG.getUNDEF(Input.getValueType()), ShuffleMask);

  EVT Ty = N->getValueType(0);
  SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
  return BV;
}

// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct
// indices aren't used, add a vector shuffle to fix up the indices and create
// a new PPCISD::SExtVElems node which selects the vector sign extend
// instructions during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
  // This array encodes the indices that the vector sign extend instructions
  // extract from when extending from one type to another for both BE and LE.
  // The right nibble of each byte corresponds to the LE indices,
  // and the left nibble of each byte corresponds to the BE indices.
  // For example: 0x3074B8FC  byte->word
  // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
  // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
  // For example: 0x000070F8  byte->double word
  // For LE: the allowed indices are: 0x0,0x8
  // For BE: the allowed indices are: 0x7,0xF
  uint64_t TargetElems[] = {
      0x3074B8FC, // b->w
      0x000070F8, // b->d
      0x10325476, // h->w
      0x00003074, // h->d
      0x00001032, // w->d
  };

  uint64_t Elems = 0;
  int Index;
  SDValue Input;

  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
    if (!Op)
      return false;
-    if (Op.getOpcode() != ISD::SIGN_EXTEND)
+    if (Op.getOpcode() != ISD::SIGN_EXTEND &&
+        Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
      return false;
+    // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
+    // of the right width.
    SDValue Extract = Op.getOperand(0);
+    if (Extract.getOpcode() == ISD::ANY_EXTEND)
+      Extract = Extract.getOperand(0);
    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
    if (!ExtOp)
      return false;

    Index = ExtOp->getZExtValue();
    if (Input && Input != Extract.getOperand(0))
      return false;

    if (!Input)
      Input = Extract.getOperand(0);

    Elems = Elems << 8;
    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
    Elems |= Index;

    return true;
  };

  // If the build vector operands aren't sign-extended vector extracts
  // of the same input vector, then return.
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (!isSExtOfVecExtract(N->getOperand(i))) {
      return SDValue();
    }
  }

  // If the vector extract indices are not correct, add the appropriate
  // vector_shuffle.
  int TgtElemArrayIdx;
  int InputSize = Input.getValueType().getScalarSizeInBits();
  int OutputSize = N->getValueType(0).getScalarSizeInBits();
  if (InputSize + OutputSize == 40)
    TgtElemArrayIdx = 0;
  else if (InputSize + OutputSize == 72)
    TgtElemArrayIdx = 1;
  else if (InputSize + OutputSize == 48)
    TgtElemArrayIdx = 2;
  else if (InputSize + OutputSize == 80)
    TgtElemArrayIdx = 3;
  else if (InputSize + OutputSize == 96)
    TgtElemArrayIdx = 4;
  else
    return SDValue();

  uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
  CorrectElems = DAG.getDataLayout().isLittleEndian()
                     ? CorrectElems & 0x0F0F0F0F0F0F0F0F
                     : CorrectElems & 0xF0F0F0F0F0F0F0F0;
  if (Elems != CorrectElems) {
    return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
  }

  // Regular lowering will catch cases where a shuffle is not needed.
  return SDValue();
}
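// [Editor's sketch, not part of this patch] A minimal standalone illustration
// of how the nibble encoding above is consumed: each byte of a TargetElems
// entry carries one expected EXTRACT_VECTOR_ELT index, in the low nibble for
// little-endian and the high nibble for big-endian, walked from the least
// significant byte just as the shuffle-building loop in addShuffleForVecExtend
// does. The helper name and signature are hypothetical.
static void decodeTargetElems(unsigned long long Encoded, bool IsLittleEndian,
                              unsigned Indices[], unsigned NumElts) {
  for (unsigned Elt = 0; Elt < NumElts; ++Elt, Encoded >>= 8)
    Indices[Elt] = IsLittleEndian ? unsigned(Encoded & 0xF)
                                  : unsigned((Encoded & 0xF0) >> 4);
}
// For example, decodeTargetElems(0x3074B8FC, /*IsLittleEndian=*/true, Idx, 4)
// fills Idx with {0xC, 0x8, 0x4, 0x0}, the LE byte->word indices listed in the
// table's comment; with IsLittleEndian=false it yields {0xF, 0xB, 0x7, 0x3}.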
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  if (!Subtarget.hasVSX())
    return SDValue();

  // The target independent DAG combiner will leave a build_vector of
  // float-to-int conversions intact. We can generate MUCH better code for
  // a float-to-int conversion of a vector of floats.
  SDValue FirstInput = N->getOperand(0);
  if (FirstInput.getOpcode() == PPCISD::MFVSR) {
    SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
    if (Reduced)
      return Reduced;
  }

  // If we're building a vector out of consecutive loads, just load that
  // vector type.
  SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
  if (Reduced)
    return Reduced;

  // If we're building a vector out of extended elements from another vector
-  // we have P9 vector integer extend instructions.
-  if (Subtarget.hasP9Altivec()) {
+  // we have P9 vector integer extend instructions. The code assumes legal
+  // input types (i.e. it can't handle things like v4i16) so do not run before
+  // legalization.
+  if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
    Reduced = combineBVOfVecSExt(N, DAG);
    if (Reduced)
      return Reduced;
  }

  if (N->getValueType(0) != MVT::v2f64)
    return SDValue();

  // Looking for:
  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
  if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
      FirstInput.getOpcode() != ISD::UINT_TO_FP)
    return SDValue();
  if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
      N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
    return SDValue();
  if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
    return SDValue();

  SDValue Ext1 = FirstInput.getOperand(0);
  SDValue Ext2 = N->getOperand(1).getOperand(0);
  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
  if (!Ext1Op || !Ext2Op)
    return SDValue();
  if (Ext1.getValueType() != MVT::i32 ||
      Ext2.getValueType() != MVT::i32)
  if (Ext1.getOperand(0) != Ext2.getOperand(0))
    return SDValue();

  int FirstElem = Ext1Op->getZExtValue();
  int SecondElem = Ext2Op->getZExtValue();
  int SubvecIdx;
  if (FirstElem == 0 && SecondElem == 1)
    SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
  else if (FirstElem == 2 && SecondElem == 3)
    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
  else
    return SDValue();

  SDValue SrcVec = Ext1.getOperand(0);
  auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
    PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
  return DAG.getNode(NodeType, dl,
                     MVT::v2f64, SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
}

SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::SINT_TO_FP ||
          N->getOpcode() == ISD::UINT_TO_FP) &&
         "Need an int -> FP conversion node here");

  if (useSoftFloat() || !Subtarget.has64BitSupport())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Op(N, 0);

  // Don't handle ppc_fp128 here or conversions that are out-of-range capable
  // from the hardware.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) || Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64)) return SDValue(); SDValue FirstOperand(Op.getOperand(0)); bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && (FirstOperand.getValueType() == MVT::i8 || FirstOperand.getValueType() == MVT::i16); if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) { bool Signed = N->getOpcode() == ISD::SINT_TO_FP; bool DstDouble = Op.getValueType() == MVT::f64; unsigned ConvOp = Signed ? (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) : (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS); SDValue WidthConst = DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2, dl, false); LoadSDNode *LDN = cast(FirstOperand.getNode()); SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst }; SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i8, LDN->getMemOperand()); // For signed conversion, we need to sign-extend the value in the VSR if (Signed) { SDValue ExtOps[] = { Ld, WidthConst }; SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps); return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext); } else return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); } // For i32 intermediate values, unfortunately, the conversion functions // leave the upper 32 bits of the value are undefined. Within the set of // scalar instructions, we have no method for zero- or sign-extending the // value. Thus, we cannot handle i32 intermediate values here. if (Op.getOperand(0).getValueType() == MVT::i32) return SDValue(); assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; // If we're converting from a float, to an int, and back to a float again, // then we don't need the store/load pair at all. if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && Subtarget.hasFPCVT()) || (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { SDValue Src = Op.getOperand(0).getOperand(0); if (Src.getValueType() == MVT::f32) { Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); DCI.AddToWorklist(Src.getNode()); } else if (Src.getValueType() != MVT::f64) { // Make sure that we don't pick up a ppc_fp128 source value. return SDValue(); } unsigned FCTOp = Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ; SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); DCI.AddToWorklist(FP.getNode()); } return FP; } return SDValue(); } // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for // builtins) into loads with swaps. 
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; SDValue Base; MachineMemOperand *MMO; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode for little endian VSX load"); case ISD::LOAD: { LoadSDNode *LD = cast(N); Chain = LD->getChain(); Base = LD->getBasePtr(); MMO = LD->getMemOperand(); // If the MMO suggests this isn't a load of a full vector, leave // things alone. For a built-in, we have to make the change for // correctness, so if there is a size problem that will be a bug. if (MMO->getSize() < 16) return SDValue(); break; } case ISD::INTRINSIC_W_CHAIN: { MemIntrinsicSDNode *Intrin = cast(N); Chain = Intrin->getChain(); // Similarly to the store case below, Intrin->getBasePtr() doesn't get // us what we want. Get operand 2 instead. Base = Intrin->getOperand(2); MMO = Intrin->getMemOperand(); break; } } MVT VecTy = N->getValueType(0).getSimpleVT(); // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is // aligned and the type is a vector with elements up to 4 bytes if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) && VecTy.getScalarSizeInBits() <= 32 ) { return SDValue(); } SDValue LoadOps[] = { Chain, Base }; SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, DAG.getVTList(MVT::v2f64, MVT::Other), LoadOps, MVT::v2f64, MMO); DCI.AddToWorklist(Load.getNode()); Chain = Load.getValue(1); SDValue Swap = DAG.getNode( PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); DCI.AddToWorklist(Swap.getNode()); // Add a bitcast if the resulting load type doesn't match v2f64. if (VecTy != MVT::v2f64) { SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); DCI.AddToWorklist(N.getNode()); // Package {bitcast value, swap's chain} to match Load's shape. return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), N, Swap.getValue(1)); } return Swap; } // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for // builtins) into stores with swaps. SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; SDValue Base; unsigned SrcOpnd; MachineMemOperand *MMO; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode for little endian VSX store"); case ISD::STORE: { StoreSDNode *ST = cast(N); Chain = ST->getChain(); Base = ST->getBasePtr(); MMO = ST->getMemOperand(); SrcOpnd = 1; // If the MMO suggests this isn't a store of a full vector, leave // things alone. For a built-in, we have to make the change for // correctness, so if there is a size problem that will be a bug. if (MMO->getSize() < 16) return SDValue(); break; } case ISD::INTRINSIC_VOID: { MemIntrinsicSDNode *Intrin = cast(N); Chain = Intrin->getChain(); // Intrin->getBasePtr() oddly does not get what we want. Base = Intrin->getOperand(3); MMO = Intrin->getMemOperand(); SrcOpnd = 2; break; } } SDValue Src = N->getOperand(SrcOpnd); MVT VecTy = Src.getValueType().getSimpleVT(); // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is // aligned and the type is a vector with elements up to 4 bytes if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) && VecTy.getScalarSizeInBits() <= 32 ) { return SDValue(); } // All stores are done as v2f64 and possible bit cast. 
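  // [Editor's note] PPCISD::XXSWAPD swaps the two 64-bit doublewords of the
  // VSR, so a v4i32 held as elements {0,1,2,3} becomes {2,3,0,1}; pairing the
  // swap with stxvd2x here (and lxvd2x in the load expansion above) yields the
  // correct little-endian element order in memory.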
if (VecTy != MVT::v2f64) { Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); DCI.AddToWorklist(Src.getNode()); } SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src); DCI.AddToWorklist(Swap.getNode()); Chain = Swap.getValue(1); SDValue StoreOps[] = { Chain, Swap, Base }; SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, DAG.getVTList(MVT::Other), StoreOps, VecTy, MMO); DCI.AddToWorklist(Store.getNode()); return Store; } // Handle DAG combine for STORE (FP_TO_INT F). SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); unsigned Opcode = N->getOperand(1).getOpcode(); assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) && "Not a FP_TO_INT Instruction!"); SDValue Val = N->getOperand(1).getOperand(0); EVT Op1VT = N->getOperand(1).getValueType(); EVT ResVT = Val.getValueType(); // Floating point types smaller than 32 bits are not legal on Power. if (ResVT.getScalarSizeInBits() < 32) return SDValue(); // Only perform combine for conversion to i64/i32 or power9 i16/i8. bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32 || Op1VT == MVT::i64 || (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8))); if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() || cast(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt) return SDValue(); // Extend f32 values to f64 if (ResVT.getScalarSizeInBits() == 32) { Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); } // Set signed or unsigned conversion opcode. unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ? PPCISD::FP_TO_SINT_IN_VSR : PPCISD::FP_TO_UINT_IN_VSR; Val = DAG.getNode(ConvOpcode, dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val); DCI.AddToWorklist(Val.getNode()); // Set number of bytes being converted. unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8; SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2), DAG.getIntPtrConstant(ByteSize, dl, false), DAG.getValueType(Op1VT) }; Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl, DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); DCI.AddToWorklist(Val.getNode()); return Val; } SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); switch (N->getOpcode()) { default: break; case ISD::SHL: return combineSHL(N, DCI); case ISD::SRA: return combineSRA(N, DCI); case ISD::SRL: return combineSRL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); break; case PPCISD::SRL: if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. return N->getOperand(0); break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast(N->getOperand(0))) { if (C->isNullValue() || // 0 >>s V -> 0. C->isAllOnesValue()) // -1 >>s V -> -1. return N->getOperand(0); } break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: case ISD::SETCC: case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return combineFPToIntToFP(N, DCI); case ISD::STORE: { EVT Op1VT = N->getOperand(1).getValueType(); unsigned Opcode = N->getOperand(1).getOpcode(); if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) { SDValue Val= combineStoreFPToInt(N, DCI); if (Val) return Val; } // Turn STORE (BSWAP) -> sthbrx/stwbrx. 
if (cast(N)->isUnindexed() && Opcode == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() && (Op1VT == MVT::i32 || Op1VT == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) { // STBRX can only handle simple types. EVT mVT = cast(N)->getMemoryVT(); if (mVT.isExtended()) break; SDValue BSwapOp = N->getOperand(1).getOperand(0); // Do an any-extend to 32-bits if this is a half-word input. if (BSwapOp.getValueType() == MVT::i16) BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); // If the type of BSWAP operand is wider than stored memory width // it need to be shifted to the right side before STBRX. if (Op1VT.bitsGT(mVT)) { int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits(); BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp, DAG.getConstant(Shift, dl, MVT::i32)); // Need to truncate if this is a bswap of i64 stored as i32/i16. if (Op1VT == MVT::i64) BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp); } SDValue Ops[] = { N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT) }; return DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); } // STORE Constant:i32<0> -> STORE Constant:i64<0> // So it can increase the chance of CSE constant construction. if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() && isa(N->getOperand(1)) && Op1VT == MVT::i32) { // Need to sign-extended to 64-bits to handle negative values. EVT MemVT = cast(N)->getMemoryVT(); uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1), MemVT.getSizeInBits()); SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64); // DAG.getTruncStore() can't be used here because it doesn't accept // the general (base + offset) addressing mode. // So we use UpdateNodeOperands and setTruncatingStore instead. DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2), N->getOperand(3)); cast(N)->setTruncatingStore(true); return SDValue(N, 0); } // For little endian, VSX stores require generating xxswapd/lxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. if (Op1VT.isSimple()) { MVT StoreVT = Op1VT.getSimpleVT(); if (Subtarget.needsSwapsForVSXMemOps() && (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) return expandVSXStoreForLE(N, DCI); } break; } case ISD::LOAD: { LoadSDNode *LD = cast(N); EVT VT = LD->getValueType(0); // For little endian, VSX loads require generating lxvd2x/xxswapd. // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. if (VT.isSimple()) { MVT LoadVT = VT.getSimpleVT(); if (Subtarget.needsSwapsForVSXMemOps() && (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) return expandVSXLoadForLE(N, DCI); } // We sometimes end up with a 64-bit integer load, from which we extract // two single-precision floating-point numbers. This happens with // std::complex, and other similar structures, because of the way we // canonicalize structure copies. However, if we lack direct moves, // then the final bitcasts from the extracted integer values to the // floating-point numbers turn into store/load pairs. Even with direct moves, // just loading the two floating-point numbers is likely better. 
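    // [Editor's note] In scalar terms, the lambda below rewrites the pattern
    //   i64 W = load P; f32 A = bitcast(trunc(W >> 32)); f32 B = bitcast(trunc(W));
    // into two direct f32 loads from P and P+4 (which of A/B maps to which
    // offset depends on endianness), dropping the integer load entirely.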
auto ReplaceTwoFloatLoad = [&]() { if (VT != MVT::i64) return false; if (LD->getExtensionType() != ISD::NON_EXTLOAD || LD->isVolatile()) return false; // We're looking for a sequence like this: // t13: i64,ch = load t0, t6, undef:i64 // t16: i64 = srl t13, Constant:i32<32> // t17: i32 = truncate t16 // t18: f32 = bitcast t17 // t19: i32 = truncate t13 // t20: f32 = bitcast t19 if (!LD->hasNUsesOfValue(2, 0)) return false; auto UI = LD->use_begin(); while (UI.getUse().getResNo() != 0) ++UI; SDNode *Trunc = *UI++; while (UI.getUse().getResNo() != 0) ++UI; SDNode *RightShift = *UI; if (Trunc->getOpcode() != ISD::TRUNCATE) std::swap(Trunc, RightShift); if (Trunc->getOpcode() != ISD::TRUNCATE || Trunc->getValueType(0) != MVT::i32 || !Trunc->hasOneUse()) return false; if (RightShift->getOpcode() != ISD::SRL || !isa(RightShift->getOperand(1)) || RightShift->getConstantOperandVal(1) != 32 || !RightShift->hasOneUse()) return false; SDNode *Trunc2 = *RightShift->use_begin(); if (Trunc2->getOpcode() != ISD::TRUNCATE || Trunc2->getValueType(0) != MVT::i32 || !Trunc2->hasOneUse()) return false; SDNode *Bitcast = *Trunc->use_begin(); SDNode *Bitcast2 = *Trunc2->use_begin(); if (Bitcast->getOpcode() != ISD::BITCAST || Bitcast->getValueType(0) != MVT::f32) return false; if (Bitcast2->getOpcode() != ISD::BITCAST || Bitcast2->getValueType(0) != MVT::f32) return false; if (Subtarget.isLittleEndian()) std::swap(Bitcast, Bitcast2); // Bitcast has the second float (in memory-layout order) and Bitcast2 // has the first one. SDValue BasePtr = LD->getBasePtr(); if (LD->isIndexed()) { assert(LD->getAddressingMode() == ISD::PRE_INC && "Non-pre-inc AM on PPC?"); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, LD->getOffset()); } auto MMOFlags = LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->getAlignment(), MMOFlags, LD->getAAInfo()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(4, dl)); SDValue FloatLoad2 = DAG.getLoad( MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, LD->getPointerInfo().getWithOffset(4), MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); if (LD->isIndexed()) { // Note that DAGCombine should re-form any pre-increment load(s) from // what is produced here if that makes sense. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); } DCI.CombineTo(Bitcast2, FloatLoad); DCI.CombineTo(Bitcast, FloatLoad2); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1), SDValue(FloatLoad2.getNode(), 1)); return true; }; if (ReplaceTwoFloatLoad()) return SDValue(N, 0); EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v4f32)) || (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && LD->getAlignment() >= ScalarABIAlignment)) && LD->getAlignment() < ABIAlignment) { // This is a type-legal unaligned Altivec or QPX load. 
SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); bool isLittleEndian = Subtarget.isLittleEndian(); // This implements the loading of unaligned vectors as described in // the venerable Apple Velocity Engine overview. Specifically: // https://developer.apple.com/hardwaredrivers/ve/alignment.html // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html // // The general idea is to expand a sequence of one or more unaligned // loads into an alignment-based permutation-control instruction (lvsl // or lvsr), a series of regular vector loads (which always truncate // their input address to an aligned address), and a series of // permutations. The results of these permutations are the requested // loaded values. The trick is that the last "extra" load is not taken // from the address you might suspect (sizeof(vector) bytes after the // last requested load), but rather sizeof(vector) - 1 bytes after the // last requested vector. The point of this is to avoid a page fault if // the base address happened to be aligned. This works because if the // base address is aligned, then adding less than a full vector length // will cause the last vector in the sequence to be (re)loaded. // Otherwise, the next vector will be fetched as you might suspect was // necessary. // We might be able to reuse the permutation generation from // a different base address offset from this one by an aligned amount. // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this // optimization later. Intrinsic::ID Intr, IntrLD, IntrPerm; MVT PermCntlTy, PermTy, LDTy; if (Subtarget.hasAltivec()) { Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl; IntrLD = Intrinsic::ppc_altivec_lvx; IntrPerm = Intrinsic::ppc_altivec_vperm; PermCntlTy = MVT::v16i8; PermTy = MVT::v4i32; LDTy = MVT::v4i32; } else { Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : Intrinsic::ppc_qpx_qvlpcls; IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : Intrinsic::ppc_qpx_qvlfs; IntrPerm = Intrinsic::ppc_qpx_qvfperm; PermCntlTy = MVT::v4f64; PermTy = MVT::v4f64; LDTy = MemVT.getSimpleVT(); } SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); // Create the new MMO for the new base load. It is like the original MMO, // but represents an area in memory almost twice the vector size centered // on the original address. If the address is unaligned, we might start // reading up to (sizeof(vector)-1) bytes below the address of the // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = MF.getMachineMemOperand(LD->getMemOperand(), -(long)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. SDValue LDXIntID = DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue BaseLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(PermTy, MVT::Other), BaseLoadOps, LDTy, BaseMMO); // Note that the value of IncOffset (which is provided to the next // load's pointer info offset value, and thus used to calculate the // alignment), and the value of IncValue (which is actually used to // increment the pointer value) are different! This is because we // require the next load to appear to be aligned, even though it // is actually offset from the base pointer by a lesser amount. 
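      // [Editor's note] Concretely, for a 16-byte Altivec load both values
      // start at 16; IncValue may be reduced to 15 below, and because lvx
      // truncates its address to a 16-byte boundary, Ptr+15 still covers the
      // last requested byte without touching the following page when the base
      // happens to be aligned.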
int IncOffset = VT.getSizeInBits() / 8; int IncValue = IncOffset; // Walk (both up and down) the chain looking for another load at the real // (aligned) offset (the alignment of the other load does not matter in // this case). If found, then do not use the offset reduction trick, as // that will prevent the loads from being later combined (as they would // otherwise be duplicates). if (!findConsecutiveLoad(LD, DAG)) --IncValue; SDValue Increment = DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); MachineMemOperand *ExtraMMO = MF.getMachineMemOperand(LD->getMemOperand(), 1, 2*MemVT.getStoreSize()-1); SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue ExtraLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, DAG.getVTList(PermTy, MVT::Other), ExtraLoadOps, LDTy, ExtraMMO); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, BaseLoad.getValue(1), ExtraLoad.getValue(1)); // Because vperm has a big-endian bias, we must reverse the order // of the input vectors and complement the permute control vector // when generating little endian code. We have already handled the // latter by using lvsr instead of lvsl, so just reverse BaseLoad // and ExtraLoad here. SDValue Perm; if (isLittleEndian) Perm = BuildIntrinsicOp(IntrPerm, ExtraLoad, BaseLoad, PermCntl, DAG, dl); else Perm = BuildIntrinsicOp(IntrPerm, BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != PermTy) Perm = Subtarget.hasAltivec() ? DAG.getNode(ISD::BITCAST, dl, VT, Perm) : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX DAG.getTargetConstant(1, dl, MVT::i64)); // second argument is 1 because this rounding // is always exact. // The output of the permutation is our loaded result, the TokenFactor is // our new chain. DCI.CombineTo(N, Perm, TF); return SDValue(N, 0); } } break; case ISD::INTRINSIC_WO_CHAIN: { bool isLittleEndian = Subtarget.isLittleEndian(); unsigned IID = cast(N->getOperand(0))->getZExtValue(); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); if ((IID == Intr || IID == Intrinsic::ppc_qpx_qvlpcld || IID == Intrinsic::ppc_qpx_qvlpcls) && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; if (DAG.MaskedValueIsZero(Add->getOperand(1), APInt::getAllOnesValue(Bits /* alignment */) .zext(Add.getScalarValueSizeInBits()))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode::use_iterator UI = BasePtr->use_begin(), UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(UI->getOperand(0))->getZExtValue() == IID) { // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. 
return SDValue(*UI, 0); } } } if (isa(Add->getOperand(1))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode::use_iterator UI = BasePtr->use_begin(), UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::ADD && isa(UI->getOperand(1)) && (cast(Add->getOperand(1))->getZExtValue() - cast(UI->getOperand(1))->getZExtValue()) % (1ULL << Bits) == 0) { SDNode *OtherAdd = *UI; for (SDNode::use_iterator VI = OtherAdd->use_begin(), VE = OtherAdd->use_end(); VI != VE; ++VI) { if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(VI->getOperand(0))->getZExtValue() == IID) { return SDValue(*VI, 0); } } } } } } } break; case ISD::INTRINSIC_W_CHAIN: // For little endian, VSX loads require generating lxvd2x/xxswapd. // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. if (Subtarget.needsSwapsForVSXMemOps()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; case Intrinsic::ppc_vsx_lxvw4x: case Intrinsic::ppc_vsx_lxvd2x: return expandVSXLoadForLE(N, DCI); } } break; case ISD::INTRINSIC_VOID: // For little endian, VSX stores require generating xxswapd/stxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. if (Subtarget.needsSwapsForVSXMemOps()) { switch (cast(N->getOperand(1))->getZExtValue()) { default: break; case Intrinsic::ppc_vsx_stxvw4x: case Intrinsic::ppc_vsx_stxvd2x: return expandVSXStoreForLE(N, DCI); } } break; case ISD::BSWAP: // Turn BSWAP (LOAD) -> lhbrx/lwbrx. if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && N->getOperand(0).hasOneUse() && (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || (Subtarget.hasLDBRX() && Subtarget.isPPC64() && N->getValueType(0) == MVT::i64))) { SDValue Load = N->getOperand(0); LoadSDNode *LD = cast(Load); // Create the byte-swapping load. SDValue Ops[] = { LD->getChain(), // Chain LD->getBasePtr(), // Ptr DAG.getValueType(N->getValueType(0)) // VT }; SDValue BSLoad = DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, DAG.getVTList(N->getValueType(0) == MVT::i64 ? MVT::i64 : MVT::i32, MVT::Other), Ops, LD->getMemoryVT(), LD->getMemOperand()); // If this is an i16 load, insert the truncate. SDValue ResVal = BSLoad; if (N->getValueType(0) == MVT::i16) ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); // First, combine the bswap away. This makes the value produced by the // load dead. DCI.CombineTo(N, ResVal); // Next, combine the load away, we give it a bogus result value but a real // chain result. The result value is dead because the bswap is dead. DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); // Return N so it doesn't get rechecked! return SDValue(N, 0); } break; case PPCISD::VCMP: // If a VCMPo node already exists with exactly the same operands as this // node, use its result instead of this node (VCMPo computes both a CR6 and // a normal output). // if (!N->getOperand(0).hasOneUse() && !N->getOperand(1).hasOneUse() && !N->getOperand(2).hasOneUse()) { // Scan all of the users of the LHS, looking for VCMPo's that match. SDNode *VCMPoNode = nullptr; SDNode *LHSN = N->getOperand(0).getNode(); for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); UI != E; ++UI) if (UI->getOpcode() == PPCISD::VCMPo && UI->getOperand(1) == N->getOperand(1) && UI->getOperand(2) == N->getOperand(2) && UI->getOperand(0) == N->getOperand(0)) { VCMPoNode = *UI; break; } // If there is no VCMPo node, or if the flag value has a single use, don't // transform this. 
if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) break; // Look at the (necessarily single) use of the flag value. If it has a // chain, this transformation is more complex. Note that multiple things // could use the value result, which we should ignore. SDNode *FlagUser = nullptr; for (SDNode::use_iterator UI = VCMPoNode->use_begin(); FlagUser == nullptr; ++UI) { assert(UI != VCMPoNode->use_end() && "Didn't find user!"); SDNode *User = *UI; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { FlagUser = User; break; } } } // If the user is a MFOCRF instruction, we know this is safe. // Otherwise we give up for right now. if (FlagUser->getOpcode() == PPCISD::MFOCRF) return SDValue(VCMPoNode, 0); } break; case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(Cond.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero) { // We now need to make the intrinsic dead (it cannot be instruction // selected). DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); assert(Cond.getNode()->hasOneUse() && "Counter decrement has more than one use"); return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, N->getOperand(0), Target); } } break; case ISD::BR_CC: { // If this is a branch on an altivec predicate comparison, lower this so // that we don't have to do a MFOCRF: instead, branch directly on CR6. This // lowering is done pre-legalize, because the legalizer lowers the predicate // compare down to code that is difficult to reassemble. ISD::CondCode CC = cast(N->getOperand(1))->get(); SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); // Sometimes the promoted value of the intrinsic is ANDed by some non-zero // value. If so, pass-through the AND to get to the intrinsic. if (LHS.getOpcode() == ISD::AND && LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(0).getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa(LHS.getOperand(1)) && !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast(LHS.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa(RHS)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Counter decrement comparison is not EQ or NE"); unsigned Val = cast(RHS)->getZExtValue(); bool isBDNZ = (CC == ISD::SETEQ && Val) || (CC == ISD::SETNE && !Val); // We now need to make the intrinsic dead (it cannot be instruction // selected). DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); assert(LHS.getNode()->hasOneUse() && "Counter decrement has more than one use"); return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } int CompareOpc; bool isDot; if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know // that the condition is never/always true. unsigned Val = cast(RHS)->getZExtValue(); if (Val != 0 && Val != 1) { if (CC == ISD::SETEQ) // Cond never true, remove branch. return N->getOperand(0); // Always !=, turn it into an unconditional branch. 
return DAG.getNode(ISD::BR, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { LHS.getOperand(2), // LHS of compare LHS.getOperand(3), // RHS of compare DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Unpack the result based on how the target uses it. PPC::Predicate CompOpc; switch (cast(LHS.getOperand(1))->getZExtValue()) { default: // Can't happen, don't crash on invalid number though. case 0: // Branch on the value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; break; case 1: // Branch on the inverted value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ; break; case 2: // Branch on the value of the LT bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; break; case 3: // Branch on the inverted value of the LT bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; break; } return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), DAG.getConstant(CompOpc, dl, MVT::i32), DAG.getRegister(PPC::CR6, MVT::i32), N->getOperand(4), CompNode.getValue(1)); } break; } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); } return SDValue(); } SDValue PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if (VT == MVT::i64 && !Subtarget.isPPC64()) return SDValue(); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); bool IsNegPow2 = (-Divisor).isPowerOf2(); unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); Created.push_back(Op.getNode()); if (IsNegPow2) { Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); Created.push_back(Op.getNode()); } return Op; } //===----------------------------------------------------------------------===// // Inline Assembly Support //===----------------------------------------------------------------------===// void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { Known.resetAll(); switch (Op.getOpcode()) { default: break; case PPCISD::LBRX: { // lhbrx is known to have the top bits cleared out. 
if (cast(Op.getOperand(2))->getVT() == MVT::i16) Known.Zero = 0xFFFF0000; break; } case ISD::INTRINSIC_WO_CHAIN: { switch (cast(Op.getOperand(0))->getZExtValue()) { default: break; case Intrinsic::ppc_altivec_vcmpbfp_p: case Intrinsic::ppc_altivec_vcmpeqfp_p: case Intrinsic::ppc_altivec_vcmpequb_p: case Intrinsic::ppc_altivec_vcmpequh_p: case Intrinsic::ppc_altivec_vcmpequw_p: case Intrinsic::ppc_altivec_vcmpequd_p: case Intrinsic::ppc_altivec_vcmpgefp_p: case Intrinsic::ppc_altivec_vcmpgtfp_p: case Intrinsic::ppc_altivec_vcmpgtsb_p: case Intrinsic::ppc_altivec_vcmpgtsh_p: case Intrinsic::ppc_altivec_vcmpgtsw_p: case Intrinsic::ppc_altivec_vcmpgtsd_p: case Intrinsic::ppc_altivec_vcmpgtub_p: case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: case Intrinsic::ppc_altivec_vcmpgtud_p: Known.Zero = ~1U; // All bits but the low one are known to be zero. break; } } } } unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_PWR4: case PPC::DIR_PWR5: case PPC::DIR_PWR5X: case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: { if (!ML) break; const PPCInstrInfo *TII = Subtarget.getInstrInfo(); // For small loops (between 5 and 8 instructions), align to a 32-byte // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { LoopSize += TII->getInstSizeInBytes(*J); if (LoopSize > 32) break; } if (LoopSize > 16 && LoopSize <= 32) return 5; break; } } return TargetLowering::getPrefLoopAlignment(ML); } /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. PPCTargetLowering::ConstraintType PPCTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'b': case 'r': case 'f': case 'd': case 'v': case 'y': return C_RegisterClass; case 'Z': // FIXME: While Z does indicate a memory constraint, it specifically // indicates an r+r address (used in conjunction with the 'y' modifier // in the replacement string). Currently, we're forcing the base // register to be r0 in the asm printer (which is interpreted as zero) // and forming the complete address in the second register. This is // suboptimal. return C_Memory; } } else if (Constraint == "wc") { // individual CR bits. return C_RegisterClass; } else if (Constraint == "wa" || Constraint == "wd" || Constraint == "wf" || Constraint == "ws") { return C_RegisterClass; // VSX registers. } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight PPCTargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) return CW_Register; // an individual CR bit. 
else if ((StringRef(constraint) == "wa" || StringRef(constraint) == "wd" || StringRef(constraint) == "wf") && type->isVectorTy()) return CW_Register; else if (StringRef(constraint) == "ws" && type->isDoubleTy()) return CW_Register; switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'b': if (type->isIntegerTy()) weight = CW_Register; break; case 'f': if (type->isFloatTy()) weight = CW_Register; break; case 'd': if (type->isDoubleTy()) weight = CW_Register; break; case 'v': if (type->isVectorTy()) weight = CW_Register; break; case 'y': weight = CW_Register; break; case 'Z': weight = CW_Memory; break; } return weight; } std::pair PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); // 'd' and 'f' constraints are both defined to be "the floating point // registers", where one is for 32-bit and the other for 64-bit. We don't // really care overly much here so just give them all the same reg classes. case 'd': case 'f': if (Subtarget.hasSPE()) { if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &PPC::SPE4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::SPERCRegClass); } else { if (VT == MVT::f32 || VT == MVT::i32) return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); if (VT == MVT::v4f64 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': if (VT == MVT::v4f64 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } } else if (Constraint == "wc" && Subtarget.useCRBits()) { // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); } else if ((Constraint == "wa" || Constraint == "wd" || Constraint == "wf") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); } else if (Constraint == "ws" && Subtarget.hasVSX()) { if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else return std::make_pair(0U, &PPC::VSFRCRegClass); } std::pair R = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers // (which we call X[0-9]+). If a 64-bit value has been requested, and a // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent // register. // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use // the AsmName field from *RegisterInfo.td, then this would not be necessary. 
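  // [Editor's note] For example, an i64 operand constrained to "{r3}" resolves
  // to R3 above and is rewritten here to its 64-bit super-register X3 via the
  // sub_32 sub-register index.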
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) return std::make_pair(TRI->getMatchingSuperReg(R.first, PPC::sub_32, &PPC::G8RCRegClass), &PPC::G8RCRegClass); // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. if (!R.second && StringRef("{cc}").equals_lower(Constraint)) { R.first = PPC::CR0; R.second = &PPC::CRRCRegClass; } return R; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints. if (Constraint.length() > 1) return; char Letter = Constraint[0]; switch (Letter) { default: break; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': { ConstantSDNode *CST = dyn_cast(Op); if (!CST) return; // Must be an immediate to match. SDLoc dl(Op); int64_t Value = CST->getSExtValue(); EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative // numbers are printed as such. switch (Letter) { default: llvm_unreachable("Unknown constraint letter!"); case 'I': // "I" is a signed 16-bit constant. if (isInt<16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'J': // "J" is a constant with only the high-order 16 bits nonzero. if (isShiftedUInt<16, 16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. if (isShiftedInt<16, 16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'K': // "K" is a constant with only the low-order 16 bits nonzero. if (isUInt<16>(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'M': // "M" is a constant that is greater than 31. if (Value > 31) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'N': // "N" is a positive constant that is an exact power of two. if (Value > 0 && isPowerOf2_64(Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'O': // "O" is the constant zero. if (Value == 0) Result = DAG.getTargetConstant(Value, dl, TCVT); break; case 'P': // "P" is a constant whose negation is a signed 16-bit constant. if (isInt<16>(-Value)) Result = DAG.getTargetConstant(Value, dl, TCVT); break; } break; } } if (Result.getNode()) { Ops.push_back(Result); return; } // Handle standard constraint letters. TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // PPC does not allow r+i addressing modes for vectors! if (Ty->isVectorTy() && AM.BaseOffs != 0) return false; // PPC allows a sign-extended 16-bit immediate field. if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) return false; // No global is ever allowed as a base. if (AM.BaseGV) return false; // PPC only support r+r, switch (AM.Scale) { case 0: // "r+i" or just "i", depending on HasBaseReg. break; case 1: if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. return false; // Otherwise we have r+r or r+i. break; case 2: if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. return false; // Allow 2*r as r+r. break; default: // No other scales are supported. 
return false; } return true; } SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); // Make sure the function does not optimize away the store of the RA to // the stack. PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setLRStoreRequired(); bool isPPC64 = Subtarget.isPPC64(); auto PtrVT = getPointerTy(MF.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, isPPC64 ? MVT::i64 : MVT::i32); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); } // Just load the return address off the stack. SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo()); } SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); EVT PtrVT = getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Naked functions never have a frame pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned FrameReg; if (MF.getFunction().hasFnAttribute(Attribute::Naked)) FrameReg = isPPC64 ? PPC::X1 : PPC::R1; else FrameReg = isPPC64 ? PPC::FP8 : PPC::FP; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); while (Depth--) FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { bool isPPC64 = Subtarget.isPPC64(); bool isDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) report_fatal_error("Invalid register global variable type"); bool is64Bit = isPPC64 && VT == MVT::i64; unsigned Reg = StringSwitch(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : (is64Bit ? PPC::X13 : PPC::R13)) .Default(0); if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } bool PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The PowerPC target isn't yet aware of offsets. 
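  // (Illustrative consequence, not stated in the original comment: because this
  // hook returns false, DAG combining is expected to leave "@g + 8" as a
  // GlobalAddress node plus a separate ISD::ADD rather than folding the
  // constant into GlobalAddress(@g, 8).)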
return false; } bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfs: case Intrinsic::ppc_qpx_qvlfcd: case Intrinsic::ppc_qpx_qvlfcs: case Intrinsic::ppc_qpx_qvlfiwa: case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: case Intrinsic::ppc_altivec_lvehx: case Intrinsic::ppc_altivec_lvewx: case Intrinsic::ppc_vsx_lxvd2x: case Intrinsic::ppc_vsx_lxvw4x: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_lvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_lvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_lvewx: VT = MVT::i32; break; case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfd: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfs: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcd: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcs: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = VT; Info.ptrVal = I.getArgOperand(0); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvlfda: case Intrinsic::ppc_qpx_qvlfsa: case Intrinsic::ppc_qpx_qvlfcda: case Intrinsic::ppc_qpx_qvlfcsa: case Intrinsic::ppc_qpx_qvlfiwaa: case Intrinsic::ppc_qpx_qvlfiwza: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvlfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvlfcda: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvlfcsa: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = VT; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvstfd: case Intrinsic::ppc_qpx_qvstfs: case Intrinsic::ppc_qpx_qvstfcd: case Intrinsic::ppc_qpx_qvstfcs: case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: case Intrinsic::ppc_altivec_stvehx: case Intrinsic::ppc_altivec_stvewx: case Intrinsic::ppc_vsx_stxvd2x: case Intrinsic::ppc_vsx_stxvw4x: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_stvebx: VT = MVT::i8; break; case Intrinsic::ppc_altivec_stvehx: VT = MVT::i16; break; case Intrinsic::ppc_altivec_stvewx: VT = MVT::i32; break; case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfd: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfs: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcd: VT = MVT::v2f64; break; case Intrinsic::ppc_qpx_qvstfcs: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_VOID; Info.memVT = VT; Info.ptrVal = I.getArgOperand(1); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::ppc_qpx_qvstfda: case Intrinsic::ppc_qpx_qvstfsa: case Intrinsic::ppc_qpx_qvstfcda: case Intrinsic::ppc_qpx_qvstfcsa: case Intrinsic::ppc_qpx_qvstfiwa: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_qpx_qvstfda: VT = MVT::v4f64; break; case Intrinsic::ppc_qpx_qvstfsa: VT = MVT::v4f32; break; case Intrinsic::ppc_qpx_qvstfcda: VT = MVT::v2f64; 
break; case Intrinsic::ppc_qpx_qvstfcsa: VT = MVT::v2f32; break; default: VT = MVT::v4i32; break; } Info.opc = ISD::INTRINSIC_VOID; Info.memVT = VT; Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; Info.flags = MachineMemOperand::MOStore; return true; } default: break; } return false; } /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { const Function &F = MF.getFunction(); // When expanding a memset, require at least two QPX instructions to cover // the cost of loading the value to be stored from the constant pool. if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && !F.hasFnAttribute(Attribute::NoImplicitFloat)) { return MVT::v4f64; } // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Size >= 16 && (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) return MVT::v4i32; } if (Subtarget.isPPC64()) { return MVT::i64; } return MVT::i32; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); return !(BitSize == 0 || BitSize > 64); } bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 64 && NumBits2 == 32; } bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 64 && NumBits2 == 32; } bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // Generally speaking, zexts are not free, but they are free when they can be // folded with other operations. if (LoadSDNode *LD = dyn_cast(Val)) { EVT MemVT = LD->getMemoryVT(); if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || (Subtarget.isPPC64() && MemVT == MVT::i32)) && (LD->getExtensionType() == ISD::NON_EXTLOAD || LD->getExtensionType() == ISD::ZEXTLOAD)) return true; } // FIXME: Add other cases... // - 32-bit shifts with a zext to i64 // - zext after ctlz, bswap, etc. 
// - zext after and by a constant mask return TargetLowering::isZExtFree(Val, VT2); } bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const { assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && "invalid fpext types"); // Extending to float128 is not free. if (DestVT == MVT::f128) return false; return true; } bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isInt<16>(Imm) || isUInt<16>(Imm); } bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (DisablePPCUnaligned) return false; // PowerPC supports unaligned memory access for simple non-vector types. // Although accessing unaligned addresses is not as efficient as accessing // aligned addresses, it is generally more efficient than manual expansion, // and generally only traps for software emulation when crossing page // boundaries. if (!VT.isSimple()) return false; if (VT.getSimpleVT().isVector()) { if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64 && VT != MVT::v4f32 && VT != MVT::v4i32) return false; } else { return false; } } if (VT == MVT::ppcf128) return false; if (Fast) *Fast = true; return true; } bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; case MVT::f128: return (EnableQuadPrecision && Subtarget.hasP9Vector()); default: break; } return false; } const MCPhysReg * PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. The same reasoning applies // to CTR, which is used by any indirect call. static const MCPhysReg ScratchRegs[] = { PPC::X12, PPC::LR8, PPC::CTR8, 0 }; return ScratchRegs; } unsigned PPCTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; } unsigned PPCTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; } bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const { if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves if (Subtarget.hasVSX() || Subtarget.hasQPX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); } Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { if (DisableILPPref || Subtarget.enableMachineScheduler()) return TargetLowering::getSchedulingPreference(N); return Sched::ILP; } // Create a fast isel object. 
FastISel * PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const { return PPC::createFastISel(FuncInfo, LibInfo); } void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (Subtarget.isDarwinABI()) return; if (!Subtarget.isPPC64()) return; // Update IsSplitCSR in PPCFunctionInfo PPCFunctionInfo *PFI = Entry->getParent()->getInfo(); PFI->setIsSplitCSR(true); } void PPCTargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (PPC::G8RCRegClass.contains(*I)) RC = &PPC::G8RCRegClass; else if (PPC::F8RCRegClass.contains(*I)) RC = &PPC::F8RCRegClass; else if (PPC::CRRCRegClass.contains(*I)) RC = &PPC::CRRCRegClass; else if (PPC::VRRCRegClass.contains(*I)) RC = &PPC::VRRCRegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } // Override to enable LOAD_STACK_GUARD lowering on Linux. bool PPCTargetLowering::useLoadStackGuardNode() const { if (!Subtarget.isTargetLinux()) return TargetLowering::useLoadStackGuardNode(); return true; } // Override to disable global variable loading on Linux. void PPCTargetLowering::insertSSPDeclarations(Module &M) const { if (!Subtarget.isTargetLinux()) return TargetLowering::insertSSPDeclarations(M); } bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (!VT.isSimple() || !Subtarget.hasVSX()) return false; switch(VT.getSimpleVT().SimpleTy) { default: // For FP types that are currently not supported by PPC backend, return // false. Examples: f16, f80. 
return false; case MVT::f32: case MVT::f64: case MVT::ppcf128: return Imm.isPosZero(); } } // For vector shift operation op, fold // (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); unsigned Opcode = N->getOpcode(); unsigned TargetOpcode; switch (Opcode) { default: llvm_unreachable("Unexpected shift operation"); case ISD::SHL: TargetOpcode = PPCISD::SHL; break; case ISD::SRL: TargetOpcode = PPCISD::SRL; break; case ISD::SRA: TargetOpcode = PPCISD::SRA; break; } if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && N1->getOpcode() == ISD::AND) if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) if (Mask->getZExtValue() == OpSizeInBits - 1) return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); return SDValue(); } SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; return SDValue(); } SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; return SDValue(); } SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) return Value; return SDValue(); } bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // Only duplicate to increase tail-calls for the 64bit SysV ABIs. if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) return false; // If not a tail call then no need to proceed. if (!CI->isTailCall()) return false; // If tail calls are disabled for the caller then we are done. const Function *Caller = CI->getParent()->getParent(); auto Attr = Caller->getFnAttribute("disable-tail-calls"); if (Attr.getValueAsString() == "true") return false; // If sibling calls have been disabled and tail-calls aren't guaranteed // there is no reason to duplicate. auto &TM = getTargetMachine(); if (!TM.Options.GuaranteedTailCallOpt && DisableSCO) return false; // Can't tail call a function called indirectly, or if it has variadic args. const Function *Callee = CI->getCalledFunction(); if (!Callee || Callee->isVarArg()) return false; // Make sure the callee and caller calling conventions are eligible for tco. if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(), CI->getCallingConv())) return false; // If the function is local then we have a good chance at tail-calling it return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); } bool PPCTargetLowering:: isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { const Value *Mask = AndI.getOperand(1); // If the mask is suitable for andi. or andis. we should sink the and. if (const ConstantInt *CI = dyn_cast(Mask)) { // Can't handle constants wider than 64-bits. if (CI->getBitWidth() > 64) return false; int64_t ConstVal = CI->getZExtValue(); return isUInt<16>(ConstVal) || (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF)); } // For non-constant masks, we can always use the record-form and. 
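  // (Illustrative encoding note, not from the original comment: a check such as
  // "(x & 0xFF00) == 0" can be emitted as the record form "andi. rT, rS, 0xFF00",
  // which updates CR0 directly and removes the separate compare against zero;
  // a mask like 0x00FF0000 instead needs "andis." because each immediate form
  // only accepts nonzero bits in one 16-bit half.)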
return true; } Index: vendor/llvm/dist-release_70/lib/Target/X86/X86FlagsCopyLowering.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Target/X86/X86FlagsCopyLowering.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/Target/X86/X86FlagsCopyLowering.cpp (revision 338000) @@ -1,1055 +1,1060 @@ //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// /// \file /// /// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual /// flag bits. /// /// We have to do this by carefully analyzing and rewriting the usage of the /// copied EFLAGS register because there is no general way to rematerialize the /// entire EFLAGS register safely and efficiently. Using `popf` both forces /// dynamic stack adjustment and can create correctness issues due to IF, TF, /// and other non-status flags being overwritten. Using sequences involving /// SAHF don't work on all x86 processors and are often quite slow compared to /// directly testing a single status preserved in its own GPR. /// //===----------------------------------------------------------------------===// #include "X86.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include using namespace llvm; #define PASS_KEY "x86-flags-copy-lowering" #define DEBUG_TYPE PASS_KEY STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated"); STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); STATISTIC(NumTestsInserted, "Number of test instructions inserted"); STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); namespace llvm { void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); } // end namespace llvm namespace { // Convenient array type for storing registers associated with each condition. 
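// (Illustrative usage, not from the original comment: the array is indexed by
// X86::CondCode and each element holds the GR8 virtual register, if any, into
// which that condition has already been materialized, e.g.
//   unsigned &CondReg = CondRegs[X86::COND_B];
//   if (!CondReg)
//     CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
// as done in rewriteSetCarryExtended further down.)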
using CondRegArray = std::array; class X86FlagsCopyLoweringPass : public MachineFunctionPass { public: X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); } StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; /// Pass identification, replacement for typeid. static char ID; private: MachineRegisterInfo *MRI; + const X86Subtarget *Subtarget; const X86InstrInfo *TII; const TargetRegisterInfo *TRI; const TargetRegisterClass *PromoteRC; MachineDominatorTree *MDT; CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, MachineBasicBlock::iterator CopyDefI); unsigned promoteCondToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond); std::pair getCondOrInverseInReg(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs); void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, DebugLoc Loc, unsigned Reg); void rewriteArithmetic(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse, CondRegArray &CondRegs); void rewriteCMov(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &CMovI, MachineOperand &FlagUse, CondRegArray &CondRegs); void rewriteCondJmp(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs); void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, MachineInstr &CopyDefI); void rewriteSetCarryExtended(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, CondRegArray &CondRegs); void rewriteSetCC(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &SetCCI, MachineOperand &FlagUse, CondRegArray &CondRegs); }; } // end anonymous namespace INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE, "X86 EFLAGS copy lowering", false, false) INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE, "X86 EFLAGS copy lowering", false, false) FunctionPass *llvm::createX86FlagsCopyLoweringPass() { return new X86FlagsCopyLoweringPass(); } char X86FlagsCopyLoweringPass::ID = 0; void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } namespace { /// An enumeration of the arithmetic instruction mnemonics which have /// interesting flag semantics. /// /// We can map instruction opcodes into these mnemonics to make it easy to /// dispatch with specific functionality. 
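// (Illustrative expansion, not from the original comment: with the macros used
// by getMnemonicFromOpcode below, LLVM_EXPAND_INSTR_SIZES(ADC, rr) expands to
//   case X86::ADC8rr: case X86::ADC16rr: case X86::ADC32rr: case X86::ADC64rr:
// so each mnemonic's switch arm covers every operand width in one place.)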
enum class FlagArithMnemonic { ADC, ADCX, ADOX, RCL, RCR, SBB, }; } // namespace static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { switch (Opcode) { default: report_fatal_error("No support for lowering a copy into EFLAGS when used " "by this instruction!"); #define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \ case X86::MNEMONIC##8##SUFFIX: \ case X86::MNEMONIC##16##SUFFIX: \ case X86::MNEMONIC##32##SUFFIX: \ case X86::MNEMONIC##64##SUFFIX: #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ case X86::MNEMONIC##8ri: \ case X86::MNEMONIC##16ri8: \ case X86::MNEMONIC##32ri8: \ case X86::MNEMONIC##64ri8: \ case X86::MNEMONIC##16ri: \ case X86::MNEMONIC##32ri: \ case X86::MNEMONIC##64ri32: \ case X86::MNEMONIC##8mi: \ case X86::MNEMONIC##16mi8: \ case X86::MNEMONIC##32mi8: \ case X86::MNEMONIC##64mi8: \ case X86::MNEMONIC##16mi: \ case X86::MNEMONIC##32mi: \ case X86::MNEMONIC##64mi32: \ case X86::MNEMONIC##8i8: \ case X86::MNEMONIC##16i16: \ case X86::MNEMONIC##32i32: \ case X86::MNEMONIC##64i32: LLVM_EXPAND_ADC_SBB_INSTR(ADC) return FlagArithMnemonic::ADC; LLVM_EXPAND_ADC_SBB_INSTR(SBB) return FlagArithMnemonic::SBB; #undef LLVM_EXPAND_ADC_SBB_INSTR LLVM_EXPAND_INSTR_SIZES(RCL, rCL) LLVM_EXPAND_INSTR_SIZES(RCL, r1) LLVM_EXPAND_INSTR_SIZES(RCL, ri) return FlagArithMnemonic::RCL; LLVM_EXPAND_INSTR_SIZES(RCR, rCL) LLVM_EXPAND_INSTR_SIZES(RCR, r1) LLVM_EXPAND_INSTR_SIZES(RCR, ri) return FlagArithMnemonic::RCR; #undef LLVM_EXPAND_INSTR_SIZES case X86::ADCX32rr: case X86::ADCX64rr: case X86::ADCX32rm: case X86::ADCX64rm: return FlagArithMnemonic::ADCX; case X86::ADOX32rr: case X86::ADOX64rr: case X86::ADOX32rm: case X86::ADOX64rm: return FlagArithMnemonic::ADOX; } } static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, MachineInstr &SplitI, const X86InstrInfo &TII) { MachineFunction &MF = *MBB.getParent(); assert(SplitI.getParent() == &MBB && "Split instruction must be in the split block!"); assert(SplitI.isBranch() && "Only designed to split a tail of branch instructions!"); assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && "Must split on an actual jCC instruction!"); // Dig out the previous instruction to the split point. MachineInstr &PrevI = *std::prev(SplitI.getIterator()); assert(PrevI.isBranch() && "Must split after a branch!"); assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && "Must split after an actual jCC instruction!"); assert(!std::prev(PrevI.getIterator())->isTerminator() && "Must only have this one terminator prior to the split!"); // Grab the one successor edge that will stay in `MBB`. MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB(); // Analyze the original block to see if we are actually splitting an edge // into two edges. This can happen when we have multiple conditional jumps to // the same successor. bool IsEdgeSplit = std::any_of(SplitI.getIterator(), MBB.instr_end(), [&](MachineInstr &MI) { assert(MI.isTerminator() && "Should only have spliced terminators!"); return llvm::any_of( MI.operands(), [&](MachineOperand &MOp) { return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc; }); }) || MBB.getFallThrough() == &UnsplitSucc; MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); // Insert the new block immediately after the current one. Any existing // fallthrough will be sunk into this new block anyways. 
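  // (Illustrative before/after, not from the original comments: if MBB ends in
  //   jCC1 %bb.a; jCC2 %bb.b; JMP %bb.c
  // and SplitI is the jCC2, then after the splice MBB keeps only "jCC1 %bb.a"
  // and falls through into
  //   NewMBB: jCC2 %bb.b; JMP %bb.c
  // with successor edges and probabilities copied over below.)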
MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); // Splice the tail of instructions into the new block. NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end()); // Copy the necessary succesors (and their probability info) into the new // block. for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) if (IsEdgeSplit || *SI != &UnsplitSucc) NewMBB.copySuccessor(&MBB, SI); // Normalize the probabilities if we didn't end up splitting the edge. if (!IsEdgeSplit) NewMBB.normalizeSuccProbs(); // Now replace all of the moved successors in the original block with the new // block. This will merge their probabilities. for (MachineBasicBlock *Succ : NewMBB.successors()) if (Succ != &UnsplitSucc) MBB.replaceSuccessor(Succ, &NewMBB); // We should always end up replacing at least one successor. assert(MBB.isSuccessor(&NewMBB) && "Failed to make the new block a successor!"); // Now update all the PHIs. for (MachineBasicBlock *Succ : NewMBB.successors()) { for (MachineInstr &MI : *Succ) { if (!MI.isPHI()) break; for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; OpIdx += 2) { MachineOperand &OpV = MI.getOperand(OpIdx); MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); if (OpMBB.getMBB() != &MBB) continue; // Replace the operand for unsplit successors if (!IsEdgeSplit || Succ != &UnsplitSucc) { OpMBB.setMBB(&NewMBB); // We have to continue scanning as there may be multiple entries in // the PHI. continue; } // When we have split the edge append a new successor. MI.addOperand(MF, OpV); MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); break; } } } return NewMBB; } bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() << " **********\n"); - auto &Subtarget = MF.getSubtarget(); + Subtarget = &MF.getSubtarget(); MRI = &MF.getRegInfo(); - TII = Subtarget.getInstrInfo(); - TRI = Subtarget.getRegisterInfo(); + TII = Subtarget->getInstrInfo(); + TRI = Subtarget->getRegisterInfo(); MDT = &getAnalysis(); PromoteRC = &X86::GR8RegClass; if (MF.begin() == MF.end()) // Nothing to do for a degenerate empty function... return false; // Collect the copies in RPO so that when there are chains where a copy is in // turn copied again we visit the first one first. This ensures we can find // viable locations for testing the original EFLAGS that dominate all the // uses across complex CFGs. SmallVector Copies; ReversePostOrderTraversal RPOT(&MF); for (MachineBasicBlock *MBB : RPOT) for (MachineInstr &MI : *MBB) if (MI.getOpcode() == TargetOpcode::COPY && MI.getOperand(0).getReg() == X86::EFLAGS) Copies.push_back(&MI); for (MachineInstr *CopyI : Copies) { MachineBasicBlock &MBB = *CopyI->getParent(); MachineOperand &VOp = CopyI->getOperand(1); assert(VOp.isReg() && "The input to the copy for EFLAGS should always be a register!"); MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg()); if (CopyDefI.getOpcode() != TargetOpcode::COPY) { // FIXME: The big likely candidate here are PHI nodes. We could in theory // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard // enough that it is probably better to change every other part of LLVM // to avoid creating them. The issue is that once we have PHIs we won't // know which original EFLAGS value we need to capture with our setCCs // below. 
The end result will be computing a complete set of setCCs that // we *might* want, computing them in every place where we copy *out* of // EFLAGS and then doing SSA formation on all of them to insert necessary // PHI nodes and consume those here. Then hoping that somehow we DCE the // unnecessary ones. This DCE seems very unlikely to be successful and so // we will almost certainly end up with a glut of dead setCC // instructions. Until we have a motivating test case and fail to avoid // it by changing other parts of LLVM's lowering, we refuse to handle // this complex case here. LLVM_DEBUG( dbgs() << "ERROR: Encountered unexpected def of an eflags copy: "; CopyDefI.dump()); report_fatal_error( "Cannot lower EFLAGS copy unless it is defined in turn by a copy!"); } auto Cleanup = make_scope_exit([&] { // All uses of the EFLAGS copy are now rewritten, kill the copy into // eflags and if dead the copy from. CopyI->eraseFromParent(); if (MRI->use_empty(CopyDefI.getOperand(0).getReg())) CopyDefI.eraseFromParent(); ++NumCopiesEliminated; }); MachineOperand &DOp = CopyI->getOperand(0); assert(DOp.isDef() && "Expected register def!"); assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!"); if (DOp.isDead()) continue; MachineBasicBlock *TestMBB = CopyDefI.getParent(); auto TestPos = CopyDefI.getIterator(); DebugLoc TestLoc = CopyDefI.getDebugLoc(); LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump()); // Walk up across live-in EFLAGS to find where they were actually def'ed. // // This copy's def may just be part of a region of blocks covered by // a single def of EFLAGS and we want to find the top of that region where // possible. // // This is essentially a search for a *candidate* reaching definition // location. We don't need to ever find the actual reaching definition here, // but we want to walk up the dominator tree to find the highest point which // would be viable for such a definition. auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End) { // Scan backwards as we expect these to be relatively short and often find // a clobber near the end. return llvm::any_of( llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) { // Flag any instruction (other than the copy we are // currently rewriting) that defs EFLAGS. return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS); }); }; auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB, MachineBasicBlock *EndMBB) { assert(MDT->dominates(BeginMBB, EndMBB) && "Only support paths down the dominator tree!"); SmallPtrSet Visited; SmallVector Worklist; // We terminate at the beginning. No need to scan it. Visited.insert(BeginMBB); Worklist.push_back(EndMBB); do { auto *MBB = Worklist.pop_back_val(); for (auto *PredMBB : MBB->predecessors()) { if (!Visited.insert(PredMBB).second) continue; if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end())) return true; // Enqueue this block to walk its predecessors. Worklist.push_back(PredMBB); } } while (!Worklist.empty()); // No clobber found along a path from the begin to end. return false; }; while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() && !HasEFLAGSClobber(TestMBB->begin(), TestPos)) { // Find the nearest common dominator of the predecessors, as // that will be the best candidate to hoist into. 
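      // (Illustrative, not from the original comment: the accumulate below
      // left-folds findNearestCommonDominator over the predecessor list, so
      // for predecessors {A, B, C} it computes NCD(NCD(A, B), C).)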
MachineBasicBlock *HoistMBB = std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(), *TestMBB->pred_begin(), [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) { return MDT->findNearestCommonDominator(LHS, RHS); }); // Now we need to scan all predecessors that may be reached along paths to // the hoist block. A clobber anywhere in any of these blocks the hoist. // Note that this even handles loops because we require *no* clobbers. if (HasEFLAGSClobberPath(HoistMBB, TestMBB)) break; // We also need the terminators to not sneakily clobber flags. if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(), HoistMBB->instr_end())) break; // We found a viable location, hoist our test position to it. TestMBB = HoistMBB; TestPos = TestMBB->getFirstTerminator()->getIterator(); // Clear the debug location as it would just be confusing after hoisting. TestLoc = DebugLoc(); } LLVM_DEBUG({ auto DefIt = llvm::find_if( llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)), [&](MachineInstr &MI) { return MI.findRegisterDefOperand(X86::EFLAGS); }); if (DefIt.base() != TestMBB->instr_begin()) { dbgs() << " Using EFLAGS defined by: "; DefIt->dump(); } else { dbgs() << " Using live-in flags for BB:\n"; TestMBB->dump(); } }); // While rewriting uses, we buffer jumps and rewrite them in a second pass // because doing so will perturb the CFG that we are walking to find the // uses in the first place. SmallVector JmpIs; // Gather the condition flags that have already been preserved in // registers. We do this from scratch each time as we expect there to be // very few of them and we expect to not revisit the same copy definition // many times. If either of those change sufficiently we could build a map // of these up front instead. CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos); // Collect the basic blocks we need to scan. Typically this will just be // a single basic block but we may have to scan multiple blocks if the // EFLAGS copy lives into successors. SmallVector Blocks; SmallPtrSet VisitedBlocks; Blocks.push_back(&MBB); do { MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); // Track when if/when we find a kill of the flags in this block. bool FlagsKilled = false; // In most cases, we walk from the beginning to the end of the block. But // when the block is the same block as the copy is from, we will visit it // twice. The first time we start from the copy and go to the end. The // second time we start from the beginning and go to the copy. This lets // us handle copies inside of cycles. // FIXME: This loop is *super* confusing. This is at least in part // a symptom of all of this routine needing to be refactored into // documentable components. Once done, there may be a better way to write // this loop. for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB)) ? std::next(CopyI->getIterator()) : UseMBB.instr_begin(), MIE = UseMBB.instr_end(); MII != MIE;) { MachineInstr &MI = *MII++; // If we are in the original copy block and encounter either the copy // def or the copy itself, break so that we don't re-process any part of // the block or process the instructions in the range that was copied // over. 
if (&MI == CopyI || &MI == &CopyDefI) { assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) && "Should only encounter these on the second pass over the " "original block."); break; } MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); if (!FlagUse) { if (MI.findRegisterDefOperand(X86::EFLAGS)) { // If EFLAGS are defined, it's as-if they were killed. We can stop // scanning here. // // NB!!! Many instructions only modify some flags. LLVM currently // models this as clobbering all flags, but if that ever changes // this will need to be carefully updated to handle that more // complex logic. FlagsKilled = true; break; } continue; } LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump()); // Check the kill flag before we rewrite as that may change it. if (FlagUse->isKill()) FlagsKilled = true; // Once we encounter a branch, the rest of the instructions must also be // branches. We can't rewrite in place here, so we handle them below. // // Note that we don't have to handle tail calls here, even conditional // tail calls, as those are not introduced into the X86 MI until post-RA // branch folding or black placement. As a consequence, we get to deal // with the simpler formulation of conditional branches followed by tail // calls. if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { auto JmpIt = MI.getIterator(); do { JmpIs.push_back(&*JmpIt); ++JmpIt; } while (JmpIt != UseMBB.instr_end() && X86::getCondFromBranchOpc(JmpIt->getOpcode()) != X86::COND_INVALID); break; } // Otherwise we can just rewrite in-place. if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) { rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); } else if (MI.getOpcode() == TargetOpcode::COPY) { rewriteCopy(MI, *FlagUse, CopyDefI); } else { // We assume all other instructions that use flags also def them. assert(MI.findRegisterDefOperand(X86::EFLAGS) && "Expected a def of EFLAGS for this instruction!"); // NB!!! Several arithmetic instructions only *partially* update // flags. Theoretically, we could generate MI code sequences that // would rely on this fact and observe different flags independently. // But currently LLVM models all of these instructions as clobbering // all the flags in an undef way. We rely on that to simplify the // logic. FlagsKilled = true; switch (MI.getOpcode()) { case X86::SETB_C8r: case X86::SETB_C16r: case X86::SETB_C32r: case X86::SETB_C64r: // Use custom lowering for arithmetic that is merely extending the // carry flag. We model this as the SETB_C* pseudo instructions. rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); break; default: // Generically handle remaining uses as arithmetic instructions. rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); break; } break; } // If this was the last use of the flags, we're done. if (FlagsKilled) break; } // If the flags were killed, we're done with this block. if (FlagsKilled) continue; // Otherwise we need to scan successors for ones where the flags live-in // and queue those up for processing. for (MachineBasicBlock *SuccMBB : UseMBB.successors()) if (SuccMBB->isLiveIn(X86::EFLAGS) && VisitedBlocks.insert(SuccMBB).second) { // We currently don't do any PHI insertion and so we require that the // test basic block dominates all of the use basic blocks. 
Further, we // can't have a cycle from the test block back to itself as that would // create a cycle requiring a PHI to break it. // // We could in theory do PHI insertion here if it becomes useful by // just taking undef values in along every edge that we don't trace // this EFLAGS copy along. This isn't as bad as fully general PHI // insertion, but still seems like a great deal of complexity. // // Because it is theoretically possible that some earlier MI pass or // other lowering transformation could induce this to happen, we do // a hard check even in non-debug builds here. if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) { LLVM_DEBUG({ dbgs() << "ERROR: Encountered use that is not dominated by our test " "basic block! Rewriting this would require inserting PHI " "nodes to track the flag state across the CFG.\n\nTest " "block:\n"; TestMBB->dump(); dbgs() << "Use block:\n"; SuccMBB->dump(); }); report_fatal_error( "Cannot lower EFLAGS copy when original copy def " "does not dominate all uses."); } Blocks.push_back(SuccMBB); } } while (!Blocks.empty()); // Now rewrite the jumps that use the flags. These we handle specially // because if there are multiple jumps in a single basic block we'll have // to do surgery on the CFG. MachineBasicBlock *LastJmpMBB = nullptr; for (MachineInstr *JmpI : JmpIs) { // Past the first jump within a basic block we need to split the blocks // apart. if (JmpI->getParent() == LastJmpMBB) splitBlock(*JmpI->getParent(), *JmpI, *TII); else LastJmpMBB = JmpI->getParent(); rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs); } // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if // the copy's def operand is itself a kill. } #ifndef NDEBUG for (MachineBasicBlock &MBB : MF) for (MachineInstr &MI : MBB) if (MI.getOpcode() == TargetOpcode::COPY && (MI.getOperand(0).getReg() == X86::EFLAGS || MI.getOperand(1).getReg() == X86::EFLAGS)) { LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump()); llvm_unreachable("Unlowered EFLAGS copy!"); } #endif return true; } /// Collect any conditions that have already been set in registers so that we /// can re-use them rather than adding duplicates. CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) { CondRegArray CondRegs = {}; // Scan backwards across the range of instructions with live EFLAGS. for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && TRI->isVirtualRegister(MI.getOperand(0).getReg())) { assert(MI.getOperand(0).isDef() && "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); } // Stop scanning when we see the first definition of the EFLAGS as prior to // this we would potentially capture the wrong flag state. 
if (MI.findRegisterDefOperand(X86::EFLAGS)) break; } return CondRegs; } unsigned X86FlagsCopyLoweringPass::promoteCondToReg( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond) { unsigned Reg = MRI->createVirtualRegister(PromoteRC); auto SetI = BuildMI(TestMBB, TestPos, TestLoc, TII->get(X86::getSETFromCond(Cond)), Reg); (void)SetI; LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump()); ++NumSetCCsInserted; return Reg; } std::pair X86FlagsCopyLoweringPass::getCondOrInverseInReg( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) { unsigned &CondReg = CondRegs[Cond]; unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)]; if (!CondReg && !InvCondReg) CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); if (CondReg) return {CondReg, false}; else return {InvCondReg, true}; } void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos, DebugLoc Loc, unsigned Reg) { auto TestI = BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg); (void)TestI; LLVM_DEBUG(dbgs() << " test cond: "; TestI->dump()); ++NumTestsInserted; } void X86FlagsCopyLoweringPass::rewriteArithmetic( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse, CondRegArray &CondRegs) { // Arithmetic is either reading CF or OF. Figure out which condition we need // to preserve in a register. X86::CondCode Cond; // The addend to use to reset CF or OF when added to the flag value. int Addend; switch (getMnemonicFromOpcode(MI.getOpcode())) { case FlagArithMnemonic::ADC: case FlagArithMnemonic::ADCX: case FlagArithMnemonic::RCL: case FlagArithMnemonic::RCR: case FlagArithMnemonic::SBB: Cond = X86::COND_B; // CF == 1 // Set up an addend that when one is added will need a carry due to not // having a higher bit available. Addend = 255; break; case FlagArithMnemonic::ADOX: Cond = X86::COND_O; // OF == 1 // Set up an addend that when one is added will turn from positive to // negative and thus overflow in the signed domain. Addend = 127; break; } // Now get a register that contains the value of the flag input to the // arithmetic. We require exactly this flag to simplify the arithmetic // required to materialize it back into the flag. unsigned &CondReg = CondRegs[Cond]; if (!CondReg) CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); MachineBasicBlock &MBB = *MI.getParent(); // Insert an instruction that will set the flag back to the desired value. unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); auto AddI = BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) .addDef(TmpReg, RegState::Dead) .addReg(CondReg) .addImm(Addend); (void)AddI; LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump()); ++NumAddsInserted; FlagUse.setIsKill(true); } void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &CMovI, MachineOperand &FlagUse, CondRegArray &CondRegs) { // First get the register containing this specific condition. X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); unsigned CondReg; bool Inverted; std::tie(CondReg, Inverted) = getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); MachineBasicBlock &MBB = *CMovI.getParent(); // Insert a direct test of the saved register. 
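  // (Illustrative result, not from the original comments: a CMOVE32rr that read
  // the copied flags becomes
  //   TEST8rr %cond, %cond
  //   CMOVNE32rr ...
  // when %cond holds the saved COND_E value, or keeps the CMOVE32rr opcode when
  // only the inverse condition was available, per getCondOrInverseInReg above.)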
insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); // Rewrite the CMov to use the !ZF flag from the test (but match register // size and memory operand), and then kill its use of the flags afterward. auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); CMovI.setDesc(TII->get(X86::getCMovFromCond( Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, !CMovI.memoperands_empty()))); FlagUse.setIsKill(true); LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); } void X86FlagsCopyLoweringPass::rewriteCondJmp( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { // First get the register containing this specific condition. X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); unsigned CondReg; bool Inverted; std::tie(CondReg, Inverted) = getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); MachineBasicBlock &JmpMBB = *JmpI.getParent(); // Insert a direct test of the saved register. insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg); // Rewrite the jump to use the !ZF flag from the test, and kill its use of // flags afterward. JmpI.setDesc(TII->get( X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); const int ImplicitEFLAGSOpIdx = 1; JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); } void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, MachineInstr &CopyDefI) { // Just replace this copy with the original copy def. MRI->replaceRegWith(MI.getOperand(0).getReg(), CopyDefI.getOperand(0).getReg()); MI.eraseFromParent(); } void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, CondRegArray &CondRegs) { // This routine is only used to handle pseudos for setting a register to zero // or all ones based on CF. This is essentially the sign extended from 1-bit // form of SETB and modeled with the SETB_C* pseudos. They require special // handling as they aren't normal SETcc instructions and are lowered to an // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that // they are only provided in reg-defining forms. A complicating factor is that // they can define many different register widths. assert(SetBI.getOperand(0).isReg() && "Cannot have a non-register defined operand to this variant of SETB!"); // Little helper to do the common final step of replacing the register def'ed // by this SETB instruction with a new register and removing the SETB // instruction. auto RewriteToReg = [&](unsigned Reg) { MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg); SetBI.eraseFromParent(); }; // Grab the register class used for this particular instruction. 
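  // (Illustrative overall effect, not from the original comments, with register
  // names hypothetical: a flag-using "%d:gr32 = SETB_C32r" ends up roughly as
  //   %c:gr8  = <setcc of COND_B, reused or created at the test point>
  //   %z:gr32 = MOV32r0
  //   %d:gr32 = SUB32rr %z, <%c zero-extended to 32 bits>
  // i.e. %d = 0 - CF, which materializes 0 or all-ones, matching the
  // sign-extended carry that the SETB_C* pseudos model.)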
auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg()); MachineBasicBlock &MBB = *SetBI.getParent(); auto SetPos = SetBI.getIterator(); auto SetLoc = SetBI.getDebugLoc(); auto AdjustReg = [&](unsigned Reg) { auto &OrigRC = *MRI->getRegClass(Reg); if (&OrigRC == &SetBRC) return Reg; unsigned NewReg; int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8; int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8; assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!"); assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!"); int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit, X86::NoSubRegister, X86::sub_32bit}; // If the original size is smaller than the target *and* is smaller than 4 // bytes, we need to explicitly zero extend it. We always extend to 4-bytes // to maximize the chance of being able to CSE that operation and to avoid // partial dependency stalls extending to 2-bytes. if (OrigRegSize < TargetRegSize && OrigRegSize < 4) { NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg) .addReg(Reg); if (&SetBRC == &X86::GR32RegClass) return NewReg; Reg = NewReg; OrigRegSize = 4; } NewReg = MRI->createVirtualRegister(&SetBRC); if (OrigRegSize < TargetRegSize) { BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG), NewReg) .addImm(0) .addReg(Reg) .addImm(SubRegIdx[OrigRegSize]); } else if (OrigRegSize > TargetRegSize) { - BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::EXTRACT_SUBREG), + if (TargetRegSize == 1 && !Subtarget->is64Bit()) { + // Need to constrain the register class. + MRI->constrainRegClass(Reg, &X86::GR32_ABCDRegClass); + } + + BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) - .addReg(Reg) - .addImm(SubRegIdx[TargetRegSize]); + .addReg(Reg, 0, SubRegIdx[TargetRegSize]); } else { BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) .addReg(Reg); } return NewReg; }; unsigned &CondReg = CondRegs[X86::COND_B]; if (!CondReg) CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B); // Adjust the condition to have the desired register width by zero-extending // as needed. // FIXME: We should use a better API to avoid the local reference and using a // different variable here. unsigned ExtCondReg = AdjustReg(CondReg); // Now we need to turn this into a bitmask. We do this by subtracting it from // zero. unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); ZeroReg = AdjustReg(ZeroReg); unsigned Sub; switch (SetBI.getOpcode()) { case X86::SETB_C8r: Sub = X86::SUB8rr; break; case X86::SETB_C16r: Sub = X86::SUB16rr; break; case X86::SETB_C32r: Sub = X86::SUB32rr; break; case X86::SETB_C64r: Sub = X86::SUB64rr; break; default: llvm_unreachable("Invalid SETB_C* opcode!"); } unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) .addReg(ZeroReg) .addReg(ExtCondReg); return RewriteToReg(ResultReg); } void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &SetCCI, MachineOperand &FlagUse, CondRegArray &CondRegs) { X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); // Note that we can't usefully rewrite this to the inverse without complex // analysis of the users of the setCC. Largely we rely on duplicates which // could have been avoided already being avoided here. 
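  // (Illustrative, not from the original comment: a register-defining
  // "%d = SETAr" that used the copied flags is erased below and every use of %d
  // is redirected to the GR8 register saved at the test point, while the memory
  // form is rewritten into a MOV8mr store of that saved register.)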
unsigned &CondReg = CondRegs[Cond]; if (!CondReg) CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond); // Rewriting a register def is trivial: we just replace the register and // remove the setcc. if (!SetCCI.mayStore()) { assert(SetCCI.getOperand(0).isReg() && "Cannot have a non-register defined operand to SETcc!"); MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg); SetCCI.eraseFromParent(); return; } // Otherwise, we need to emit a store. auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(), SetCCI.getDebugLoc(), TII->get(X86::MOV8mr)); // Copy the address operands. for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(SetCCI.getOperand(i)); MIB.addReg(CondReg); MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end()); SetCCI.eraseFromParent(); return; } Index: vendor/llvm/dist-release_70/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- vendor/llvm/dist-release_70/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 337999) +++ vendor/llvm/dist-release_70/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 338000) @@ -1,6486 +1,6480 @@ //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass implements the Bottom Up SLP vectorizer. It detects consecutive // stores that can be put together into vector-stores. Next, it attempts to // construct vectorizable tree using the use-def chains. If a profitable tree // was found, the SLP vectorizer performs vectorization on the tree. // // The pass is inspired by the work described in the paper: // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return false;
    if (BB != I->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (Value *i : VL)
    if (!isa<Constant>(i))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
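/// For example, {%a, %a, %a, %a} is a splat, while {%a, %b, %a, %a} is not.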
static bool isSplat(ArrayRef VL) { for (unsigned i = 1, e = VL.size(); i < e; ++i) if (VL[i] != VL[0]) return false; return true; } /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 /// %x0x0 = mul i8 %x0, %x0 /// %x3x3 = mul i8 %x3, %x3 /// %y1y1 = mul i8 %y1, %y1 /// %y2y2 = mul i8 %y2, %y2 /// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 /// ret <4 x i8> %ins4 /// can be transformed into: /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 /// We convert this initially to something like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 /// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 /// %5 = mul <4 x i8> %4, %4 /// %6 = extractelement <4 x i8> %5, i32 0 /// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0 /// %7 = extractelement <4 x i8> %5, i32 1 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 /// %8 = extractelement <4 x i8> %5, i32 2 /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 /// %9 = extractelement <4 x i8> %5, i32 3 /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 /// ret <4 x i8> %ins4 /// InstCombiner transforms this into a shuffle and vector mul /// TODO: Can we split off and reuse the shuffle mask detection from /// TargetTransformInfo::getInstructionThroughput? static Optional isShuffle(ArrayRef VL) { auto *EI0 = cast(VL[0]); unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; for (unsigned I = 0, E = VL.size(); I < E; ++I) { auto *EI = cast(VL[I]); auto *Vec = EI->getVectorOperand(); // All vector operands must have the same number of vector elements. if (Vec->getType()->getVectorNumElements() != Size) return None; auto *Idx = dyn_cast(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. if (Idx->getValue().uge(Size)) continue; unsigned IntIdx = Idx->getValue().getZExtValue(); // We can extractelement from undef vector. if (isa(Vec)) continue; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. if (!Vec1 || Vec1 == Vec) Vec1 = Vec; else if (!Vec2 || Vec2 == Vec) Vec2 = Vec; else return None; if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a // permutation. if (IntIdx != I) { CommonShuffleMode = Permute; continue; } CommonShuffleMode = Select; } // If we're not crossing lanes in different vectors, consider it as blending. if (CommonShuffleMode == Select && Vec2) return TargetTransformInfo::SK_Select; // If Vec2 was never used, we have a permutation of a single vector, otherwise // we have permutation of 2 vectors. return Vec2 ? 
TargetTransformInfo::SK_PermuteTwoSrc : TargetTransformInfo::SK_PermuteSingleSrc; } namespace { /// Main data required for vectorization of instructions. struct InstructionsState { /// The very first instruction in the list with the main opcode. Value *OpValue = nullptr; /// The main/alternate instruction. Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; /// The main/alternate opcodes for the list of instructions. unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; } unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; } /// Some of the instructions in the list have alternate opcodes. bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } InstructionsState() = delete; InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} }; } // end anonymous namespace /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p /// OpValue. static Value *isOneOf(const InstructionsState &S, Value *Op) { auto *I = dyn_cast(Op); if (I && S.isOpcodeOrAlt(I)) return Op; return S.OpValue; } /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); if (IsBinOp && isa(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; } } else if (IsCastOp && isa(VL[Cnt])) { Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); if (Ty0 == Ty1) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; } } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; return InstructionsState(VL[BaseIndex], nullptr, nullptr); } return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), cast(VL[AltIndex])); } /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) return false; return true; } /// \returns True if Extract{Value,Element} instruction extracts element Idx. 
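/// For example, `extractelement <4 x i32> %v, i32 2` yields 2 and
/// `extractvalue {i32, i32} %agg, 1` yields 1; a non-constant extractelement
/// index, or an extractvalue with more than one index, yields None.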
static Optional getExtractIndex(Instruction *E) { unsigned Opcode = E->getOpcode(); assert((Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue) && "Expected extractelement or extractvalue instruction."); if (Opcode == Instruction::ExtractElement) { auto *CI = dyn_cast(E->getOperand(1)); if (!CI) return None; return CI->getZExtValue(); } ExtractValueInst *EI = cast(E); if (EI->getNumIndices() != 1) return None; return *EI->idx_begin(); } /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI) { unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { LoadInst *LI = cast(UserInst); return (LI->getPointerOperand() == Scalar); } case Instruction::Store: { StoreInst *SI = cast(UserInst); return (SI->getPointerOperand() == Scalar); } case Instruction::Call: { CallInst *CI = cast(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (hasVectorInstrinsicScalarOpd(ID, 1)) { return (CI->getArgOperand(1) == Scalar); } LLVM_FALLTHROUGH; } default: return false; } } /// \returns the AA location that is being access by the instruction. static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) { if (StoreInst *SI = dyn_cast(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast(I)) return MemoryLocation::get(LI); return MemoryLocation(); } /// \returns True if the instruction is not a volatile or atomic load/store. static bool isSimple(Instruction *I) { if (LoadInst *LI = dyn_cast(I)) return LI->isSimple(); if (StoreInst *SI = dyn_cast(I)) return SI->isSimple(); if (MemIntrinsic *MI = dyn_cast(I)) return !MI->isVolatile(); return true; } namespace llvm { namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { public: using ValueList = SmallVector; using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; using ExtraValueToDebugLocsMap = MapVector>; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. // TODO: It would be better to limit the vectorization factor based on // data type rather than just register size. For example, x86 AVX has // 256-bit registers, but it does not support integer operations // at that width (that requires AVX2). if (MaxVectorRegSizeOption.getNumOccurrences()) MaxVecRegSize = MaxVectorRegSizeOption; else MaxVecRegSize = TTI->getRegisterBitWidth(true); if (MinVectorRegSizeOption.getNumOccurrences()) MinVecRegSize = MinVectorRegSizeOption; else MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); } /// Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. Value *vectorizeTree(); /// Vectorize the tree but with the list of externally used values \p /// ExternallyUsedValues. Values in this MapVector can be replaced but the /// generated extractvalue instructions. 
Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. int getSpillCost(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. int getTreeCost(); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst = None); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking /// into account (anf updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); } MinBWs.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); /// \returns The best order of instructions for vectorization. Optional> bestOrder() const { auto I = std::max_element( NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, const decltype(NumOpsWantToKeepOrder)::value_type &D2) { return D1.second < D2.second; }); if (I == NumOpsWantToKeepOrder.end() || I->getSecond() <= NumOpsWantToKeepOriginalOrder) return None; return makeArrayRef(I->getFirst()); } /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. unsigned getVectorElementSize(Value *V); /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. void computeMinimumValueSizes(); // \returns maximum vector register size as set by TTI or overridden by cl::opt. unsigned getMaxVecRegSize() const { return MaxVecRegSize; } // \returns minimum vector register size as set by cl::opt. unsigned getMinVecRegSize() const { return MinVecRegSize; } /// Check if ArrayType or StructType is isomorphic to some VectorType. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); OptimizationRemarkEmitter *getORE() { return ORE; } private: struct TreeEntry; /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; /// \returns the cost of the vectorizable entry. int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. 
void buildTree_rec(ArrayRef Roots, unsigned Depth, int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. bool canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef VL); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty, const DenseSet &ShuffledIndices); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. int getGatherCost(ArrayRef VL); /// Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(ArrayRef VL, const InstructionsState &S); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. void reorderAltShuffleOperands(const InstructionsState &S, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { TreeEntry(std::vector &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { if (VL.size() == Scalars.size()) return std::equal(VL.begin(), VL.end(), Scalars.begin()); return VL.size() == ReuseShuffleIndices.size() && std::equal( VL.begin(), VL.end(), ReuseShuffleIndices.begin(), [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); } /// A vector of scalars. ValueList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? bool NeedToGather = false; /// Does this sequence require some shuffling? SmallVector ReuseShuffleIndices; /// Does this entry require reordering? ArrayRef ReorderIndices; /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has /// to be a pointer and needs to be able to initialize the child iterator. /// Thus we need a reference back to the container to translate the indices /// to entries. std::vector &Container; /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; }; /// Create a new VectorizableTree entry. 
void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = idx; } } else { MustGather.insert(VL.begin(), VL.end()); } if (UserTreeIdx >= 0) Last->UserTreeIndices.push_back(UserTreeIdx); UserTreeIdx = idx; } /// -- Vectorization State -- /// Holds all of the tree entries. std::vector VectorizableTree; TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) return &VectorizableTree[I->second]; return nullptr; } /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) : Scalar(S), User(U), Lane(L) {} // Which scalar in our function. Value *Scalar; // Which user that uses the scalar. llvm::User *User; // Which lane does the scalar belong to. int Lane; }; using UserList = SmallVector; /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it /// is invariant in the calling loop. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional &result = AliasCache[key]; if (result.hasValue()) { return result.getValue(); } MemoryLocation Loc2 = getLocation(Inst2, AA); bool aliased = true; if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { // Do the alias check. aliased = AA->alias(Loc1, Loc2); } // Store the result in the cache. result = aliased; return aliased; } using AliasCacheKey = std::pair; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. DenseMap> AliasCache; /// Removes an instruction from its block and eventually deletes it. /// It's like Instruction::eraseFromParent() except that the actual deletion /// is delayed until BoUpSLP is destructed. /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. void eraseInstruction(Instruction *I) { I->removeFromParent(); I->dropAllReferences(); DeletedInstructions.emplace_back(I); } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. SmallVector DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User /// can be nullptr, it means that this Internal Scalar will be used later, /// after vectorization. UserList ExternalUses; /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; /// Holds all of the instructions that we gathered. 
SetVector GatherSeq; /// A list of blocks that we are going to CSE. SetVector CSEBlocks; /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a /// vector instruction). struct ScheduleData { // The initial value for the dependency counters. It means that the // dependencies are not calculated yet. enum { InvalidDeps = -1 }; ScheduleData() = default; void init(int BlockSchedulingRegionID, Value *OpVal) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; } /// Returns true if the dependency information has been calculated. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } /// Returns true for single instructions and for bundle representatives /// (= the head of a bundle). bool isSchedulingEntity() const { return FirstInBundle == this; } /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { return NextInBundle != nullptr || FirstInBundle != this; } /// Returns true if it is ready for scheduling, i.e. it has no more /// unscheduled depending instructions/bundles. bool isReady() const { assert(isSchedulingEntity() && "can't consider non-scheduling entity for ready list"); return UnscheduledDepsInBundle == 0 && !IsScheduled; } /// Modifies the number of unscheduled dependencies, also updating it for /// the whole bundle. int incrementUnscheduledDeps(int Incr) { UnscheduledDeps += Incr; return FirstInBundle->UnscheduledDepsInBundle += Incr; } /// Sets the number of unscheduled dependencies to the number of /// dependencies. void resetUnscheduledDeps() { incrementUnscheduledDeps(Dependencies - UnscheduledDeps); } /// Clears all dependency information. void clearDependencies() { Dependencies = InvalidDeps; resetUnscheduledDeps(); MemoryDependencies.clear(); } void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { os << "/ " << *Inst; } else if (NextInBundle) { os << '[' << *Inst; ScheduleData *SD = NextInBundle; while (SD) { os << ';' << *SD->Inst; SD = SD->NextInBundle; } os << ']'; } else { os << *Inst; } } Instruction *Inst = nullptr; /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; /// Single linked list of all instructions in a bundle. Null if it is a /// single instruction. ScheduleData *NextInBundle = nullptr; /// Single linked list of all memory instructions (e.g. load, store, call) /// in the block - until the end of the scheduling region. ScheduleData *NextLoadStore = nullptr; /// The dependent memory instructions. /// This list is derived on demand in calculateDependencies(). SmallVector MemoryDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. int SchedulingRegionID = 0; /// Used for getting a "good" final ordering of instructions. int SchedulingPriority = 0; /// The number of dependencies. Constitutes of the number of users of the /// instruction plus the number of dependent memory instructions (if any). /// This value is calculated on demand. /// If InvalidDeps, the number of dependencies is not calculated yet. 
int Dependencies = InvalidDeps; /// The number of dependencies minus the number of dependencies of scheduled /// instructions. As soon as this is zero, the instruction/bundle gets ready /// for scheduling. /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for /// single instructions. int UnscheduledDepsInBundle = InvalidDeps; /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled = false; /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; }; #ifndef NDEBUG friend inline raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { SD.dump(os); return os; } #endif friend struct GraphTraits; friend struct DOTGraphTraits; /// Contains all scheduling data for a basic block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} void clear() { ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; // Reduce the maximum schedule region size by the size of the // previous scheduling run. ScheduleRegionSizeLimit -= ScheduleRegionSize; if (ScheduleRegionSizeLimit < MinScheduleRegionSize) ScheduleRegionSizeLimit = MinScheduleRegionSize; ScheduleRegionSize = 0; // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; } ScheduleData *getScheduleData(Value *V) { ScheduleData *SD = ScheduleDataMap[V]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; return nullptr; } ScheduleData *getScheduleData(Value *V, Value *Key) { if (V == Key) return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { ScheduleData *SD = I->second[Key]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; } return nullptr; } bool isInSchedulingRegion(ScheduleData *SD) { return SD->SchedulingRegionID == SchedulingRegionID; } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template void schedule(ScheduleData *SD, ReadyListType &ReadyList) { SD->IsScheduled = true; LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; while (BundleMember) { if (BundleMember->Inst != BundleMember->OpValue) { BundleMember = BundleMember->NextInBundle; continue; } // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction // into the ready list. ScheduleData *DepBundle = OpDef->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); LLVM_DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. 
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); LLVM_DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } BundleMember = BundleMember->NextInBundle; } } void doForAllOpcodes(Value *V, function_ref Action) { if (ScheduleData *SD = getScheduleData(V)) Action(SD); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) for (auto &P : I->second) if (P.second->SchedulingRegionID == SchedulingRegionID) Action(P.second); } /// Put all instructions into the ReadyList which are ready for scheduling. template void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { if (SD->isSchedulingEntity() && SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); } }); } } /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL, Value *OpValue); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. bool extendSchedulingRegion(Value *V, const InstructionsState &S); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. void initScheduleData(Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, ScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, BoUpSLP *SLP); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); BasicBlock *BB; /// Simple memory allocation for ScheduleData. std::vector> ScheduleDataChunks; /// The size of a ScheduleData array in ScheduleDataChunks. int ChunkSize; /// The allocator position in the current chunk, which is the last entry /// of ScheduleDataChunks. int ChunkPos; /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. /// ScheduleData structures are recycled. DenseMap ScheduleDataMap; /// Attaches ScheduleData to Instruction with the leading key. DenseMap> ExtraScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } }; /// The ready-list for scheduling (only used for the dry-run). ReadyList ReadyInsts; /// The first instruction of the scheduling region. Instruction *ScheduleStart = nullptr; /// The first instruction _after_ the scheduling region. Instruction *ScheduleEnd = nullptr; /// The first memory accessing instruction in the scheduling region /// (can be null). ScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). ScheduleData *LastLoadStoreInRegion = nullptr; /// The current size of the scheduling region. int ScheduleRegionSize = 0; /// The maximum size allowed for the scheduling region. int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. 
For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). int SchedulingRegionID = 1; }; /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. ArrayRef UserIgnoreList; using OrdersType = SmallVector; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. struct OrdersTypeDenseMapInfo { static OrdersType getEmptyKey() { OrdersType V; V.push_back(~1U); return V; } static OrdersType getTombstoneKey() { OrdersType V; V.push_back(~2U); return V; } static unsigned getHashValue(const OrdersType &V) { return static_cast(hash_combine_range(V.begin(), V.end())); } static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { return LHS == RHS; } }; /// Contains orders of operations along with the number of bundles that have /// operations in this order. It stores only those orders that require /// reordering, if reordering is not required it is counted using \a /// NumOpsWantToKeepOriginalOrder. DenseMap NumOpsWantToKeepOrder; /// Number of bundles that do not require reordering. unsigned NumOpsWantToKeepOriginalOrder = 0; // Analysis and block reference. Function *F; ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; OptimizationRemarkEmitter *ORE; unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). /// Instruction builder to construct the vectorized tree. IRBuilder<> Builder; /// A map of scalar integer values to the smallest bit width with which they /// can legally be represented. The values map to (width, signed) pairs, /// where "width" indicates the minimum bit width and "signed" is True if the /// value must be signed-extended, rather than zero-extended, back to its /// original width. MapVector> MinBWs; }; } // end namespace slpvectorizer template <> struct GraphTraits { using TreeEntry = BoUpSLP::TreeEntry; /// NodeRef has to be a pointer per the GraphWriter. using NodeRef = TreeEntry *; /// Add the VectorizableTree to the index iterator to be able to return /// TreeEntry pointers. struct ChildIteratorType : public iterator_adaptor_base::iterator> { std::vector &VectorizableTree; ChildIteratorType(SmallVector::iterator W, std::vector &VT) : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} NodeRef operator*() { return &VectorizableTree[*I]; } }; static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; } static ChildIteratorType child_begin(NodeRef N) { return {N->UserTreeIndices.begin(), N->Container}; } static ChildIteratorType child_end(NodeRef N) { return {N->UserTreeIndices.end(), N->Container}; } /// For the node iterator we just need to turn the TreeEntry iterator into a /// TreeEntry* iterator so that it dereferences to NodeRef. 
using nodes_iterator = pointer_iterator::iterator>; static nodes_iterator nodes_begin(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.begin()); } static nodes_iterator nodes_end(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.end()); } static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } }; template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { using TreeEntry = BoUpSLP::TreeEntry; DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; raw_string_ostream OS(Str); if (isSplat(Entry->Scalars)) { OS << " " << *Entry->Scalars[0]; return Str; } for (auto V : Entry->Scalars) { OS << *V; if (std::any_of( R->ExternalUses.begin(), R->ExternalUses.end(), [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) OS << " "; OS << "\n"; } return Str; } static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { if (Entry->NeedToGather) return "color=red"; return ""; } }; } // end namespace llvm void BoUpSLP::buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); } void BoUpSLP::buildTree(ArrayRef Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst) { deleteTree(); UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; // No need to handle users of gathered values. if (Entry->NeedToGather) continue; // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), llvm::find(Entry->ReuseShuffleIndices, FoundLane)); } // Check if the scalar is externally used as an extra arg. auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << Lane << " from " << *Scalar << ".\n"); ExternalUses.emplace_back(Scalar, nullptr, FoundLane); } for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast(U); if (!UserInst) continue; // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); continue; } } // Ignore users in the user ignore list. 
if (is_contained(UserIgnoreList, UserInst)) continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << Lane << " from " << *Scalar << ".\n"); ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); } } } } void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false, UserTreeIdx); return; } // We now know that this is a vector of instructions of the same type from // the same block. // Don't vectorize ephemeral values. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } } // Check if this is a duplicate of another entry. if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); return; } // Check that none of the instructions in the bundle are already in the tree. for (unsigned i = 0, e = VL.size(); i != e; ++i) { auto *I = dyn_cast(VL[i]); if (!I) continue; if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } } // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. auto *VL0 = cast(S.OpValue); BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } // Check that every instruction appears once in this bundle. 
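// For example, VL = {%a, %b, %a, %b} yields UniqueValues = {%a, %b} and
// ReuseShuffleIndicies = {0, 1, 0, 1}: the bundle is built from the unique
// scalars and a shuffle restores the original lane order. If the number of
// unique scalars is 1 or not a power of two, the bundle is gathered instead.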
SmallVector ReuseShuffleIndicies; SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } if (UniqueValues.size() == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, false, UserTreeIdx); return; } VL = UniqueValues; } auto &BSRef = BlocksSchedules[BB]; if (!BSRef) BSRef = llvm::make_unique(BB); BlockScheduling &BS = *BSRef.get(); if (!BS.tryScheduleBundle(VL, this, S)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { TerminatorInst *Term = dyn_cast( cast(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); if (Term) { LLVM_DEBUG( dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { OrdersType CurrentOrder; bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } if (!CurrentOrder.empty()) { LLVM_DEBUG({ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " "with order"; for (unsigned Idx : CurrentOrder) dbgs() << " " << Idx; dbgs() << "\n"; }); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } case Instruction::Load: { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM // treats loading/storing it as an i8 struct. 
If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. Type *ScalarTy = VL0->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. SmallVector PointerOps(VL.size()); auto POIter = PointerOps.begin(); for (Value *V : VL) { auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } *POIter = L->getPointerOperand(); ++POIter; } OrdersType CurrentOrder; // Check the order of pointer operands. if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { Value *Ptr0; Value *PtrN; if (CurrentOrder.empty()) { Ptr0 = PointerOps.front(); PtrN = PointerOps.back(); } else { Ptr0 = PointerOps[CurrentOrder.front()]; PtrN = PointerOps[CurrentOrder.back()]; } const SCEV *Scev0 = SE->getSCEV(Ptr0); const SCEV *ScevN = SE->getSCEV(PtrN); const auto *Diff = dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; } } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ICmp: case Instruction::FCmp: { // Check that all of the compares have the same predicate. 
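// For example, {icmp slt i32, icmp slt i32, icmp slt i32, icmp slt i32} is
// accepted, while mixing icmp slt with icmp ult, or comparing operands of
// different types, falls back to gathering.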
CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = VL0->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::Select: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); for (unsigned j = 0; j < VL.size(); ++j) { Type *CurTy = cast(VL[j])->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. for (unsigned j = 0; j < VL.size(); ++j) { auto Op = cast(VL[j])->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. 
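// For example, {store a[0], store a[1], store a[2], store a[3]} passes this
// check; any gap or out-of-order pair in the bundle causes it to be gathered.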
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { // Check if the calls are all to the same vectorizable intrinsic. CallInst *CI = cast(VL0); // Check if this is an Intrinsic call or something that can be // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } Function *Int = CI->getCalledFunction(); Value *A1I = nullptr; if (hasVectorInstrinsicScalarOpd(ID, 1)) A1I = CI->getArgOperand(1); for (unsigned i = 1, e = VL.size(); i != e; ++i) { CallInst *CI2 = dyn_cast(VL[i]); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument // should be same in order for them to be vectorized. if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; } } // Verify that the bundle operands are identical between the two calls. if (CI->hasOperandBundles() && !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) { CallInst *CI2 = dyn_cast(j); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
if (isa(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(S, VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; default: BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N; Type *EltTy; auto *ST = dyn_cast(T); if (ST) { N = ST->getNumElements(); EltTy = *ST->element_begin(); } else { N = cast(T)->getNumElements(); EltTy = cast(T)->getElementType(); } if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; if (ST) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) if (Ty != EltTy) return 0; } return N; } bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder) const { Instruction *E0 = cast(OpValue); assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); CurrentOrder.clear(); // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; if (E0->getOpcode() == Instruction::ExtractValue) { const DataLayout &DL = E0->getModule()->getDataLayout(); NElts = canMapToVector(Vec->getType(), DL); if (!NElts) return false; // Check if load can be rewritten as load of vector. LoadInst *LI = dyn_cast(Vec); if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) return false; } else { NElts = Vec->getType()->getVectorNumElements(); } if (NElts != VL.size()) return false; // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; unsigned E = VL.size(); // Assign to all items the initial value E + 1 so we can check if the extract // instruction index was used already. // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. 
CurrentOrder.assign(E, E + 1); unsigned I = 0; for (; I < E; ++I) { auto *Inst = cast(VL[I]); if (Inst->getOperand(0) != Vec) break; Optional Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { if (CurrentOrder[I] != E + 1) break; CurrentOrder[I] = I; } } if (I < E) { CurrentOrder.clear(); return false; } return ShouldKeepOrder; } bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0; }); } int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); else if (CmpInst *CI = dyn_cast(VL[0])) ScalarTy = CI->getOperand(0)->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. if (MinBWs.count(VL[0])) VecTy = VectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); int ReuseShuffleCost = 0; if (NeedToShuffleReuses) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } if (E->NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. if (areAllUsersVectorized(cast(V)) && !ScalarToTreeEntry.count(V)) { auto *IO = cast( cast(V)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } } return ReuseShuffleCost + Cost; } } return ReuseShuffleCost + getGatherCost(VL); } InstructionsState S = getSameOpcode(VL); assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast(S.OpValue); unsigned ShuffleOrOp = S.isAltShuffle() ? 
          (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI:
    return 0;

  case Instruction::ExtractValue:
  case Instruction::ExtractElement:
    if (NeedToShuffleReuses) {
      unsigned Idx = 0;
      for (unsigned I : E->ReuseShuffleIndices) {
        if (ShuffleOrOp == Instruction::ExtractElement) {
          auto *IO = cast<ConstantInt>(
              cast<ExtractElementInst>(VL[I])->getIndexOperand());
          Idx = IO->getZExtValue();
          ReuseShuffleCost -= TTI->getVectorInstrCost(
              Instruction::ExtractElement, VecTy, Idx);
        } else {
          ReuseShuffleCost -= TTI->getVectorInstrCost(
              Instruction::ExtractElement, VecTy, Idx);
          ++Idx;
        }
      }
      Idx = ReuseShuffleNumbers;
      for (Value *V : VL) {
        if (ShuffleOrOp == Instruction::ExtractElement) {
          auto *IO = cast<ConstantInt>(
              cast<ExtractElementInst>(V)->getIndexOperand());
          Idx = IO->getZExtValue();
        } else {
          --Idx;
        }
        ReuseShuffleCost +=
            TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
      }
    }
    if (!E->NeedToGather) {
      int DeadCost = ReuseShuffleCost;
      if (!E->ReorderIndices.empty()) {
        // TODO: Merge this shuffle with the ReuseShuffleCost.
        DeadCost += TTI->getShuffleCost(
            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
      }
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
        Instruction *E = cast<Instruction>(VL[i]);
        // If all users are going to be vectorized, instruction can be
        // considered as dead.
        // Likewise, if it has only one user, it will be vectorized for sure.
        if (areAllUsersVectorized(E)) {
          // Take credit for instruction that will become dead.
          if (E->hasOneUse()) {
            Instruction *Ext = E->user_back();
            if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
                all_of(Ext->users(),
                       [](User *U) { return isa<GetElementPtrInst>(U); })) {
              // Use getExtractWithExtendCost() to calculate the cost of
              // extractelement/ext pair.
              DeadCost -= TTI->getExtractWithExtendCost(
                  Ext->getOpcode(), Ext->getType(), VecTy, i);
              // Add back the cost of s|zext which is subtracted separately.
              DeadCost += TTI->getCastInstrCost(
                  Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
              continue;
            }
          }
          DeadCost -=
              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
        }
      }
      return DeadCost;
    }
    return ReuseShuffleCost + getGatherCost(VL);

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    int ScalarEltCost =
        TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
    if (NeedToShuffleReuses) {
      ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
    }

    // Calculate the cost of this instruction.
    int ScalarCost = VL.size() * ScalarEltCost;

    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
    int VecCost = 0;
    // Check if the values are candidates to demote.
    if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
      VecCost = ReuseShuffleCost +
                TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
    }
    return VecCost - ScalarCost;
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { // Certain instructions can be cheaper to vectorize if they have a // constant second vector operand. TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = TargetTransformInfo::OP_PowerOf2; // If all operands are exactly the same ConstantInt then set the // operand kind to OK_UniformConstantValue. // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. ConstantInt *CInt0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { const Instruction *I = cast(VL[i]); ConstantInt *CInt = dyn_cast(I->getOperand(1)); if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; break; } if (Op2VP == TargetTransformInfo::OP_PowerOf2 && !CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_None; if (i == 0) { CInt0 = CInt; continue; } if (CInt0 != CInt) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. 
unsigned alignment = cast(VL0)->getAlignment(); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } return ReuseShuffleCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. unsigned alignment = cast(VL0)->getAlignment(); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. SmallVector ScalarTys; for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) ScalarTys.push_back(CI->getArgOperand(op)->getType()); FastMathFlags FMF; if (auto *FPMO = dyn_cast(CI)) FMF = FPMO->getFastMathFlags(); int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; SmallVector Args(CI->arg_operands()); int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, VecTy->getNumElements()); LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" << " for " << *CI << "\n"); return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { assert(S.isAltShuffle() && ((Instruction::isBinaryOp(S.getOpcode()) && Instruction::isBinaryOp(S.getAltOpcode())) || (Instruction::isCast(S.getOpcode()) && Instruction::isCast(S.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } for (Value *V : VL) { Instruction *I = cast(V); ReuseShuffleCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } } for (Value *i : VL) { Instruction *I = cast(i); assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; if (Instruction::isBinaryOp(S.getOpcode())) { VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); } else { Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); } } bool BoUpSLP::isFullyVectorizableTinyTree() { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. if (!VectorizableTree[0].NeedToGather && (allConstant(VectorizableTree[1].Scalars) || isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) return false; return true; } bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) return false; // If we have a tiny tree (a tree whose size is less than MinTreeSize), we // can vectorize it if we can prove it fully vectorizable. if (isFullyVectorizableTinyTree()) return false; assert(VectorizableTree.empty() ? ExternalUses.empty() : true && "We shouldn't have any external users"); // Otherwise, we can't vectorize the tree. It is both tiny and not fully // vectorizable. return true; } int BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). unsigned BundleWidth = VectorizableTree.front().Scalars.size(); int Cost = 0; SmallPtrSet LiveValues; Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { Instruction *Inst = dyn_cast(N.Scalars[0]); if (!Inst) continue; if (!PrevInst) { PrevInst = Inst; continue; } // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { if (isa(&*J) && getTreeEntry(&*J)) LiveValues.insert(cast(&*J)); } LLVM_DEBUG({ dbgs() << "SLP: #LV: " << LiveValues.size(); for (auto *X : LiveValues) dbgs() << " " << X->getName(); dbgs() << ", Looking at "; Inst->dump(); }); // Now find the sequence of instructions between PrevInst and Inst. BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), PrevInstIt = PrevInst->getIterator().getReverse(); while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { PrevInstIt = Inst->getParent()->rbegin(); continue; } // Debug informations don't impact spill cost. 
      if ((isa<CallInst>(&*PrevInstIt) &&
           !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
          &*PrevInstIt != PrevInst) {
        SmallVector<Type*, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

      ++PrevInstIt;
    }

    PrevInst = Inst;
  }

  return Cost;
}

int BoUpSLP::getTreeCost() {
  int Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = VectorizableTree[I];

    // We create duplicate tree entries for gather sequences that have multiple
    // uses. However, we should not compute the cost of duplicate sequences.
    // For example, if we have a build vector (i.e., insertelement sequence)
    // that is used by more than one vector instruction, we only need to
    // compute the cost of the insertelement instructions once. The redundant
    // instructions will be eliminated by CSE.
    //
    // We should consider not creating duplicate tree entries for gather
    // sequences, and instead add additional edges to the tree representing
    // their uses. Since such an approach results in fewer total entries,
    // existing heuristics based on tree size may yield different results.
    //
    if (TE.NeedToGather &&
        std::any_of(std::next(VectorizableTree.begin(), I + 1),
                    VectorizableTree.end(), [TE](TreeEntry &Entry) {
                      return Entry.NeedToGather && Entry.isSame(TE.Scalars);
                    }))
      continue;

    int C = getEntryCost(&TE);
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                      << " for bundle that starts with " << *TE.Scalars[0]
                      << ".\n");
    Cost += C;
  }

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
    if (MinBWs.count(ScalarRoot)) {
      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
      auto Extend = MinBWs[ScalarRoot].second ?
Instruction::SExt : Instruction::ZExt; VecTy = VectorType::get(MinTy, BundleWidth); ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy, EU.Lane); } else { ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); } } int SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; std::string Str; { raw_string_ostream OS(Str); OS << "SLP: Spill Cost = " << SpillCost << ".\n" << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"; } LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); return Cost; } int BoUpSLP::getGatherCost(Type *Ty, const DenseSet &ShuffledIndices) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) if (!ShuffledIndices.count(i)) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } int BoUpSLP::getGatherCost(ArrayRef VL) { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; if (!UniqueElements.insert(VL[Idx]).second) ShuffledElements.insert(Idx); } return getGatherCost(VecTy, ShuffledElements); } // Reorder commutative operations in alternate shuffle if the resulting vectors // are consecutive loads. This would allow us to vectorize the tree. // If we have something like- // load a[0] - load b[0] // load b[1] + load a[1] // load a[2] - load b[2] // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right for (Value *V : VL) { auto *I = cast(V); assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); Left.push_back(I->getOperand(0)); Right.push_back(I->getOperand(1)); } // Reorder if we have a commutative operation and consecutive access // are on either side of the alternate instructions. 
for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; } else if (VL2->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } // else unchanged } } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; } else if (VL2->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } // else unchanged } } } } // Return true if I should be commuted before adding it's left and right // operands to the arrays Left and Right. // // The vectorizer is trying to either have all elements one side being // instruction with the same opcode to enable further vectorization, or having // a splat to lower the vectorizing cost. static bool shouldReorderOperands( int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { VLeft = I.getOperand(0); VRight = I.getOperand(1); // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) // Preserve SplatRight return false; if (VLeft == Right[i - 1]) { // Commuting would preserve SplatRight, but we don't want to break // SplatLeft either, i.e. preserve the original order if possible. // (FIXME: why do we care?) if (SplatLeft && VLeft == Left[i - 1]) return false; return true; } } // Symmetrically handle Right side. if (SplatLeft) { if (VLeft == Left[i - 1]) // Preserve SplatLeft return false; if (VRight == Left[i - 1]) return true; } Instruction *ILeft = dyn_cast(VLeft); Instruction *IRight = dyn_cast(VRight); // If we have "AllSameOpcodeRight", try to see if the left operands preserves // it and not the right, in this case we want to commute. if (AllSameOpcodeRight) { unsigned RightPrevOpcode = cast(Right[i - 1])->getOpcode(); if (IRight && RightPrevOpcode == IRight->getOpcode()) // Do not commute, a match on the right preserves AllSameOpcodeRight return false; if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { // We have a match and may want to commute, but first check if there is // not also a match on the existing operands on the Left to preserve // AllSameOpcodeLeft, i.e. preserve the original order if possible. // (FIXME: why do we care?) if (AllSameOpcodeLeft && ILeft && cast(Left[i - 1])->getOpcode() == ILeft->getOpcode()) return false; return true; } } // Symmetrically handle Left side. if (AllSameOpcodeLeft) { unsigned LeftPrevOpcode = cast(Left[i - 1])->getOpcode(); if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) return false; if (IRight && LeftPrevOpcode == IRight->getOpcode()) return true; } return false; } void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { if (!VL.empty()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. 
auto *I = cast(VL[0]); Value *VLeft = I->getOperand(0); Value *VRight = I->getOperand(1); if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); Left.push_back(VLeft); Right.push_back(VRight); } // Keep track if we have instructions with all the same opcode on one side. bool AllSameOpcodeLeft = isa(Left[0]); bool AllSameOpcodeRight = isa(Right[0]); // Keep track if we have one side with all the same value (broadcast). bool SplatLeft = true; bool SplatRight = true; for (unsigned i = 1, e = VL.size(); i != e; ++i) { Instruction *I = cast(VL[i]); assert(((I->getOpcode() == Opcode && I->isCommutative()) || (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && "Can only process commutative instruction"); // Commute to favor either a splat or maximizing having the same opcodes on // one side. Value *VLeft; Value *VRight; if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, VRight)) { Left.push_back(VRight); Right.push_back(VLeft); } else { Left.push_back(VLeft); Right.push_back(VRight); } // Update Splat* and AllSameOpcode* after the insertion. SplatRight = SplatRight && (Right[i - 1] == Right[i]); SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); AllSameOpcodeLeft = AllSameOpcodeLeft && isa(Left[i]) && (cast(Left[i - 1])->getOpcode() == cast(Left[i])->getOpcode()); AllSameOpcodeRight = AllSameOpcodeRight && isa(Right[i]) && (cast(Right[i - 1])->getOpcode() == cast(Right[i])->getOpcode()); } // If one operand end up being broadcast, return this operand order. if (SplatRight || SplatLeft) return; // Finally check if we can get longer vectorizable chain by reordering // without breaking the good operand order detected above. // E.g. If we have something like- // load a[0] load b[0] // load b[1] load a[1] // load a[2] load b[2] // load a[3] load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize // this code and we still retain AllSameOpcode property. // FIXME: This load reordering might break AllSameOpcode in some rare cases // such as- // add a[0],c[0] load b[0] // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } } } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } } } // else unchanged } } void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, const InstructionsState &S) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. auto *Front = cast(S.OpValue); auto *BB = Front->getParent(); assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { auto *I = cast(V); return !S.isOpcodeOrAlt(I) || I->getParent() == BB; })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; // Find the last instruction. The common case should be that BB has been // scheduled, and the last instruction is VL.back(). So we start with // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. 
if (BlocksSchedules.count(BB)) { auto *Bundle = BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) LastInst = Bundle->Inst; } // LastInst can still be null at this point if there's either not an entry // for BB in BlocksSchedules or there's no ScheduleData available for // VL.back(). This can be the case if buildTree_rec aborts for various // reasons (e.g., the maximum recursion depth is reached, the maximum region // size is reached, etc.). ScheduleData is initialized in the scheduling // "dry-run". // // If this happens, we can still find the last instruction by brute force. We // iterate forwards from Front (inclusive) until we either see all // instructions in the bundle or reach the end of the block. If Front is the // last instruction in program order, LastInst will be set to Front, and we // will visit all the remaining instructions in the block. // // One of the reasons we exit early from buildTree_rec is to place an upper // bound on compile-time. Thus, taking an additional compile-time hit here is // not ideal. However, this should be exceedingly rare since it requires that // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { SmallPtrSet Bundle(VL.begin(), VL.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. Builder.SetInsertPoint(BB, ++LastInst->getIterator()); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *E = getTreeEntry(VL[i])) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? if (E->Scalars[Lane] == VL[i]) { FoundLane = Lane; break; } } assert(FoundLane >= 0 && "Could not find the correct lane"); if (!E->ReuseShuffleIndices.empty()) { FoundLane = std::distance(E->ReuseShuffleIndices.begin(), llvm::find(E->ReuseShuffleIndices, FoundLane)); } ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); } } } return Vec; } Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { // We need to get the vectorized value but without shuffle. if (auto *SV = dyn_cast(V)) { V = SV->getOperand(0); } else { // Reshuffle to get only unique values. 
SmallVector UniqueIdxs; SmallSet UsedIdxs; for(unsigned Idx : E->ReuseShuffleIndices) if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), UniqueIdxs); } } return V; } } } Type *ScalarTy = S.OpValue->getType(); if (StoreInst *SI = dyn_cast(S.OpValue)) ScalarTy = SI->getValueOperand()->getType(); // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; SmallVector UniqueValues; if (VL.size() > 2) { DenseMap UniquePositions; for (Value *V : VL) { auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second || isa(V)) UniqueValues.emplace_back(V); } // Do not shuffle single element or if number of unique values is not power // of 2. if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) ReuseShuffleIndicies.clear(); else VL = UniqueValues; } VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); Value *V = Gather(VL, VecTy); if (!ReuseShuffleIndicies.empty()) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), ReuseShuffleIndicies, "shuffle"); if (auto *I = dyn_cast(V)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } return V; } static void inversePermutation(ArrayRef Indices, SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); Mask.resize(E); for (unsigned I = 0; I < E; ++I) Mask[Indices[I]] = I; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } InstructionsState S = getSameOpcode(E->Scalars); Instruction *VL0 = cast(S.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { setInsertPointAfterBundle(E->Scalars, S); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); if (auto *I = dyn_cast(V)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } E->VectorizedValue = V; return V; } unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; // PHINodes may have multiple entries from the same block. We want to // visit every block once. SmallPtrSet VisitedBBs; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; BasicBlock *IBB = PH->getIncomingBlock(i); if (!VisitedBBs.insert(IBB).second) { NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); continue; } // Prepare the operand vector. 
        for (Value *V : E->Scalars)
          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));

        Builder.SetInsertPoint(IBB->getTerminator());
        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
        Value *Vec = vectorizeTree(Operands);
        NewPhi->addIncoming(Vec, IBB);
      }

      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
             "Invalid number of incoming values");
      return V;
    }

    case Instruction::ExtractElement: {
      if (!E->NeedToGather) {
        Value *V = VL0->getOperand(0);
        if (!E->ReorderIndices.empty()) {
          OrdersType Mask;
          inversePermutation(E->ReorderIndices, Mask);
          Builder.SetInsertPoint(VL0);
          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
                                          "reorder_shuffle");
        }
        if (NeedToShuffleReuses) {
          // TODO: Merge this shuffle with the ReorderShuffleMask.
-         if (!E->ReorderIndices.empty())
+         if (E->ReorderIndices.empty())
            Builder.SetInsertPoint(VL0);
-         else if (auto *I = dyn_cast<Instruction>(V))
-           Builder.SetInsertPoint(I->getParent(),
-                                  std::next(I->getIterator()));
-         else
-           Builder.SetInsertPoint(&F->getEntryBlock(),
-                                  F->getEntryBlock().getFirstInsertionPt());
          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                          E->ReuseShuffleIndices, "shuffle");
        }
        E->VectorizedValue = V;
        return V;
      }
      setInsertPointAfterBundle(E->Scalars, S);
      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ExtractValue: {
      if (!E->NeedToGather) {
        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
        Builder.SetInsertPoint(LI);
        PointerType *PtrTy =
            PointerType::get(VecTy, LI->getPointerAddressSpace());
        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
        Value *NewV = propagateMetadata(V, E->Scalars);
        if (!E->ReorderIndices.empty()) {
          OrdersType Mask;
          inversePermutation(E->ReorderIndices, Mask);
          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
                                             "reorder_shuffle");
        }
        if (NeedToShuffleReuses) {
          // TODO: Merge this shuffle with the ReorderShuffleMask.
NewV = Builder.CreateShuffleVector( NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = NewV; return NewV; } setInsertPointAfterBundle(E->Scalars, S); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); if (auto *I = dyn_cast(V)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); } } E->VectorizedValue = V; return V; } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); Value *InVec = vectorizeTree(INVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } CastInst *CI = dyn_cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; for (Value *V : E->Scalars) { LHSV.push_back(cast(V)->getOperand(0)); RHSV.push_back(cast(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars, S); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; if (S.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; for (Value *V : E->Scalars) { CondVec.push_back(cast(V)->getOperand(0)); TrueVec.push_back(cast(V)->getOperand(1)); FalseVec.push_back(cast(V)->getOperand(2)); } setInsertPointAfterBundle(E->Scalars, S); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } Value *V = Builder.CreateSelect(Cond, True, False); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); LHSVL.push_back(I->getOperand(0)); 
RHSVL.push_back(I->getOperand(1)); } setInsertPointAfterBundle(E->Scalars, S); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } Value *V = Builder.CreateBinOp( static_cast(S.getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); VL0 = cast(S.OpValue); } setInsertPointAfterBundle(E->Scalars, S); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo(AS)); // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. Value *PO = LI->getPointerOperand(); if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); if (!Alignment) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Store: { StoreInst *SI = cast(VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); ValueList ScalarStoreValues; for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast(V)->getValueOperand()); setInsertPointAfterBundle(E->Scalars, S); Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the // future. 
if (getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); ST->setAlignment(Alignment); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars, S); ValueList Op0VL; for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); Value *Op0 = vectorizeTree(Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); Value *OpVec = vectorizeTree(OpVL); OpVecs.push_back(OpVec); } Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::Call: { CallInst *CI = cast(VL0); setInsertPointAfterBundle(E->Scalars, S); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; if (CI && (FI = CI->getCalledFunction())) { IID = FI->getIntrinsicID(); } std::vector OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; // ctlz,cttz and powi are special intrinsics whose second argument is // a scalar. This argument should not be vectorized. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { CallInst *CEI = cast(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; } for (Value *V : E->Scalars) { CallInst *CEI = cast(V); OpVL.push_back(CEI->getArgOperand(j)); } Value *OpVec = vectorizeTree(OpVL); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } Module *M = F->getParent(); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; Function *CF = Intrinsic::getDeclaration(M, ID, Tys); SmallVector OpBundles; CI->getOperandBundlesAsDefs(OpBundles); Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. 
if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; assert(S.isAltShuffle() && ((Instruction::isBinaryOp(S.getOpcode()) && Instruction::isBinaryOp(S.getAltOpcode())) || (Instruction::isCast(S.getOpcode()) && Instruction::isCast(S.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; if (Instruction::isBinaryOp(S.getOpcode())) { reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, S); LHS = vectorizeTree(LHSVL); RHS = vectorizeTree(RHSVL); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); LHS = vectorizeTree(INVL); } if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } Value *V0, *V1; if (Instruction::isBinaryOp(S.getOpcode())) { V0 = Builder.CreateBinOp( static_cast(S.getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( static_cast(S.getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( static_cast(S.getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( static_cast(S.getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. ValueList OpScalars, AltScalars; unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == S.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { Mask[i] = Builder.getInt32(i); OpScalars.push_back(E->Scalars[i]); } } Value *ShuffleMask = ConstantVector::get(Mask); propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; return V; } default: llvm_unreachable("unknown inst"); } return nullptr; } Value *BoUpSLP::vectorizeTree() { ExtraValueToDebugLocsMap ExternallyUsedValues; return vectorizeTree(ExternallyUsedValues); } Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. 
auto *ScalarRoot = VectorizableTree[0].Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); auto BundleWidth = VectorizableTree[0].Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); VectorizableTree[0].VectorizedValue = Trunc; } LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); // If necessary, sign-extend or zero-extend ScalarRoot to the larger type // specified by ScalarType. auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) { if (!MinBWs.count(ScalarRoot)) return Ex; if (MinBWs[ScalarRoot].second) return Builder.CreateSExt(Ex, ScalarType); return Builder.CreateZExt(Ex, ScalarType); }; // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. if (User && !is_contained(Scalar->users(), User)) continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); assert(!E->NeedToGather && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); // If User == nullptr, the Scalar is used as extra arg. Generate // ExtractElement instruction and update the record for this scalar in // ExternallyUsedValues. if (!User) { assert(ExternallyUsedValues.count(Scalar) && "Scalar with nullptr as an external user must be registered in " "ExternallyUsedValues map"); if (auto *VecI = dyn_cast(Vec)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); } Value *Ex = Builder.CreateExtractElement(Vec, Lane); Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(cast(Scalar)->getParent()); auto &Locs = ExternallyUsedValues[Scalar]; ExternallyUsedValues.insert({Ex, Locs}); ExternallyUsedValues.erase(Scalar); continue; } // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. 
if (auto *VecI = dyn_cast(Vec)) { if (PHINode *PH = dyn_cast(User)) { for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { if (PH->getIncomingValue(i) == Scalar) { TerminatorInst *IncomingTerminator = PH->getIncomingBlock(i)->getTerminator(); if (isa(IncomingTerminator)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); } else { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); } Value *Ex = Builder.CreateExtractElement(Vec, Lane); Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(PH->getIncomingBlock(i)); PH->setOperand(i, Ex); } } } else { Builder.SetInsertPoint(cast(User)); Value *Ex = Builder.CreateExtractElement(Vec, Lane); Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(cast(User)->getParent()); User->replaceUsesOfWith(Scalar, Ex); } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); } LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } // For each vectorized value: for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; // No need to handle users of gathered values. if (Entry->NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to replace users in the ignorelist by undef. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast(Scalar)); } } Builder.ClearInsertionPoint(); return VectorizableTree[0].VectorizedValue; } void BoUpSLP::optimizeGatherSequence() { LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { if (!isa(I) && !isa(I)) continue; // Check if this block is inside a loop. Loop *L = LI->getLoopFor(I->getParent()); if (!L) continue; // Check if it has a preheader. BasicBlock *PreHeader = L->getLoopPreheader(); if (!PreHeader) continue; // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. auto *Op0 = dyn_cast(I->getOperand(0)); auto *Op1 = dyn_cast(I->getOperand(1)); if (Op0 && L->contains(Op0)) continue; if (Op1 && L->contains(Op1)) continue; // We can hoist this instruction. Move it to the pre-header. I->moveBefore(PreHeader->getTerminator()); } // Make a list of all reachable blocks in our CSE queue. SmallVector CSEWorkList; CSEWorkList.reserve(CSEBlocks.size()); for (BasicBlock *BB : CSEBlocks) if (DomTreeNode *N = DT->getNode(BB)) { assert(DT->isReachableFromEntry(N)); CSEWorkList.push_back(N); } // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. 
std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), [this](const DomTreeNode *A, const DomTreeNode *B) { return DT->properlyDominates(A, B); }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector Visited; for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; if (!isa(In) && !isa(In)) continue; // Check if we can replace this instruction with any of the // visited instructions. for (Instruction *v : Visited) { if (In->isIdenticalTo(v) && DT->dominates(v->getParent(), In->getParent())) { In->replaceAllUsesWith(v); eraseInstruction(In); In = nullptr; break; } } if (In) { assert(!is_contained(Visited, In)); Visited.push_back(In); } } } CSEBlocks.clear(); GatherSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S) { if (isa(S.OpValue)) return true; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) return false; } for (Value *V : VL) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"); ReSchedule = true; } assert(BundleMember->isSchedulingEntity() && "bundle member already part of other bundle"); if (PrevInBundle) { PrevInBundle->NextInBundle = BundleMember; } else { Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } if (ScheduleEnd != OldScheduleEnd) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); } ReSchedule = true; } if (ReSchedule) { resetSchedule(); initialFillReadyList(ReadyInsts); } LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); calculateDependencies(Bundle, true, SLP); // Now try to schedule the new bundle. 
As soon as the bundle is "ready" it // means that there are no cyclic dependencies and we can schedule it. // Note that's important that we don't "schedule" the bundle yet (see // cancelScheduling). while (!Bundle->isReady() && !ReadyInsts.empty()) { ScheduleData *pickedSD = ReadyInsts.back(); ReadyInsts.pop_back(); if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { schedule(pickedSD, ReadyInsts); } } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); return false; } return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, Value *OpValue) { if (isa(OpValue)) return; ScheduleData *Bundle = getScheduleData(OpValue); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && "tried to unbundle something which is not a bundle"); // Un-bundle: make single instructions out of the bundle. ScheduleData *BundleMember = Bundle; while (BundleMember) { assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; if (BundleMember->UnscheduledDepsInBundle == 0) { ReadyInsts.insert(BundleMember); } BundleMember = Next; } } BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); } bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, const InstructionsState &S) { if (getScheduleData(V, isOneOf(S, V))) return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) return false; assert(isInSchedulingRegion(ISD) && "ScheduleData not in scheduling region"); ScheduleData *SD = allocateScheduleDataChunks(); SD->Inst = I; SD->init(SchedulingRegionID, S.OpValue); ExtraScheduleDataMap[I][S.OpValue] = SD; return true; }; if (CheckSheduleForI(I)) return true; if (!ScheduleStart) { // It's the first instruction in the new region. initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); if (isOneOf(S, I) != I) CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. 
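// A standalone sketch of the bidirectional scan described above. The block is
// modelled as a vector of instruction ids and the scheduling region as a
// half-open index range; both directions are probed in lock-step so the scan
// stops as soon as the new instruction is found on either side, and gives up
// once a region-size budget is exhausted. Toy code, not the LLVM iterators.
#if 0
#include <cstdio>
#include <vector>

struct Region { int Lo, Hi; bool Ok; };  // half-open [Lo, Hi)

Region extendRegion(const std::vector<int> &Block, int Lo, int Hi, int Target,
                    int SizeLimit) {
  int Up = Lo - 1;                       // next candidate above the region
  int Down = Hi;                         // next candidate below the region
  int Size = Hi - Lo;
  while (Up >= 0 || Down < (int)Block.size()) {
    if (++Size > SizeLimit)
      return {Lo, Hi, false};            // exceeded schedule region size limit
    if (Up >= 0) {
      if (Block[Up] == Target)
        return {Up, Hi, true};           // extend region start upwards
      --Up;
    }
    if (Down < (int)Block.size()) {
      if (Block[Down] == Target)
        return {Lo, Down + 1, true};     // extend region end downwards
      ++Down;
    }
  }
  return {Lo, Hi, false};                // instruction not found in block
}

int main() {
  std::vector<int> Block = {10, 11, 12, 13, 14, 15};
  Region R = extendRegion(Block, /*Lo=*/2, /*Hi=*/4, /*Target=*/15, /*SizeLimit=*/10);
  std::printf("[%d, %d) ok=%d\n", R.Lo, R.Hi, R.Ok);   // prints [2, 6) ok=1
}
#endif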
BasicBlock::reverse_iterator UpIter = ++ScheduleStart->getIterator().getReverse(); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); BasicBlock::iterator LowerEnd = BB->end(); while (true) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); return false; } if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; if (isOneOf(S, I) != I) CheckSheduleForI(I); LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; } UpIter++; } if (DownIter != LowerEnd) { if (&*DownIter == I) { initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); if (isOneOf(S, I) != I) CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { ScheduleData *SD = ScheduleDataMap[I]; if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); SD->init(SchedulingRegionID, I); if (I->mayReadOrWriteMemory() && (!isa(I) || cast(I)->getIntrinsicID() != Intrinsic::sideeffect)) { // Update the linked list of memory accessing instructions. if (CurrentLoadStore) { CurrentLoadStore->NextLoadStore = SD; } else { FirstLoadStoreInRegion = SD; } CurrentLoadStore = SD; } } if (NextLoadStore) { if (CurrentLoadStore) CurrentLoadStore->NextLoadStore = NextLoadStore; } else { LastLoadStoreInRegion = CurrentLoadStore; } } void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, bool InsertInReadyList, BoUpSLP *SLP) { assert(SD->isSchedulingEntity()); SmallVector WorkList; WorkList.push_back(SD); while (!WorkList.empty()) { ScheduleData *SD = WorkList.back(); WorkList.pop_back(); ScheduleData *BundleMember = SD; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); BundleMember->Dependencies = 0; BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. if (BundleMember->OpValue != BundleMember->Inst) { ScheduleData *UseSD = getScheduleData(BundleMember->Inst); if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; if (!DestBundle->IsScheduled) BundleMember->incrementUnscheduledDeps(1); if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); } } else { for (User *U : BundleMember->Inst->users()) { if (isa(U)) { ScheduleData *UseSD = getScheduleData(U); if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; if (!DestBundle->IsScheduled) BundleMember->incrementUnscheduledDeps(1); if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); } } else { // I'm not sure if this can ever happen. 
But we need to be safe. // This lets the instruction/bundle never be scheduled and // eventually disable vectorization. BundleMember->Dependencies++; BundleMember->incrementUnscheduledDeps(1); } } } // Handle the memory dependencies. ScheduleData *DepDest = BundleMember->NextLoadStore; if (DepDest) { Instruction *SrcInst = BundleMember->Inst; MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); unsigned numAliased = 0; unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to // SLP->isAliased (which is the expensive part in this loop). // 2) MaxMemDepDistance: It's for very large blocks and it aborts // the whole loop (even if the loop is fast, it's quadratic). // It's important for the loop break condition (see below) to // check this limit even between two read-only instructions. if (DistToSrc >= MaxMemDepDistance || ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && (numAliased >= AliasedCheckLimit || SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { // We increment the counter only if the locations are aliased // (instead of counting all alias checks). This gives a better // balance between reduced runtime and accurate dependencies. numAliased++; DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; ScheduleData *DestBundle = DepDest->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } if (!DestBundle->hasValidDependencies()) { WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; // Example, explaining the loop break condition: Let's assume our // starting instruction is i0 and MaxMemDepDistance = 3. // // +--------v--v--v // i0,i1,i2,i3,i4,i5,i6,i7,i8 // +--------^--^--^ // // MaxMemDepDistance let us stop alias-checking at i3 and we add // dependencies from i0 to i3,i4,.. (even if they are not aliased). // Previously we already added dependencies from i3 to i6,i7,i8 // (because of MaxMemDepDistance). As we added a dependency from // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 // and we can abort this loop at i6. if (DistToSrc >= 2 * MaxMemDepDistance) break; DistToSrc++; } } } BundleMember = BundleMember->NextInBundle; } if (InsertInReadyList && SD->isReady()) { ReadyInsts.push_back(SD); LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); } } } void BoUpSLP::BlockScheduling::resetSchedule() { assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { assert(isInSchedulingRegion(SD) && "ScheduleData not in scheduling region"); SD->IsScheduled = false; SD->resetUnscheduledDeps(); }); } ReadyInsts.clear(); } void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is // sorted by the original instruction location. This lets the final schedule // be as close as possible to the original instruction order. 
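// A standalone sketch of the same idea as the comparator defined next: keep
// the ready candidates in a set ordered by their original position, so that
// whenever several bundles are ready at once the pick stays close to source
// order. Candidate and the ascending direction are illustrative stand-ins,
// not the pass's ScheduleData / ScheduleDataCompare types.
#if 0
#include <cstdio>
#include <set>
#include <string>

struct Candidate {
  int OrigPos;          // position in the original instruction order
  std::string Name;
};

struct ByOrigPos {
  bool operator()(const Candidate &A, const Candidate &B) const {
    return A.OrigPos < B.OrigPos;   // strict weak ordering over positions
  }
};

int main() {
  std::set<Candidate, ByOrigPos> Ready = {{7, "storeB"}, {3, "addA"}, {5, "mulC"}};
  while (!Ready.empty()) {
    Candidate Picked = *Ready.begin();          // closest to original order
    Ready.erase(Ready.begin());
    std::printf("schedule %s (pos %d)\n", Picked.Name.c_str(), Picked.OrigPos);
  }
  // prints addA, mulC, storeB -- i.e. in original program order
}
#endif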
struct ScheduleDataCompare { bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { return SD2->SchedulingPriority < SD1->SchedulingPriority; } }; std::set ReadyInsts; // Ensure that all dependency data is updated and fill the ready-list with // initial instructions. int Idx = 0; int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { BS->calculateDependencies(SD, false, this); NumToSchedule++; } }); } BS->initialFillReadyList(ReadyInsts); Instruction *LastScheduledInst = BS->ScheduleEnd; // Do the "real" scheduling. while (!ReadyInsts.empty()) { ScheduleData *picked = *ReadyInsts.begin(); ReadyInsts.erase(ReadyInsts.begin()); // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; while (BundleMember) { Instruction *pickedInst = BundleMember->Inst; if (LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst); } LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; } BS->schedule(picked, ReadyInsts); NumToSchedule--; } assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; } unsigned BoUpSLP::getVectorElementSize(Value *V) { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast(V)) return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); // If V is not a store, we can traverse the expression tree to find loads // that feed it. The type of the loaded value may indicate a more suitable // width than V's type. We want to base the vector element size on the width // of memory operations where possible. SmallVector Worklist; SmallPtrSet Visited; if (auto *I = dyn_cast(V)) Worklist.push_back(I); // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruciton we don't yet handle, we give up. auto MaxWidth = 0u; auto FoundUnknownInst = false; while (!Worklist.empty() && !FoundUnknownInst) { auto *I = Worklist.pop_back_val(); Visited.insert(I); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, give up. auto *Ty = I->getType(); if (isa(Ty)) FoundUnknownInst = true; // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. else if (isa(I)) MaxWidth = std::max(MaxWidth, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an // instruction we haven't yet visited, we add it to the worklist. else if (isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) if (!Visited.count(J)) Worklist.push_back(J); } // If we don't yet handle the instruction, give up. else FoundUnknownInst = true; } // If we didn't encounter a memory access in the expression tree, or if we // gave up for some reason, just return the width of V. 
if (!MaxWidth || FoundUnknownInst) return DL->getTypeSizeInBits(V->getType()); // Otherwise, return the maximum width we found. return MaxWidth; } // Determine if a value V in a vectorizable expression Expr can be demoted to a // smaller type with a truncation. We collect the values that will be demoted // in ToDemote and additional roots that require investigating in Roots. static bool collectValuesToDemote(Value *V, SmallPtrSetImpl &Expr, SmallVectorImpl &ToDemote, SmallVectorImpl &Roots) { // We can always demote constants. if (isa(V)) { ToDemote.push_back(V); return true; } // If the value is not an instruction in the expression with only one use, it // cannot be demoted. auto *I = dyn_cast(V); if (!I || !I->hasOneUse() || !Expr.count(I)) return false; switch (I->getOpcode()) { // We can always demote truncations and extensions. Since truncations can // seed additional demotion, we save the truncated value. case Instruction::Trunc: Roots.push_back(I->getOperand(0)); break; case Instruction::ZExt: case Instruction::SExt: break; // We can demote certain binary operations if we can demote both of their // operands. case Instruction::Add: case Instruction::Sub: case Instruction::Mul: case Instruction::And: case Instruction::Or: case Instruction::Xor: if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) return false; break; // We can demote selects if we can demote their true and false values. case Instruction::Select: { SelectInst *SI = cast(I); if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) return false; break; } // We can demote phis if we can demote all their incoming operands. Note that // we don't need to worry about cycles since we ensure single use above. case Instruction::PHI: { PHINode *PN = cast(I); for (Value *IncValue : PN->incoming_values()) if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) return false; break; } // Otherwise, conservatively give up. default: return false; } // Record the value that we can demote. ToDemote.push_back(V); return true; } void BoUpSLP::computeMinimumValueSizes() { // If there are no external uses, the expression tree must be rooted by a // store. We can't demote in-memory values, so there is nothing to do here. if (ExternalUses.empty()) return; // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0].Scalars; auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); if (!TreeRootIT) return; // If the expression is not rooted by a store, these roots should have // external uses. We will rely on InstCombine to rewrite the expression in // the narrower type. However, InstCombine only rewrites single-use values. // This means that if a tree entry other than a root is used externally, it // must have multiple uses and InstCombine will not rewrite it. The code // below ensures that only the roots are used externally. SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); for (auto &EU : ExternalUses) if (!Expr.erase(EU.Scalar)) return; if (!Expr.empty()) return; // Collect the scalar values of the vectorizable expression. We will use this // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. for (auto &Entry : VectorizableTree) Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); // Ensure the roots of the vectorizable tree don't form a cycle. 
They must // have a single external user that is not in the vectorizable tree. for (auto *Root : TreeRoot) if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) return; // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and // additional roots that require investigating in Roots. SmallVector ToDemote; SmallVector Roots; for (auto *Root : TreeRoot) if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) return; // The maximum bit width required to represent all the values that can be // demoted without loss of precision. It would be safe to truncate the roots // of the expression to this width. auto MaxBitWidth = 8u; // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { auto Mask = DB->getDemandedBits(cast(Root)); MaxBitWidth = std::max( Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } // True if the roots can be zero-extended back to their original type, rather // than sign-extended. We know that if the leading bits are not demanded, we // can safely zero-extend. So we initialize IsKnownPositive to True. bool IsKnownPositive = true; // If all the bits of the roots are demanded, we can try a little harder to // compute a narrower type. This can happen, for example, if the roots are // getelementptr indices. InstCombine promotes these indices to the pointer // width. Thus, all their bits are technically demanded even though the // address computation might be vectorized in a smaller type. // // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && llvm::all_of(TreeRoot, [](Value *R) { assert(R->hasOneUse() && "Root should have only one use!"); return isa(R->user_back()); })) { MaxBitWidth = 8u; // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { KnownBits Known = computeKnownBits(R, *DL); return Known.isNonNegative(); }); // Determine the maximum number of bits required to store the scalar // values. for (auto *Scalar : ToDemote) { auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT); auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); MaxBitWidth = std::max(NumTypeBits - NumSignBits, MaxBitWidth); } // If we can't prove that the sign bit is zero, we must add one to the // maximum bit width to account for the unknown sign bit. This preserves // the existing sign bit so we can safely sign-extend the root back to the // original type. Otherwise, if we know the sign bit is zero, we will // zero-extend the root instead. // // FIXME: This is somewhat suboptimal, as there will be cases where adding // one to the maximum bit width will yield a larger-than-necessary // type. In general, we need to add an extra bit only if we can't // prove that the upper bit of the original type is equal to the // upper bit of the proposed smaller type. If these two bits are the // same (either zero or one) we know that sign-extending from the // smaller type will result in the same value. Here, since we can't // yet prove this, we are just making the proposed smaller type // larger to ensure correctness. 
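// A standalone sketch of the width computation in this block: take the widest
// "type bits minus known sign bits" over the values to demote, reserve one
// extra bit when the sign bit is not known to be zero, round up to a power of
// two, and only narrow if that beats the root type. The ScalarInfo inputs
// stand in for what ComputeNumSignBits / computeKnownBits provide; this is a
// sketch of the idea, not the LLVM API.
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

struct ScalarInfo {
  unsigned TypeBits;      // width of the scalar's type in bits
  unsigned NumSignBits;   // number of known redundant sign bits
};

unsigned roundUpToPowerOfTwo(unsigned X) {
  unsigned P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

// Returns 0 if demotion would not actually narrow the roots.
unsigned computeDemotedWidth(const std::vector<ScalarInfo> &ToDemote,
                             bool IsKnownPositive, unsigned RootBits) {
  unsigned MaxBitWidth = 8;                       // never narrower than i8
  for (const ScalarInfo &S : ToDemote)
    MaxBitWidth = std::max(S.TypeBits - S.NumSignBits, MaxBitWidth);
  if (!IsKnownPositive)
    ++MaxBitWidth;                                // keep a copy of the sign bit
  MaxBitWidth = roundUpToPowerOfTwo(MaxBitWidth);
  return MaxBitWidth < RootBits ? MaxBitWidth : 0;
}

int main() {
  // e.g. i32 values where the top 20 bits are all sign bits: 32 - 20 = 12,
  // plus one unknown sign bit = 13, rounded up to 16.
  std::vector<ScalarInfo> Vals = {{32, 20}, {32, 24}};
  std::printf("%u\n", computeDemotedWidth(Vals, /*IsKnownPositive=*/false, 32)); // 16
}
#endif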
if (!IsKnownPositive) ++MaxBitWidth; } // Round MaxBitWidth up to the next power-of-two. if (!isPowerOf2_64(MaxBitWidth)) MaxBitWidth = NextPowerOf2(MaxBitWidth); // If the maximum bit width we compute is less than the with of the roots' // type, we can proceed with the narrowing. Otherwise, do nothing. if (MaxBitWidth >= TreeRootIT->getBitWidth()) return; // If we can truncate the root, we must collect additional values that might // be demoted as a result. That is, those seeded by truncations we will // modify. while (!Roots.empty()) collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); // Finally, map the values we can demote to the maximum bit with we computed. for (auto *Scalar : ToDemote) MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); } namespace { /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { SLPVectorizerPass Impl; /// Pass identification, replacement for typeid static char ID; explicit SLPVectorizer() : FunctionPass(ID) { initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); } bool doInitialization(Module &M) override { return false; } bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; auto *SE = &getAnalysis().getSE(); auto *TTI = &getAnalysis().getTTI(F); auto *TLIP = getAnalysisIfAvailable(); auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; auto *AA = &getAnalysis().getAAResults(); auto *LI = &getAnalysis().getLoopInfo(); auto *DT = &getAnalysis().getDomTree(); auto *AC = &getAnalysis().getAssumptionCache(F); auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.setPreservesCFG(); } }; } // end anonymous namespace PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { auto *SE = &AM.getResult(F); auto *TTI = &AM.getResult(F); auto *TLI = AM.getCachedResult(F); auto *AA = &AM.getResult(F); auto *LI = &AM.getResult(F); auto *DT = &AM.getResult(F); auto *AC = &AM.getResult(F); auto *DB = &AM.getResult(F); auto *ORE = &AM.getResult(F); bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet(); PA.preserve(); PA.preserve(); return PA; } bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { SE = SE_; TTI = TTI_; TLI = TLI_; AA = AA_; LI = LI_; DT = DT_; AC = AC_; DB = DB_; DL = &F.getParent()->getDataLayout(); Stores.clear(); GEPs.clear(); bool Changed = false; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(true)) return false; // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); // Use the bottom up slp vectorizer to construct chains that start with // store instructions. 
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { collectSeedInstructions(BB); // Vectorize trees that end at stores. if (!Stores.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() << " underlying objects.\n"); Changed |= vectorizeStoreChains(R); } // Vectorize trees that end at reductions. Changed |= vectorizeChainsInBlock(BB, R); // Vectorize the index computations of getelementptr instructions. This // is primarily intended to catch gather-like idioms ending at // non-consecutive loads. if (!GEPs.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } } if (Changed) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); LLVM_DEBUG(verifyFunction(F)); } return Changed; } /// Check that the Values in the slice in VL array are still existent in /// the WeakTrackingVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakTrackingVH /// array. static bool hasValueBeenRAUWed(ArrayRef VL, ArrayRef VH, unsigned SliceBegin, unsigned SliceSize) { VL = VL.slice(SliceBegin, SliceSize); VH = VH.slice(SliceBegin, SliceSize); return !std::equal(VL.begin(), VL.end(), VH.begin()); } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned VecRegSize) { const unsigned ChainLen = Chain.size(); LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); const unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; // Keep track of values that were deleted by vectorizing in the loop below. const SmallVector TrackValues(Chain.begin(), Chain.end()); bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { // Check that a previous iteration of this loop did not delete the Value. if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); ArrayRef Operands = Chain.slice(i, VF); R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) continue; R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); using namespace ore; R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", cast(Chain[i])) << "Stores SLP vectorized with cost " << NV("Cost", Cost) << " and with tree size " << NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); // Move to the next bundle. i += VF - 1; Changed = true; } } return Changed; } bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, BoUpSLP &R) { SetVector Heads; SmallDenseSet Tails; SmallDenseMap ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. 
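// A standalone sketch of the chain discovery done below. Stores are reduced
// to their (already scaled) addresses, "consecutive" simply means the
// addresses differ by one element, and for every store the nearest other
// stores are probed first (Idx-1, Idx+1, Idx-2, Idx+2, ...). Chains are then
// read off by following the links from every head that is not also a tail.
// Toy stand-ins only; the pass uses isConsecutiveAccess() on real StoreInsts.
#if 0
#include <cstdio>
#include <map>
#include <set>
#include <vector>

std::vector<std::vector<int>> findStoreChains(const std::vector<int> &Addr) {
  int E = Addr.size();
  std::map<int, int> Next;          // ConsecutiveChain: index -> next index
  std::set<int> Heads, Tails;
  for (int Idx = E - 1; Idx >= 0; --Idx) {
    for (int Offset = 1; Offset < E; ++Offset) {    // nearest candidates first
      for (int K : {Idx - Offset, Idx + Offset}) {
        if (K < 0 || K >= E)
          continue;
        if (Addr[K] + 1 == Addr[Idx]) {             // Stores[K] directly precedes Stores[Idx]
          Heads.insert(K);
          Tails.insert(Idx);
          Next[K] = Idx;
          Offset = E;                               // stop probing for this Idx
          break;
        }
      }
    }
  }
  std::vector<std::vector<int>> Chains;
  for (int H : Heads) {
    if (Tails.count(H))
      continue;                                     // only start at real chain heads
    std::vector<int> Chain;
    for (int I = H; ; I = Next[I]) {
      Chain.push_back(Addr[I]);
      if (!Next.count(I))
        break;
    }
    Chains.push_back(Chain);
  }
  return Chains;
}

int main() {
  for (const auto &C : findStoreChains({40, 42, 41, 43, 100})) {
    for (int A : C)
      std::printf("%d ", A);
    std::printf("\n");                              // prints: 40 41 42 43
  }
}
#endif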
BoUpSLP::ValueSet VectorizedStores; bool Changed = false; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. SmallVector IndexQueue; unsigned E = Stores.size(); IndexQueue.resize(E - 1); for (unsigned I = E; I > 0; --I) { unsigned Idx = I - 1; // If a store has multiple consecutive store candidates, search Stores // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. unsigned Offset = 1; unsigned Cnt = 0; for (unsigned J = 0; J < E - 1; ++J, ++Offset) { if (Idx >= Offset) { IndexQueue[Cnt] = Idx - Offset; ++Cnt; } if (Idx + Offset < E) { IndexQueue[Cnt] = Idx + Offset; ++Cnt; } } for (auto K : IndexQueue) { if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) { Tails.insert(Stores[Idx]); Heads.insert(Stores[K]); ConsecutiveChain[Stores[K]] = Stores[Idx]; break; } } } // For stores that start but don't end a link in the chain: for (auto *SI : llvm::reverse(Heads)) { if (Tails.count(SI)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; StoreInst *I = SI; // Collect the chain into a list. while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { Operands.push_back(I); // Move to the next value in the chain. I = ConsecutiveChain[I]; } // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) { if (vectorizeStoreChain(Operands, R, Size)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; break; } } } return Changed; } void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { // Initialize the collections. We will make a single pass over the block. Stores.clear(); GEPs.clear(); // Visit the store and getelementptr instructions in BB and organize them in // Stores and GEPs according to the underlying objects of their pointer // operands. for (Instruction &I : *BB) { // Ignore store instructions that are volatile or have a pointer operand // that doesn't point to a scalar type. if (auto *SI = dyn_cast(&I)) { if (!SI->isSimple()) continue; if (!isValidElementType(SI->getValueOperand()->getType())) continue; Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI); } // Ignore getelementptr instructions that have more than one index, a // constant index, or a pointer operand that doesn't point to a scalar // type. 
else if (auto *GEP = dyn_cast(&I)) { auto Idx = GEP->idx_begin()->get(); if (GEP->getNumIndices() > 1 || isa(Idx)) continue; if (!isValidElementType(Idx->getType())) continue; if (GEP->getType()->isVectorTy()) continue; GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP); } } } bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; return tryToVectorizeList(VL, R, /*UserCost=*/0, true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, int UserCost, bool AllowReorder) { if (VL.size() < 2) return false; LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() << ".\n"); // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. InstructionsState S = getSameOpcode(VL); if (!S.getOpcode()) return false; Instruction *I0 = cast(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) << "Cannot SLP vectorize list: vectorization factor " << "less than 2 is not supported"; }); return false; } for (Value *V : VL) { Type *Ty = V->getType(); if (!isValidElementType(Ty)) { // NOTE: the following will give user internal llvm type name, which may // not be useful. R.getORE()->emit([&]() { std::string type_str; llvm::raw_string_ostream rso(type_str); Ty->print(rso); return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0) << "Cannot SLP vectorize list: type " << rso.str() + " is unsupported by vectorizer"; }); return false; } } bool Changed = false; bool CandidateFound = false; int MinCost = SLPCostThreshold; // Keep track of values that were deleted by vectorizing in the loop below. SmallVector TrackValues(VL.begin(), VL.end()); unsigned NextInst = 0, MaxInst = VL.size(); for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). auto *VecTy = VectorType::get(VL[0]->getType(), VF); if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { unsigned OpsWidth = 0; if (I + VF > MaxInst) OpsWidth = MaxInst - I; else OpsWidth = VF; if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; // Check that a previous iteration of this loop did not delete the Value. if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); ArrayRef Ops = VL.slice(I, OpsWidth); R.buildTree(Ops); Optional> Order = R.bestOrder(); // TODO: check if we can allow reordering for more cases. if (AllowReorder && Order) { // TODO: reorder tree nodes without tree rebuilding. // Conceptually, there is nothing actually preventing us from trying to // reorder a larger list. In fact, we do exactly this when vectorizing // reductions. However, at this point, we only expect to get here when // there are exactly two operations. 
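// A standalone sketch of the enclosing search strategy: try the largest
// power-of-two width first, slide a window of that width over the candidate
// list, jump past a window once it is vectorized, and halve the width when
// nothing profitable is found. isProfitable stands in for building the SLP
// tree and checking its cost against the threshold; the deleted-value
// tracking and partial last windows of the real loop are ignored here.
#if 0
#include <cstdio>
#include <functional>

unsigned powerOf2Floor(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

// Returns the number of values that ended up in some vectorized bundle.
unsigned searchWindows(unsigned NumValues, unsigned MinVF,
                       const std::function<bool(unsigned, unsigned)> &isProfitable) {
  unsigned Vectorized = 0;
  for (unsigned VF = powerOf2Floor(NumValues); VF >= MinVF; VF /= 2) {
    for (unsigned I = 0; I + VF <= NumValues; ++I) {
      if (isProfitable(I, VF)) {     // buildTree(slice(I, VF)), cost beats threshold
        Vectorized += VF;
        I += VF - 1;                 // move to the next bundle
      }
    }
  }
  return Vectorized;
}

int main() {
  // Pretend only windows of width 4 starting at offsets 0 and 4 pay off.
  unsigned N = searchWindows(10, 2, [](unsigned I, unsigned VF) {
    return VF == 4 && I % 2 == 0;
  });
  std::printf("%u values vectorized\n", N);   // prints: 8 values vectorized
}
#endif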
assert(Ops.size() == 2); Value *ReorderedOps[] = {Ops[1], Ops[0]}; R.buildTree(ReorderedOps, None); } if (R.isTreeTinyAndNotFullyVectorizable()) continue; R.computeMinimumValueSizes(); int Cost = R.getTreeCost() - UserCost; CandidateFound = true; MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", cast(Ops[0])) << "SLP vectorized with cost " << ore::NV("Cost", Cost) << " and with tree size " << ore::NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); // Move to the next bundle. I += VF - 1; NextInst = I + 1; Changed = true; } } } if (!Changed && CandidateFound) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0) << "List vectorization was possible but not beneficial with cost " << ore::NV("Cost", MinCost) << " >= " << ore::NV("Treshold", -SLPCostThreshold); }); } else if (!Changed) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0) << "Cannot SLP vectorize list: vectorization was impossible" << " with available vectorization factors"; }); } return Changed; } bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!I) return false; if (!isa(I) && !isa(I)) return false; Value *P = I->getParent(); // Vectorize in current basic block only. auto *Op0 = dyn_cast(I->getOperand(0)); auto *Op1 = dyn_cast(I->getOperand(1)); if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) return false; // Try to vectorize V. if (tryToVectorizePair(Op0, Op1, R)) return true; auto *A = dyn_cast(Op0); auto *B = dyn_cast(Op1); // Try to skip B. if (B && B->hasOneUse()) { auto *B0 = dyn_cast(B->getOperand(0)); auto *B1 = dyn_cast(B->getOperand(1)); if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) return true; if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) return true; } // Try to skip A. if (A && A->hasOneUse()) { auto *A0 = dyn_cast(A->getOperand(0)); auto *A1 = dyn_cast(A->getOperand(1)); if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) return true; if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) return true; } return false; } /// Generate a shuffle mask to be used in a reduction tree. /// /// \param VecLen The length of the vector to be reduced. /// \param NumEltsToRdx The number of elements that should be reduced in the /// vector. /// \param IsPairwise Whether the reduction is a pairwise or splitting /// reduction. A pairwise reduction will generate a mask of /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, bool IsPairwise, bool IsLeft, IRBuilder<> &Builder) { assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); SmallVector ShuffleMask( VecLen, UndefValue::get(Builder.getInt32Ty())); if (IsPairwise) // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). for (unsigned i = 0; i != NumEltsToRdx; ++i) ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); else // Move the upper half of the vector to the lower half. for (unsigned i = 0; i != NumEltsToRdx; ++i) ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); return ConstantVector::get(ShuffleMask); } namespace { /// Model horizontal reductions. 
/// /// A horizontal reduction is a tree of reduction operations (currently add and /// fadd) that has operations that can be put into a vector as its leaf. /// For example, this tree: /// /// mul mul mul mul /// \ / \ / /// + + /// \ / /// + /// This tree has "mul" as its reduced values and "+" as its reduction /// operations. A reduction might be feeding into a store or a binary operation /// feeding a phi. /// ... /// \ / /// + /// | /// phi += /// /// Or: /// ... /// \ / /// + /// | /// *p = /// class HorizontalReduction { using ReductionOpsType = SmallVector; using ReductionOpsListType = SmallVector; ReductionOpsListType ReductionOps; SmallVector ReducedVals; // Use map vector to make stable output. MapVector ExtraArgs; /// Kind of the reduction data. enum ReductionKind { RK_None, /// Not a reduction. RK_Arithmetic, /// Binary reduction data. RK_Min, /// Minimum reduction data. RK_UMin, /// Unsigned minimum reduction data. RK_Max, /// Maximum reduction data. RK_UMax, /// Unsigned maximum reduction data. }; /// Contains info about operation, like its opcode, left and right operands. class OperationData { /// Opcode of the instruction. unsigned Opcode = 0; /// Left operand of the reduction operation. Value *LHS = nullptr; /// Right operand of the reduction operation. Value *RHS = nullptr; /// Kind of the reduction operation. ReductionKind Kind = RK_None; /// True if float point min/max reduction has no NaNs. bool NoNaN = false; /// Checks if the reduction operation can be vectorized. bool isVectorizable() const { return LHS && RHS && // We currently only support adds && min/max reductions. ((Kind == RK_Arithmetic && (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) || ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && (Kind == RK_Min || Kind == RK_Max)) || (Opcode == Instruction::ICmp && (Kind == RK_UMin || Kind == RK_UMax))); } /// Creates reduction operation with the current opcode. Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); Value *Cmp; switch (Kind) { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, Name); case RK_Min: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) : Builder.CreateFCmpOLT(LHS, RHS); break; case RK_Max: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) : Builder.CreateFCmpOGT(LHS, RHS); break; case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpULT(LHS, RHS); break; case RK_UMax: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpUGT(LHS, RHS); break; case RK_None: llvm_unreachable("Unknown reduction operation."); } return Builder.CreateSelect(Cmp, LHS, RHS, Name); } public: explicit OperationData() = default; /// Construction for reduced values. They are identified by opcode only and /// don't have associated LHS/RHS values. explicit OperationData(Value *V) { if (auto *I = dyn_cast(V)) Opcode = I->getOpcode(); } /// Constructor for reduction operations with opcode and its left and /// right operands. OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind, bool NoNaN = false) : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) { assert(Kind != RK_None && "One of the reduction operations is expected."); } explicit operator bool() const { return Opcode; } /// Get the index of the first operand. 
unsigned getFirstOperandIndex() const { assert(!!*this && "The opcode is not set."); switch (Kind) { case RK_Min: case RK_UMin: case RK_Max: case RK_UMax: return 1; case RK_Arithmetic: case RK_None: break; } return 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); switch (Kind) { case RK_Arithmetic: return 2; case RK_Min: case RK_UMin: case RK_Max: case RK_UMax: return 3; case RK_None: break; } llvm_unreachable("Reduction kind is not set"); } /// Checks if the operation has the same parent as \p P. bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; switch (Kind) { case RK_Arithmetic: // Arithmetic reduction operation must be used once only. return I->getParent() == P; case RK_Min: case RK_UMin: case RK_Max: case RK_UMax: { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast(cast(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } case RK_None: break; } llvm_unreachable("Reduction kind is not set"); } /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); switch (Kind) { case RK_Arithmetic: return I->hasOneUse(); case RK_Min: case RK_UMin: case RK_Max: case RK_UMax: return I->hasNUses(2) && (!IsReductionOp || cast(I)->getCondition()->hasOneUse()); case RK_None: break; } llvm_unreachable("Reduction kind is not set"); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); switch (Kind) { case RK_Arithmetic: ReductionOps.assign(1, ReductionOpsType()); break; case RK_Min: case RK_UMin: case RK_Max: case RK_UMax: ReductionOps.assign(2, ReductionOpsType()); break; case RK_None: llvm_unreachable("Reduction kind is not set"); } } /// Add all reduction operations for the reduction instruction \p I. void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); switch (Kind) { case RK_Arithmetic: ReductionOps[0].emplace_back(I); break; case RK_Min: case RK_UMin: case RK_Max: case RK_UMax: ReductionOps[0].emplace_back(cast(I)->getCondition()); ReductionOps[1].emplace_back(I); break; case RK_None: llvm_unreachable("Reduction kind is not set"); } } /// Checks if instruction is associative and can be vectorized. bool isAssociative(Instruction *I) const { assert(Kind != RK_None && *this && LHS && RHS && "Expected reduction operation."); switch (Kind) { case RK_Arithmetic: return I->isAssociative(); case RK_Min: case RK_Max: return Opcode == Instruction::ICmp || cast(I->getOperand(0))->isFast(); case RK_UMin: case RK_UMax: assert(Opcode == Instruction::ICmp && "Only integer compare operation is expected."); return true; case RK_None: break; } llvm_unreachable("Reduction kind is not set"); } /// Checks if the reduction operation can be vectorized. bool isVectorizable(Instruction *I) const { return isVectorizable() && isAssociative(I); } /// Checks if two operation data are both a reduction op or both a reduced /// value. 
bool operator==(const OperationData &OD) { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } bool operator!=(const OperationData &OD) { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; RHS = nullptr; Kind = RK_None; NoNaN = false; } /// Get the opcode of the reduction operation. unsigned getOpcode() const { assert(isVectorizable() && "Expected vectorizable operation."); return Opcode; } /// Get kind of reduction data. ReductionKind getKind() const { return Kind; } Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { switch (Kind) { case RK_Arithmetic: return nullptr; case RK_Min: case RK_Max: case RK_UMin: case RK_UMax: return CmpInst::makeCmpResultType(LHS->getType()); case RK_None: break; } llvm_unreachable("Reduction kind is not set"); } /// Creates reduction operation with the current opcode with the IR flags /// from \p ReductionOps. Value *createOp(IRBuilder<> &Builder, const Twine &Name, const ReductionOpsListType &ReductionOps) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); auto *Op = createOp(Builder, Name); switch (Kind) { case RK_Arithmetic: propagateIRFlags(Op, ReductionOps[0]); return Op; case RK_Min: case RK_Max: case RK_UMin: case RK_UMax: if (auto *SI = dyn_cast(Op)) propagateIRFlags(SI->getCondition(), ReductionOps[0]); propagateIRFlags(Op, ReductionOps[1]); return Op; case RK_None: break; } llvm_unreachable("Unknown reduction operation."); } /// Creates reduction operation with the current opcode with the IR flags /// from \p I. Value *createOp(IRBuilder<> &Builder, const Twine &Name, Instruction *I) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); auto *Op = createOp(Builder, Name); switch (Kind) { case RK_Arithmetic: propagateIRFlags(Op, I); return Op; case RK_Min: case RK_Max: case RK_UMin: case RK_UMax: if (auto *SI = dyn_cast(Op)) { propagateIRFlags(SI->getCondition(), cast(I)->getCondition()); } propagateIRFlags(Op, I); return Op; case RK_None: break; } llvm_unreachable("Unknown reduction operation."); } TargetTransformInfo::ReductionFlags getFlags() const { TargetTransformInfo::ReductionFlags Flags; Flags.NoNaN = NoNaN; switch (Kind) { case RK_Arithmetic: break; case RK_Min: Flags.IsSigned = Opcode == Instruction::ICmp; Flags.IsMaxOp = false; break; case RK_Max: Flags.IsSigned = Opcode == Instruction::ICmp; Flags.IsMaxOp = true; break; case RK_UMin: Flags.IsSigned = false; Flags.IsMaxOp = false; break; case RK_UMax: Flags.IsSigned = false; Flags.IsMaxOp = true; break; case RK_None: llvm_unreachable("Reduction kind is not set"); } return Flags; } }; Instruction *ReductionRoot = nullptr; /// The operation data of the reduction operation. OperationData ReductionData; /// The operation data of the values we perform a reduction on. OperationData ReducedValueData; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. bool IsPairwiseReduction = false; /// Checks if the ParentStackElem.first should be marked as a reduction /// operation with an extra argument or as extra argument itself. 
void markExtraArg(std::pair &ParentStackElem, Value *ExtraArg) { if (ExtraArgs.count(ParentStackElem.first)) { ExtraArgs[ParentStackElem.first] = nullptr; // We ran into something like: // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. // The whole ParentStackElem.first should be considered as an extra value // in this case. // Do not perform analysis of remaining operands of ParentStackElem.first // instruction, this whole instruction is an extra argument. ParentStackElem.second = ParentStackElem.first->getNumOperands(); } else { // We ran into something like: // ParentStackElem.first += ... + ExtraArg + ... ExtraArgs[ParentStackElem.first] = ExtraArg; } } static OperationData getOperationData(Value *V) { if (!V) return OperationData(); Value *LHS; Value *RHS; if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) { return OperationData(cast(V)->getOpcode(), LHS, RHS, RK_Arithmetic); } if (auto *Select = dyn_cast(V)) { // Look for a min/max pattern. if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) || m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData( Instruction::FCmp, LHS, RHS, RK_Min, cast(Select->getCondition())->hasNoNaNs()); } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) || m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) { return OperationData( Instruction::FCmp, LHS, RHS, RK_Max, cast(Select->getCondition())->hasNoNaNs()); } else { // Try harder: look for min/max pattern based on instructions producing // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). // During the intermediate stages of SLP, it's very common to have // pattern like this (since optimizeGatherSequence is run only once // at the end): // %1 = extractelement <2 x i32> %a, i32 0 // %2 = extractelement <2 x i32> %a, i32 1 // %cond = icmp sgt i32 %1, %2 // %3 = extractelement <2 x i32> %a, i32 0 // %4 = extractelement <2 x i32> %a, i32 1 // %select = select i1 %cond, i32 %3, i32 %4 CmpInst::Predicate Pred; Instruction *L1; Instruction *L2; LHS = Select->getTrueValue(); RHS = Select->getFalseValue(); Value *Cond = Select->getCondition(); // TODO: Support inverse predicates. 
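// A standalone sketch of the classification done below: once the select is
// known to pick between the two compared values, the integer predicate alone
// determines the min/max reduction kind (the ordered/unordered FP predicates
// map the same way, with an extra no-NaNs flag). The enums are local
// stand-ins for CmpInst::Predicate and the reduction kinds.
#if 0
#include <cstdio>

enum Predicate { ICMP_ULT, ICMP_ULE, ICMP_SLT, ICMP_SLE,
                 ICMP_UGT, ICMP_UGE, ICMP_SGT, ICMP_SGE, OTHER };
enum ReductionKind { RK_None, RK_Min, RK_UMin, RK_Max, RK_UMax };

// Kind of "select (cmp Pred A, B), A, B" when the true value is A (the LHS).
ReductionKind classifyMinMax(Predicate Pred) {
  switch (Pred) {
  case ICMP_ULT: case ICMP_ULE: return RK_UMin;   // a < b ? a : b, unsigned
  case ICMP_SLT: case ICMP_SLE: return RK_Min;    // a < b ? a : b, signed
  case ICMP_UGT: case ICMP_UGE: return RK_UMax;   // a > b ? a : b, unsigned
  case ICMP_SGT: case ICMP_SGE: return RK_Max;    // a > b ? a : b, signed
  default:                      return RK_None;   // not a recognized min/max
  }
}

int main() {
  std::printf("%d %d\n", classifyMinMax(ICMP_SGT), classifyMinMax(ICMP_ULE));
  // prints "3 2" with the enum values above, i.e. RK_Max and RK_UMin
}
#endif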
if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { if (!isa(RHS) || !L2->isIdenticalTo(cast(RHS))) return OperationData(V); } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { if (!isa(LHS) || !L1->isIdenticalTo(cast(LHS))) return OperationData(V); } else { if (!isa(LHS) || !isa(RHS)) return OperationData(V); if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || !L1->isIdenticalTo(cast(LHS)) || !L2->isIdenticalTo(cast(RHS))) return OperationData(V); } switch (Pred) { default: return OperationData(V); case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin); case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: return OperationData(Instruction::ICmp, LHS, RHS, RK_Min); case CmpInst::FCMP_OLT: case CmpInst::FCMP_OLE: case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: return OperationData(Instruction::FCmp, LHS, RHS, RK_Min, cast(Cond)->hasNoNaNs()); case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax); case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: return OperationData(Instruction::ICmp, LHS, RHS, RK_Max); case CmpInst::FCMP_OGT: case CmpInst::FCMP_OGE: case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE: return OperationData(Instruction::FCmp, LHS, RHS, RK_Max, cast(Cond)->hasNoNaNs()); } } } return OperationData(V); } public: HorizontalReduction() = default; /// Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, Instruction *B) { assert((!Phi || is_contained(Phi->operands(), B)) && "Thi phi needs to use the binary operator"); ReductionData = getOperationData(B); // We could have a initial reductions that is not an add. // r *= v1 + v2 + v3 + v4 // In such a case start looking for a tree rooted in the first '+'. if (Phi) { if (ReductionData.getLHS() == Phi) { Phi = nullptr; B = dyn_cast(ReductionData.getRHS()); ReductionData = getOperationData(B); } else if (ReductionData.getRHS() == Phi) { Phi = nullptr; B = dyn_cast(ReductionData.getLHS()); ReductionData = getOperationData(B); } } if (!ReductionData.isVectorizable(B)) return false; Type *Ty = B->getType(); if (!isValidElementType(Ty)) return false; if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy()) return false; ReducedValueData.clear(); ReductionRoot = B; // Post order traverse the reduction tree starting at B. We only handle true // trees containing only binary operators. SmallVector, 32> Stack; Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex())); ReductionData.initReductionOps(ReductionOps); while (!Stack.empty()) { Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; OperationData OpData = getOperationData(TreeN); bool IsReducedValue = OpData != ReductionData; // Postorder vist. if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) { if (IsReducedValue) ReducedVals.push_back(TreeN); else { auto I = ExtraArgs.find(TreeN); if (I != ExtraArgs.end() && !I->second) { // Check if TreeN is an extra argument of its parent operation. if (Stack.size() <= 1) { // TreeN can't be an extra argument as it is a root reduction // operation. return false; } // Yes, TreeN is an extra argument, do not add it to a list of // reduction operations. // Stack[Stack.size() - 2] always points to the parent operation. markExtraArg(Stack[Stack.size() - 2], TreeN); ExtraArgs.erase(TreeN); } else ReductionData.addReductionOps(TreeN, ReductionOps); } // Retract. Stack.pop_back(); continue; } // Visit left or right. 
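// A standalone sketch of the traversal the enclosing loop performs: an
// explicit stack of (node, next operand index) walks the reduction tree
// without recursion, descends into operands that still look like the
// reduction operation, and records everything else as a reduced value at the
// post-order point. The Node type stands in for Instruction; only '+' is
// treated as the reduction operation here, and extra-argument handling is
// left out.
#if 0
#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  char Op;            // '+' is the reduction op, anything else is a leaf
  const Node *L;
  const Node *R;
};

std::vector<const Node *> collectReducedValues(const Node *Root) {
  std::vector<const Node *> ReducedVals;
  std::vector<std::pair<const Node *, int>> Stack = {{Root, 0}};
  while (!Stack.empty()) {
    const Node *N = Stack.back().first;
    int Edge = Stack.back().second++;      // which operand to visit next
    bool IsReducedValue = N->Op != '+';
    if (IsReducedValue || Edge == 2) {     // post-order point: leaf or all operands done
      if (IsReducedValue)
        ReducedVals.push_back(N);          // otherwise it is a reduction op
      Stack.pop_back();                    // retract
      continue;
    }
    const Node *Next = Edge == 0 ? N->L : N->R;
    Stack.push_back({Next, 0});            // descend into the operand
  }
  return ReducedVals;
}

int main() {
  // (a * b) + ((c * d) + (e * f)) -- three 'mul' leaves feed a '+' tree.
  Node A{'*', nullptr, nullptr}, B{'*', nullptr, nullptr}, C{'*', nullptr, nullptr};
  Node Inner{'+', &B, &C};
  Node Root{'+', &A, &Inner};
  std::printf("%zu reduced values\n", collectReducedValues(&Root).size()); // 3
}
#endif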
Value *NextV = TreeN->getOperand(EdgeToVist); if (NextV != Phi) { auto *I = dyn_cast(NextV); OpData = getOperationData(I); // Continue analysis if the next operand is a reduction operation or // (possibly) a reduced value. If the reduced value opcode is not set, // the first met operation != reduction operation is considered as the // reduced value class. if (I && (!ReducedValueData || OpData == ReducedValueData || OpData == ReductionData)) { const bool IsReductionOperation = OpData == ReductionData; // Only handle trees in the current basic block. if (!ReductionData.hasSameParent(I, B->getParent(), IsReductionOperation)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } // Each tree node needs to have minimal number of users except for the // ultimate reduction. if (!ReductionData.hasRequiredNumberOfUses(I, OpData == ReductionData) && I != B) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } if (IsReductionOperation) { // We need to be able to reassociate the reduction operations. if (!OpData.isAssociative(I)) { // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } } else if (ReducedValueData && ReducedValueData != OpData) { // Make sure that the opcodes of the operations that we are going to // reduce match. // I is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), I); continue; } else if (!ReducedValueData) ReducedValueData = OpData; Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex())); continue; } } // NextV is an extra argument for TreeN (its parent operation). markExtraArg(Stack.back(), NextV); } return true; } /// Attempt to vectorize the tree found by /// matchAssociativeReduction. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { if (ReducedVals.empty()) return false; // If there is a sufficient number of reduction values, reduce // to a nearby power-of-2. Can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); if (NumReducedVals < 4) return false; unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setFast(); Builder.setFastMathFlags(Unsafe); unsigned i = 0; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several time, so log each attempt // to use it. for (auto &Pair : ExtraArgs) ExternallyUsedValues[Pair.second].push_back(Pair.first); SmallVector IgnoreList; for (auto &V : ReductionOps) IgnoreList.append(V.begin(), V.end()); while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == VL.size()) { // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) break; V.computeMinimumValueSizes(); // Estimate cost. 
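// Before the cost of each candidate is estimated, note how the enclosing loop
// carves the reduced values into chunks: take the largest power-of-two prefix
// that is still wider than 2, reduce it as one vector, then retry with the
// largest power-of-two that fits in what remains; the leftovers are folded in
// with scalar reduction operations afterwards. A sketch in terms of widths
// only, ignoring the profitability checks.
#if 0
#include <cstdio>
#include <vector>

unsigned powerOf2Floor(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

// Returns the vector widths used, and reports how many values stay scalar.
std::vector<unsigned> planReductionWidths(unsigned NumReducedVals,
                                          unsigned &NumScalarTail) {
  std::vector<unsigned> Widths;
  unsigned I = 0;
  unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
  while (I < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
    Widths.push_back(ReduxWidth);                  // one vectorized reduction
    I += ReduxWidth;
    ReduxWidth = powerOf2Floor(NumReducedVals - I);
  }
  NumScalarTail = NumReducedVals - I;              // finished with scalar ops
  return Widths;
}

int main() {
  unsigned Tail = 0;
  for (unsigned W : planReductionWidths(13, Tail))
    std::printf("vector width %u\n", W);           // prints 8 then 4
  std::printf("%u scalar tail values\n", Tail);    // prints 1
}
#endif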
int TreeCost = V.getTreeCost(); int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { return OptimizationRemarkMissed( SV_NAME, "HorSLPNotBeneficial", cast(VL[0])) << "Vectorizing horizontal reduction is possible" << "but not beneficial with cost " << ore::NV("Cost", Cost) << " and threshold " << ore::NV("Threshold", -SLPCostThreshold); }); break; } LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { return OptimizationRemark( SV_NAME, "VectorizedHorizontalReduction", cast(VL[0])) << "Vectorized horizontal reduction with cost " << ore::NV("Cost", Cost) << " and with tree size " << ore::NV("TreeSize", V.getTreeSize()); }); // Vectorize a tree. DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); // Emit a reduction. Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree, ReducedSubTree, ReductionData.getKind()); VectorizedTree = VectReductionData.createOp(Builder, "op.rdx", ReductionOps); } else VectorizedTree = ReducedSubTree; i += ReduxWidth; ReduxWidth = PowerOf2Floor(NumReducedVals - i); } if (VectorizedTree) { // Finish the reduction. for (; i < NumReducedVals; ++i) { auto *I = cast(ReducedVals[i]); Builder.SetCurrentDebugLocation(I->getDebugLoc()); OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree, I, ReductionData.getKind()); VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps); } for (auto &Pair : ExternallyUsedValues) { assert(!Pair.second.empty() && "At least one DebugLoc must be inserted"); // Add each externally used value to the final reduction. for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); OperationData VectReductionData(ReductionData.getOpcode(), VectorizedTree, Pair.first, ReductionData.getKind()); VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } // Update users. ReductionRoot->replaceAllUsesWith(VectorizedTree); } return VectorizedTree != nullptr; } unsigned numReductionValues() const { return ReducedVals.size(); } private: /// Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, unsigned ReduxWidth) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); int PairwiseRdxCost; int SplittingRdxCost; switch (ReductionData.getKind()) { case RK_Arithmetic: PairwiseRdxCost = TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, /*IsPairwiseForm=*/true); SplittingRdxCost = TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, /*IsPairwiseForm=*/false); break; case RK_Min: case RK_Max: case RK_UMin: case RK_UMax: { Type *VecCondTy = CmpInst::makeCmpResultType(VecTy); bool IsUnsigned = ReductionData.getKind() == RK_UMin || ReductionData.getKind() == RK_UMax; PairwiseRdxCost = TTI->getMinMaxReductionCost(VecTy, VecCondTy, /*IsPairwiseForm=*/true, IsUnsigned); SplittingRdxCost = TTI->getMinMaxReductionCost(VecTy, VecCondTy, /*IsPairwiseForm=*/false, IsUnsigned); break; } case RK_None: llvm_unreachable("Expected arithmetic or min/max reduction operation"); } IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? 
PairwiseRdxCost : SplittingRdxCost; int ScalarReduxCost; switch (ReductionData.getKind()) { case RK_Arithmetic: ScalarReduxCost = TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy); break; case RK_Min: case RK_Max: case RK_UMin: case RK_UMax: ScalarReduxCost = TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) + TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, CmpInst::makeCmpResultType(ScalarTy)); break; case RK_None: llvm_unreachable("Expected arithmetic or min/max reduction operation"); } ScalarReduxCost *= (ReduxWidth - 1); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost << " for reduction that starts with " << *FirstReducedVal << " (It is a " << (IsPairwiseReduction ? "pairwise" : "splitting") << " reduction)\n"); return VecReduxCost - ScalarReduxCost; } /// Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, unsigned ReduxWidth, const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); if (!IsPairwiseReduction) return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, ReductionData.getFlags(), ReductionOps.back()); Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { Value *LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true, Builder); Value *RightMask = createRdxShuffleMask(ReduxWidth, i, true, false, Builder); Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf, RightShuf, ReductionData.getKind()); TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps); } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } }; } // end anonymous namespace /// Recognize construction of vectors like /// %ra = insertelement <4 x float> undef, float %s0, i32 0 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 /// starting from the last insertelement instruction. /// /// Returns true if it matches static bool findBuildVector(InsertElementInst *LastInsertElem, TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, int &UserCost) { UserCost = 0; Value *V = nullptr; do { if (auto *CI = dyn_cast(LastInsertElem->getOperand(2))) { UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, LastInsertElem->getType(), CI->getZExtValue()); } BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); V = LastInsertElem->getOperand(0); if (isa(V)) break; LastInsertElem = dyn_cast(V); if (!LastInsertElem || !LastInsertElem->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); return true; } /// Like findBuildVector, but looks for construction of aggregate. /// /// \return true if it matches. 
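Two small standalone models of the decisions above, using made-up cost numbers where the real code queries TargetTransformInfo: the first mirrors how getReductionCost picks the cheaper of the pairwise and splitting forms and reports it relative to the scalar chain being replaced; the second replays the pairwise shuffle ladder of emitReduction on a plain array.

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical per-operation costs; the real values depend on the target,
// the opcode and the vector type.
struct RdxCosts {
  int PairwiseVectorCost;  // cf. the IsPairwiseForm=true query above
  int SplittingVectorCost; // cf. the IsPairwiseForm=false query above
  int ScalarOpCost;        // cf. the per-scalar instruction cost
};

// Same shape as getReductionCost: choose the cheaper vector form and report it
// relative to the (ReduxWidth - 1) scalar operations it would replace.
// A negative result means the vector reduction is expected to pay off.
static int reductionCostDelta(const RdxCosts &C, unsigned ReduxWidth,
                              bool &IsPairwise) {
  IsPairwise = C.PairwiseVectorCost < C.SplittingVectorCost;
  int VecCost = IsPairwise ? C.PairwiseVectorCost : C.SplittingVectorCost;
  int ScalarCost = C.ScalarOpCost * static_cast<int>(ReduxWidth - 1);
  return VecCost - ScalarCost;
}

// Scalar model of the pairwise shuffle ladder in emitReduction: each step
// combines even lanes with odd lanes, halving the number of live lanes, so a
// width-N reduction takes log2(N) steps and leaves the result in lane 0,
// which the real code then extracts. Addition stands in for createOp.
static int pairwiseReduce(std::vector<int> Vec) {
  assert(!Vec.empty() && (Vec.size() & (Vec.size() - 1)) == 0 &&
         "power-of-two width expected, as asserted above");
  for (std::size_t Pairs = Vec.size() / 2; Pairs != 0; Pairs /= 2)
    for (std::size_t j = 0; j < Pairs; ++j)
      Vec[j] = Vec[2 * j] + Vec[2 * j + 1]; // "left shuffle" lane op "right shuffle" lane
  return Vec[0];
}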
static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl &BuildVectorOpds) { Value *V; do { BuildVectorOpds.push_back(IV->getInsertedValueOperand()); V = IV->getAggregateOperand(); if (isa(V)) break; IV = dyn_cast(V); if (!IV || !IV->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); return true; } static bool PhiTypeSorterFunc(Value *V, Value *V2) { return V->getType() < V2->getType(); } /// Try and get a reduction value from a phi node. /// /// Given a phi node \p P in a block \p ParentBB, consider possible reductions /// if they come from either \p ParentBB or a containing loop latch. /// /// \returns A candidate reduction value if possible, or \code nullptr \endcode /// if not possible. static Value *getReductionValue(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI) { // There are situations where the reduction value is not dominated by the // reduction phi. Vectorizing such cases has been reported to cause // miscompiles. See PR25787. auto DominatedReduxValue = [&](Value *R) { return isa(R) && DT->dominates(P->getParent(), cast(R)->getParent()); }; Value *Rdx = nullptr; // Return the incoming value if it comes from the same BB as the phi node. if (P->getIncomingBlock(0) == ParentBB) { Rdx = P->getIncomingValue(0); } else if (P->getIncomingBlock(1) == ParentBB) { Rdx = P->getIncomingValue(1); } if (Rdx && DominatedReduxValue(Rdx)) return Rdx; // Otherwise, check whether we have a loop latch to look at. Loop *BBL = LI->getLoopFor(ParentBB); if (!BBL) return nullptr; BasicBlock *BBLatch = BBL->getLoopLatch(); if (!BBLatch) return nullptr; // There is a loop latch, return the incoming value if it comes from // that. This reduction pattern occasionally turns up. if (P->getIncomingBlock(0) == BBLatch) { Rdx = P->getIncomingValue(0); } else if (P->getIncomingBlock(1) == BBLatch) { Rdx = P->getIncomingValue(1); } if (Rdx && DominatedReduxValue(Rdx)) return Rdx; return nullptr; } /// Attempt to reduce a horizontal reduction. /// If it is legal to match a horizontal reduction feeding the phi node \a P /// with reduction operators \a Root (or one of its operands) in a basic block /// \a BB, then check if it can be done. If horizontal reduction is not found /// and root instruction is a binary operation, vectorization of the operands is /// attempted. /// \returns true if a horizontal reduction was matched and reduced or operands /// of one of the binary instruction were vectorized. /// \returns false if a horizontal reduction was not matched (or not possible) /// or no vectorization of any binary operation feeding \a Root instruction was /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, const function_ref Vectorize) { if (!ShouldVectorizeHor) return false; if (!Root) return false; if (Root->getParent() != BB || isa(Root)) return false; // Start analysis starting from Root instruction. If horizontal reduction is // found, try to vectorize it. If it is not a horizontal reduction or // vectorization is not possible or not effective, and currently analyzed // instruction is a binary operation, try to vectorize the operands, using // pre-order DFS traversal order. If the operands were not vectorized, repeat // the same procedure considering each operand as a possible root of the // horizontal reduction. 
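A toy version of the chain walk shared by findBuildVector and findBuildAggregate above; InsertNode and its fields are illustrative stand-ins, not LLVM classes.

#include <algorithm>
#include <vector>

// Follow operand 0 back towards the initial undef value, collect the inserted
// scalars, insist that every intermediate value feeds only the chain, and
// reverse at the end so the scalar for position 0 comes first.
struct InsertNode {
  InsertNode *Prev = nullptr; // operand 0: the aggregate/vector being extended (null = undef)
  int Scalar = 0;             // operand 1: the inserted value
  bool HasOneUse = true;
};

static bool collectBuildChain(const InsertNode *Last, std::vector<int> &Opds) {
  for (const InsertNode *N = Last; N; N = N->Prev) {
    Opds.push_back(N->Scalar);
    if (!N->Prev)
      break;                  // reached the initial undef: the chain is complete
    if (!N->Prev->HasOneUse)
      return false;           // an intermediate value escapes the chain: give up
  }
  std::reverse(Opds.begin(), Opds.end());
  return true;
}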
// Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. SmallVector, 8> Stack(1, {Root, 0}); SmallPtrSet VisitedInstrs; bool Res = false; while (!Stack.empty()) { Value *V; unsigned Level; std::tie(V, Level) = Stack.pop_back_val(); if (!V) continue; auto *Inst = dyn_cast(V); if (!Inst) continue; auto *BI = dyn_cast(Inst); auto *SI = dyn_cast(Inst); if (BI || SI) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst)) { if (HorRdx.tryToReduce(R, TTI)) { Res = true; // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; continue; } } if (P && BI) { Inst = dyn_cast(BI->getOperand(0)); if (Inst == P) Inst = dyn_cast(BI->getOperand(1)); if (!Inst) { // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; continue; } } } // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; if (Vectorize(Inst, R)) { Res = true; continue; } // Try to vectorize operands. // Continue analysis for the instruction from the same basic block only to // save compile time. if (++Level < RecursionMaxDepth) for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast(Op)) if (!isa(I) && I->getParent() == BB) Stack.emplace_back(Op, Level); } return Res; } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { if (!V) return false; auto *I = dyn_cast(V); if (!I) return false; if (!isa(I)) P = nullptr; // Try to match and vectorize a horizontal reduction. auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, ExtraVectorization); } bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector BuildVectorOpds; if (!findBuildAggregate(IVI, BuildVectorOpds)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. return tryToVectorizeList(BuildVectorOpds, R); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector BuildVectorOpds; if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa(V); }) && isShuffle(BuildVectorOpds))) return false; // Vectorize starting with the build vector operands ignoring the BuildVector // instructions for the purpose of scheduling and user extraction. 
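Stripped of the LLVM types, the traversal in tryToVectorizeHorReductionOrInstOperands above has the following shape; Node, MaxDepth and TryVectorize are placeholders for the instruction operands, RecursionMaxDepth and the Vectorize callback.

#include <unordered_set>
#include <utility>
#include <vector>

// Explicit-stack pre-order walk with a visited set and a depth cap so the
// amount of work per root stays bounded.
struct Node {
  std::vector<Node *> Operands;
};

template <typename TryVectorizeFn>
static bool depthLimitedPreorderWalk(Node *Root, unsigned MaxDepth,
                                     TryVectorizeFn TryVectorize) {
  std::vector<std::pair<Node *, unsigned>> Stack{{Root, 0u}};
  std::unordered_set<Node *> Visited;
  bool Changed = false;
  while (!Stack.empty()) {
    auto [N, Level] = Stack.back();
    Stack.pop_back();
    if (!N)
      continue;
    if (TryVectorize(N)) {
      Changed = true; // handled: do not descend into this node's operands
      continue;
    }
    if (++Level < MaxDepth)
      for (Node *Op : N->Operands)
        if (Visited.insert(Op).second) // each operand is queued at most once
          Stack.emplace_back(Op, Level);
  }
  return Changed;
}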
return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, BoUpSLP &R) { if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) return true; bool OpsChanged = false; for (int Idx = 0; Idx < 2; ++Idx) { OpsChanged |= vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI); } return OpsChanged; } bool SLPVectorizerPass::vectorizeSimpleInstructions( SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; for (auto &VH : reverse(Instructions)) { auto *I = dyn_cast_or_null(VH); if (!I) continue; if (auto *LastInsertValue = dyn_cast(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); else if (auto *LastInsertElem = dyn_cast(I)) OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); else if (auto *CI = dyn_cast(I)) OpsChanged |= vectorizeCmpInst(CI, BB, R); } Instructions.clear(); return OpsChanged; } bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector Incoming; SmallPtrSet VisitedInstrs; bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { HaveVectorizedPhiNodes = false; // Collect the incoming values from the PHIs. Incoming.clear(); for (Instruction &I : *BB) { PHINode *P = dyn_cast(&I); if (!P) break; if (!VisitedInstrs.count(P)) Incoming.push_back(P); } // Sort by type. std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); // Try to vectorize elements base on their type. for (SmallVector::iterator IncIt = Incoming.begin(), E = Incoming.end(); IncIt != E;) { // Look for the next elements with the same type. SmallVector::iterator SameTypeIt = IncIt; while (SameTypeIt != E && (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } // Try to vectorize them. unsigned NumElts = (SameTypeIt - IncIt); LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); // The order in which the phi nodes appear in the program does not matter. // So allow tryToVectorizeList to reorder them if it is beneficial. This // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. bool AllowReorder = NumElts == 2; if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, /*UserCost=*/0, AllowReorder)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; break; } // Start over at the next instruction of a different type (or the end). IncIt = SameTypeIt; } } VisitedInstrs.clear(); SmallVector PostProcessInstructions; SmallDenseSet KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; it = BB->begin(); e = BB->end(); } continue; } if (isa(it)) continue; // Try to vectorize reductions that use PHINodes. if (PHINode *P = dyn_cast(it)) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() != 2) return Changed; // Try to match and vectorize a horizontal reduction. 
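The PHI handling at the top of vectorizeChainsInBlock above is a sort-then-group pattern; on plain data it reduces to the following, with a string key standing in for the IR type and OnRun standing in for tryToVectorizeList.

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Sort candidates by key, then hand every run of two or more equal-keyed
// items to the callback; runs of length one are skipped, as above.
template <typename OnRunFn>
static void forEachSameKeyRun(std::vector<std::pair<std::string, int>> Items,
                              OnRunFn OnRun) {
  std::stable_sort(Items.begin(), Items.end(),
                   [](const auto &A, const auto &B) { return A.first < B.first; });
  for (auto RunBegin = Items.begin(); RunBegin != Items.end();) {
    auto RunEnd = RunBegin;
    while (RunEnd != Items.end() && RunEnd->first == RunBegin->first)
      ++RunEnd;                // extend the run while the key (type) matches
    if (RunEnd - RunBegin > 1)
      OnRun(RunBegin, RunEnd); // a same-typed vectorization candidate group
    RunBegin = RunEnd;         // continue at the next key
  }
}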
if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, TTI)) { Changed = true; it = BB->begin(); e = BB->end(); continue; } continue; } // Ran into an instruction without users, like terminator, or function call // with ignored return value, store. Ignore unused instructions (basing on // instruction type, except for CallInst and InvokeInst). if (it->use_empty() && (it->getType()->isVoidTy() || isa(it) || isa(it))) { KeyNodes.insert(&*it); bool OpsChanged = false; if (ShouldStartVectorizeHorAtStore || !isa(it)) { for (auto *V : it->operand_values()) { // Try to match and vectorize a horizontal reduction. OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); } } // Start vectorization of post-process list of instructions from the // top-tree instructions to try to vectorize as many instructions as // possible. OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R); if (OpsChanged) { // We would like to start over since some instructions are deleted // and the iterator may become invalid value. Changed = true; it = BB->begin(); e = BB->end(); continue; } } if (isa(it) || isa(it) || isa(it)) PostProcessInstructions.push_back(&*it); } return Changed; } bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { auto Changed = false; for (auto &Entry : GEPs) { // If the getelementptr list has fewer than two elements, there's nothing // to do. if (Entry.second.size() < 2) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); // We process the getelementptr list in chunks of 16 (like we do for // stores) to minimize compile-time. for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { auto Len = std::min(BE - BI, 16); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a // SetVector here to preserve program order. If the index computations // are vectorizable and begin with loads, we want to minimize the chance // of having to reorder them later. SetVector Candidates(GEPList.begin(), GEPList.end()); // Some of the candidates may have already been vectorized after we // initially collected them. If so, the WeakTrackingVHs will have // nullified the // values, so remove them from the set of candidates. Candidates.remove(nullptr); // Remove from the set of candidates all pairs of getelementptrs with // constant differences. Such getelementptrs are likely not good // candidates for vectorization in a bottom-up phase since one can be // computed from the other. We also ensure all candidate getelementptr // indices are unique. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { auto *GEPI = cast(GEPList[I]); if (!Candidates.count(GEPI)) continue; auto *SCEVI = SE->getSCEV(GEPList[I]); for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { auto *GEPJ = cast(GEPList[J]); auto *SCEVJ = SE->getSCEV(GEPList[J]); if (isa(SE->getMinusSCEV(SCEVI, SCEVJ))) { Candidates.remove(GEPList[I]); Candidates.remove(GEPList[J]); } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { Candidates.remove(GEPList[J]); } } } // We break out of the above computation as soon as we know there are // fewer than two candidates remaining. if (Candidates.size() < 2) continue; // Add the single, non-constant index of each candidate to the bundle. We // ensured the indices met these constraints when we originally collected // the getelementptrs. 
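A rough standalone analogue of the candidate pruning above, assuming each getelementptr index splits into a symbolic part and a constant addend: equal symbolic parts mean the two indices differ by a constant, which is the condition the SCEV-based check rejects, so both candidates are dropped. The second, first-index-operand check is not modelled here.

#include <cstddef>
#include <set>
#include <string>
#include <utility>
#include <vector>

using IdxExpr = std::pair<std::string, int>; // (symbolic part, constant addend)

static std::vector<IdxExpr>
pruneGEPCandidates(const std::vector<IdxExpr> &Candidates) {
  std::set<std::size_t> Dropped;
  for (std::size_t I = 0; I < Candidates.size(); ++I)
    for (std::size_t J = I + 1; J < Candidates.size(); ++J)
      if (Candidates[I].first == Candidates[J].first) {
        Dropped.insert(I); // constant difference: one is computable from the other
        Dropped.insert(J);
      }
  std::vector<IdxExpr> Survivors;
  for (std::size_t I = 0; I < Candidates.size(); ++I)
    if (!Dropped.count(I))
      Survivors.push_back(Candidates[I]);
  return Survivors; // only these would be bundled and handed to the vectorizer
}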
SmallVector Bundle(Candidates.size()); auto BundleIndex = 0u; for (auto *V : Candidates) { auto *GEP = cast(V); auto *GEPIdx = GEP->idx_begin()->get(); assert(GEP->getNumIndices() == 1 || !isa(GEPIdx)); Bundle[BundleIndex++] = GEPIdx; } // Try and vectorize the indices. We are currently only interested in // gather-like cases of the form: // // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... // // where the loads of "a", the loads of "b", and the subtractions can be // performed in parallel. It's likely that detecting this pattern in a // bottom-up phase will be simpler and less costly than building a // full-blown top-down phase beginning at the consecutive loads. Changed |= tryToVectorizeList(Bundle, R); } } return Changed; } bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e; ++it) { if (it->second.size() < 2) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); // Process the stores in chunks of 16. // TODO: The limit of 16 inhibits greater vectorization factors. // For example, AVX2 supports v32i8. Increasing this limit, however, // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { unsigned Len = std::min(CE - CI, 16); Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); } } return Changed; } char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } Index: vendor/llvm/dist-release_70/test/Analysis/BasicAA/tail-byval.ll =================================================================== --- vendor/llvm/dist-release_70/test/Analysis/BasicAA/tail-byval.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/Analysis/BasicAA/tail-byval.ll (revision 338000) @@ -0,0 +1,15 @@ +; RUN: opt -basicaa -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s + +declare void @takebyval(i32* byval %p) + +define i32 @tailbyval() { +entry: + %p = alloca i32 + store i32 42, i32* %p + tail call void @takebyval(i32* byval %p) + %rv = load i32, i32* %p + ret i32 %rv +} +; FIXME: This should be Just Ref. 
+; CHECK-LABEL: Function: tailbyval: 1 pointers, 1 call sites
+; CHECK-NEXT: Both ModRef: Ptr: i32* %p <-> tail call void @takebyval(i32* byval %p)
Index: vendor/llvm/dist-release_70/test/BugPoint/compile-custom.ll
===================================================================
--- vendor/llvm/dist-release_70/test/BugPoint/compile-custom.ll (revision 337999)
+++ vendor/llvm/dist-release_70/test/BugPoint/compile-custom.ll (revision 338000)
@@ -1,12 +1,12 @@
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%python %/s.py arg1 arg2" --opt-command opt --output-prefix %t %s | FileCheck %s
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%/s.py arg1 arg2" --opt-command opt --output-prefix %t %s | FileCheck %s
; REQUIRES: loadable_module
; Test that arguments are correctly passed in --compile-command. The output
; of bugpoint includes the output of the custom tool, so we just echo the args
; in the tool and check here.
; CHECK: Error: arg1 arg2
define void @noop() {
  ret void
}
Index: vendor/llvm/dist-release_70/test/BugPoint/unsymbolized.ll
===================================================================
--- vendor/llvm/dist-release_70/test/BugPoint/unsymbolized.ll (revision 337999)
+++ vendor/llvm/dist-release_70/test/BugPoint/unsymbolized.ll (revision 338000)
@@ -1,23 +1,23 @@
; REQUIRES: loadable_module
; RUN: echo "import sys" > %t.py
; RUN: echo "print('args = ' + str(sys.argv))" >> %t.py
; RUN: echo "exit(1)" >> %t.py
-; RUN: not bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -opt-command="%python" -opt-args %t.py | FileCheck %s
+; RUN: not bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -opt-command=%python -opt-args %t.py | FileCheck %s
; RUN: not --crash opt -load %llvmshlibdir/BugpointPasses%shlibext %s -bugpoint-crashcalls -disable-symbolication 2>&1 | FileCheck --check-prefix=CRASH %s
; RUN: not bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -opt-command=%t.non.existent.opt.binary -opt-args %t.py 2>&1 | FileCheck %s --check-prefix=BAD-OPT
; Test that bugpoint disables symbolication on the opt tool to reduce runtime overhead when opt crashes
; CHECK: args = {{.*}}'-disable-symbolication'
; Test that opt, when it crashes & is passed -disable-symbolication, doesn't symbolicate.
; In theory this test should maybe be in test/tools/opt or
; test/Transforms, but since there doesn't seem to be another convenient way to
; crash opt, apart from the BugpointPasses dynamic plugin, this is the spot for
; now.
; CRASH-NOT: Signals.inc
; BAD-OPT: Specified `opt' binary does not exist: {{.*}}non.existent.opt.binary
define void @f() {
  call void @f()
  ret void
}
Index: vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
===================================================================
--- vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll (nonexistent)
+++ vendor/llvm/dist-release_70/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll (revision 338000)
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck %s
+
+; Test for ICE in SelectionDAG::computeKnownBits when visiting EXTRACT_SUBVECTOR
+; with DemandedElts already as wide as the source vector.
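As a plain-integer illustration of the situation this test exercises, and not the actual SelectionDAG code: a demanded-elements mask for an EXTRACT_SUBVECTOR result maps into source lanes by a shift, and when the subvector is already as wide as the source there is nothing left to widen. All names below are illustrative, and at most 64 source elements are assumed so a single bit mask suffices.

#include <cstdint>

// Result element j of EXTRACT_SUBVECTOR(src, Idx) is source element Idx + j,
// so the demanded mask of the result shifts by Idx to address source lanes.
// If the subvector covers the whole source, the mask is already source-wide.
static uint64_t demandedSrcElts(uint64_t DemandedSubElts, unsigned NumSubElts,
                                unsigned NumSrcElts, unsigned Idx) {
  if (NumSubElts == NumSrcElts)
    return DemandedSubElts;      // already as wide as the source vector
  return DemandedSubElts << Idx; // move the sub-mask into source lane space
}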
+ +define <3 x i32> @quux() #0 { +; CHECK-LABEL: quux: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 1 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %tmp = shufflevector <4 x i8> , <4 x i8> undef, <3 x i32> + %tmp1 = extractelement <3 x i8> %tmp, i64 0 + %tmp2 = zext i8 %tmp1 to i32 + %tmp3 = insertelement <3 x i32> undef, i32 %tmp2, i32 0 + %tmp4 = extractelement <3 x i8> %tmp, i64 1 + %tmp5 = zext i8 %tmp4 to i32 + %tmp6 = insertelement <3 x i32> %tmp3, i32 %tmp5, i32 1 + %tmp7 = extractelement <3 x i8> %tmp, i64 2 + %tmp8 = zext i8 %tmp7 to i32 + %tmp9 = insertelement <3 x i32> %tmp6, i32 %tmp8, i32 2 + %tmp10 = lshr <3 x i32> %tmp9, + ret <3 x i32> %tmp10 +} + +attributes #0 = { noinline optnone } Index: vendor/llvm/dist-release_70/test/CodeGen/ARM/inline-asm-operand-implicit-cast.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/ARM/inline-asm-operand-implicit-cast.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/CodeGen/ARM/inline-asm-operand-implicit-cast.ll (nonexistent) @@ -1,122 +0,0 @@ -; RUN: llc -mtriple armv7-arm-linux-gnueabihf -O2 -mcpu=cortex-a7 < %s | FileCheck %s - -; Check support for returning a float in GPR with soft float ABI -define arm_aapcscc float @zerobits_float_soft() #0 { -; CHECK-LABEL: zerobits_float_soft -; CHECK: mov r0, #0 - %1 = tail call float asm "mov ${0}, #0", "=&r"() - ret float %1 -} - -; Check support for returning a double in GPR with soft float ABI -define arm_aapcscc double @zerobits_double_soft() #0 { -; CHECK-LABEL: zerobits_double_soft -; CHECK: mov r0, #0 -; CHECK-NEXT: mov r1, #0 - %1 = tail call double asm "mov ${0:Q}, #0\0Amov ${0:R}, #0", "=&r"() - ret double %1 -} - -; Check support for returning a float in GPR with matching float input with -; soft float ABI -define arm_aapcscc float @flt_gpr_matching_in_op_soft(float %f) #0 { -; CHECK-LABEL: flt_gpr_matching_in_op_soft -; CHECK: mov r0, r0 - %1 = call float asm "mov $0, $1", "=&r,0"(float %f) - ret float %1 -} - -; Check support for returning a double in GPR with matching double input with -; soft float ABI -define arm_aapcscc double @dbl_gpr_matching_in_op_soft(double %d) #0 { -; CHECK-LABEL: dbl_gpr_matching_in_op_soft -; CHECK: mov r1, r0 - %1 = call double asm "mov ${0:R}, ${1:Q}", "=&r,0"(double %d) - ret double %1 -} - -; Check support for returning a float in specific GPR with matching float input -; with soft float ABI -define arm_aapcscc float @flt_gpr_matching_spec_reg_in_op_soft(float %f) #0 { -; CHECK-LABEL: flt_gpr_matching_spec_reg_in_op_soft -; CHECK: mov r3, r3 - %1 = call float asm "mov $0, $1", "=&{r3},0"(float %f) - ret float %1 -} - -; Check support for returning a double in specific GPR with matching double -; input with soft float ABI -define arm_aapcscc double @dbl_gpr_matching_spec_reg_in_op_soft(double %d) #0 { -; CHECK-LABEL: dbl_gpr_matching_spec_reg_in_op_soft -; CHECK: mov r3, r2 - %1 = call double asm "mov ${0:R}, ${1:Q}", "=&{r2},0"(double %d) - ret double %1 -} - -attributes #0 = { nounwind "target-features"="+d16,+vfp2,+vfp3,-fp-only-sp" "use-soft-float"="true" } - - -; Check support for returning a float in GPR with hard float ABI -define float @zerobits_float_hard() #1 { -; CHECK-LABEL: zerobits_float_hard -; CHECK: mov r0, #0 -; CHECK: vmov s0, r0 - %1 = tail call float asm "mov ${0}, #0", "=&r"() - ret float %1 -} - -; Check support for 
returning a double in GPR with hard float ABI -define double @zerobits_double_hard() #1 { -; CHECK-LABEL: zerobits_double_hard -; CHECK: mov r0, #0 -; CHECK-NEXT: mov r1, #0 -; CHECK: vmov d0, r0, r1 - %1 = tail call double asm "mov ${0:Q}, #0\0Amov ${0:R}, #0", "=&r"() - ret double %1 -} - -; Check support for returning a float in GPR with matching float input with -; hard float ABI -define float @flt_gpr_matching_in_op_hard(float %f) #1 { -; CHECK-LABEL: flt_gpr_matching_in_op_hard -; CHECK: vmov r0, s0 -; CHECK: mov r0, r0 -; CHECK: vmov s0, r0 - %1 = call float asm "mov $0, $1", "=&r,0"(float %f) - ret float %1 -} - -; Check support for returning a double in GPR with matching double input with -; hard float ABI -define double @dbl_gpr_matching_in_op_hard(double %d) #1 { -; CHECK-LABEL: dbl_gpr_matching_in_op_hard -; CHECK: vmov r0, r1, d0 -; CHECK: mov r1, r0 -; CHECK: vmov d0, r0, r1 - %1 = call double asm "mov ${0:R}, ${1:Q}", "=&r,0"(double %d) - ret double %1 -} - -; Check support for returning a float in specific GPR with matching float -; input with hard float ABI -define float @flt_gpr_matching_spec_reg_in_op_hard(float %f) #1 { -; CHECK-LABEL: flt_gpr_matching_spec_reg_in_op_hard -; CHECK: vmov r3, s0 -; CHECK: mov r3, r3 -; CHECK: vmov s0, r3 - %1 = call float asm "mov $0, $1", "=&{r3},0"(float %f) - ret float %1 -} - -; Check support for returning a double in specific GPR with matching double -; input with hard float ABI -define double @dbl_gpr_matching_spec_reg_in_op_hard(double %d) #1 { -; CHECK-LABEL: dbl_gpr_matching_spec_reg_in_op_hard -; CHECK: vmov r2, r3, d0 -; CHECK: mov r3, r2 -; CHECK: vmov d0, r2, r3 - %1 = call double asm "mov ${0:R}, ${1:Q}", "=&{r2},0"(double %d) - ret double %1 -} - -attributes #1 = { nounwind "target-features"="+d16,+vfp2,+vfp3,-fp-only-sp" "use-soft-float"="false" } Index: vendor/llvm/dist-release_70/test/CodeGen/ARM/inlineasm-operand-implicit-cast.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/ARM/inlineasm-operand-implicit-cast.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/CodeGen/ARM/inlineasm-operand-implicit-cast.ll (revision 338000) @@ -0,0 +1,307 @@ +; RUN: llc -mtriple armv7-arm-linux-gnueabihf -O2 -mcpu=cortex-a7 < %s | FileCheck %s + +%struct.twofloat = type { float, float } +%struct.twodouble = type { double, double } + +; Check support for returning a float in GPR with soft float ABI +define arm_aapcscc float @zerobits_float_soft() #0 { +; CHECK-LABEL: zerobits_float_soft +; CHECK: mov r0, #0 + %1 = tail call float asm "mov ${0}, #0", "=&r"() + ret float %1 +} + +; Check support for returning a double in GPR with soft float ABI +define arm_aapcscc double @zerobits_double_soft() #0 { +; CHECK-LABEL: zerobits_double_soft +; CHECK: mov r0, #0 +; CHECK-NEXT: mov r1, #0 + %1 = tail call double asm "mov ${0:Q}, #0\0Amov ${0:R}, #0", "=&r"() + ret double %1 +} + +; Check support for returning a float in GPR with matching float input with +; soft float ABI +define arm_aapcscc float @flt_gpr_matching_in_op_soft(float %f) #0 { +; CHECK-LABEL: flt_gpr_matching_in_op_soft +; CHECK: mov r0, r0 + %1 = call float asm "mov $0, $1", "=&r,0"(float %f) + ret float %1 +} + +; Check support for returning a double in GPR with matching double input with +; soft float ABI +define arm_aapcscc double @dbl_gpr_matching_in_op_soft(double %d) #0 { +; CHECK-LABEL: dbl_gpr_matching_in_op_soft +; CHECK: mov r1, r0 + %1 = call double asm "mov ${0:R}, ${1:Q}", "=&r,0"(double %d) + ret 
double %1 +} + +; Check support for returning a float in specific GPR with matching float input +; with soft float ABI +define arm_aapcscc float @flt_gpr_matching_spec_reg_in_op_soft(float %f) #0 { +; CHECK-LABEL: flt_gpr_matching_spec_reg_in_op_soft +; CHECK: mov r3, r3 + %1 = call float asm "mov $0, $1", "=&{r3},0"(float %f) + ret float %1 +} + +; Check support for returning a double in specific GPR with matching double +; input with soft float ABI +define arm_aapcscc double @dbl_gpr_matching_spec_reg_in_op_soft(double %d) #0 { +; CHECK-LABEL: dbl_gpr_matching_spec_reg_in_op_soft +; CHECK: mov r3, r2 + %1 = call double asm "mov ${0:R}, ${1:Q}", "=&{r2},0"(double %d) + ret double %1 +} + +; Check support for returning several float in GPR +define arm_aapcscc float @zerobits_float_convoluted_soft() #0 { +; CHECK-LABEL: zerobits_float_convoluted_soft +; CHECK: mov r0, #0 +; CHECK-NEXT: mov r1, #0 + %1 = call { float, float } asm "mov $0, #0; mov $1, #0", "=r,=r"() + %asmresult = extractvalue { float, float } %1, 0 + %asmresult1 = extractvalue { float, float } %1, 1 + %add = fadd float %asmresult, %asmresult1 + ret float %add +} + +; Check support for returning several double in GPR +define double @zerobits_double_convoluted_soft() #0 { +; CHECK-LABEL: zerobits_double_convoluted_soft +; CHECK: mov r0, #0 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: mov r3, #0 + %1 = call { double, double } asm "mov ${0:Q}, #0; mov ${0:R}, #0; mov ${1:Q}, #0; mov ${1:R}, #0", "=r,=r"() + %asmresult = extractvalue { double, double } %1, 0 + %asmresult1 = extractvalue { double, double } %1, 1 + %add = fadd double %asmresult, %asmresult1 + ret double %add +} + +; Check support for returning several floats in GPRs with matching float inputs +; with soft float ABI +define arm_aapcscc float @flt_gprs_matching_in_op_soft(float %f1, float %f2) #0 { +; CHECK-LABEL: flt_gprs_matching_in_op_soft +; CHECK: mov r0, r0 +; CHECK-NEXT: mov r1, r1 + %1 = call { float, float } asm "mov $0, $2; mov $1, $3", "=&r,=&r,0,1"(float %f1, float %f2) + %asmresult1 = extractvalue { float, float } %1, 0 + %asmresult2 = extractvalue { float, float } %1, 1 + %add = fadd float %asmresult1, %asmresult2 + ret float %add +} + +; Check support for returning several double in GPRs with matching double input +; with soft float ABI +define arm_aapcscc double @dbl_gprs_matching_in_op_soft(double %d1, double %d2) #0 { +; CHECK-LABEL: dbl_gprs_matching_in_op_soft +; CHECK: mov r1, r0 +; CHECK-NEXT: mov r3, r2 + %1 = call { double, double } asm "mov ${0:R}, ${2:Q}; mov ${1:R}, ${3:Q}", "=&r,=&r,0,1"(double %d1, double %d2) + %asmresult1 = extractvalue { double, double } %1, 0 + %asmresult2 = extractvalue { double, double } %1, 1 + %add = fadd double %asmresult1, %asmresult2 + ret double %add +} + +; Check support for returning several float in specific GPRs with matching +; float input with soft float ABI +define arm_aapcscc float @flt_gprs_matching_spec_reg_in_op_soft(float %f1, float %f2) #0 { +; CHECK-LABEL: flt_gprs_matching_spec_reg_in_op_soft +; CHECK: mov r3, r3 +; CHECK-NEXT: mov r4, r4 + %1 = call { float, float } asm "mov $0, $2; mov $1, $3", "=&{r3},=&{r4},0,1"(float %f1, float %f2) + %asmresult1 = extractvalue { float, float } %1, 0 + %asmresult2 = extractvalue { float, float } %1, 1 + %add = fadd float %asmresult1, %asmresult2 + ret float %add +} + +; Check support for returning several double in specific GPRs with matching +; double input with soft float ABI +define arm_aapcscc double 
@dbl_gprs_matching_spec_reg_in_op_soft(double %d1, double %d2) #0 { +; CHECK-LABEL: dbl_gprs_matching_spec_reg_in_op_soft +; CHECK: mov r3, r2 +; CHECK-NEXT: mov r5, r4 + %1 = call { double, double } asm "mov ${0:R}, ${2:Q}; mov ${1:R}, ${3:Q}", "=&{r2},=&{r4},0,1"(double %d1, double %d2) + %asmresult1 = extractvalue { double, double } %1, 0 + %asmresult2 = extractvalue { double, double } %1, 1 + %add = fadd double %asmresult1, %asmresult2 + ret double %add +} + +attributes #0 = { nounwind "target-features"="+d16,+vfp2,+vfp3,-fp-only-sp" "use-soft-float"="true" } + + +; Check support for returning a float in GPR with hard float ABI +define float @zerobits_float_hard() #1 { +; CHECK-LABEL: zerobits_float_hard +; CHECK: mov r0, #0 +; CHECK: vmov s0, r0 + %1 = tail call float asm "mov ${0}, #0", "=&r"() + ret float %1 +} + +; Check support for returning a double in GPR with hard float ABI +define double @zerobits_double_hard() #1 { +; CHECK-LABEL: zerobits_double_hard +; CHECK: mov r0, #0 +; CHECK-NEXT: mov r1, #0 +; CHECK: vmov d0, r0, r1 + %1 = tail call double asm "mov ${0:Q}, #0\0Amov ${0:R}, #0", "=&r"() + ret double %1 +} + +; Check support for returning a float in GPR with matching float input with +; hard float ABI +define float @flt_gpr_matching_in_op_hard(float %f) #1 { +; CHECK-LABEL: flt_gpr_matching_in_op_hard +; CHECK: vmov r0, s0 +; CHECK: mov r0, r0 +; CHECK: vmov s0, r0 + %1 = call float asm "mov $0, $1", "=&r,0"(float %f) + ret float %1 +} + +; Check support for returning a double in GPR with matching double input with +; hard float ABI +define double @dbl_gpr_matching_in_op_hard(double %d) #1 { +; CHECK-LABEL: dbl_gpr_matching_in_op_hard +; CHECK: vmov r0, r1, d0 +; CHECK: mov r1, r0 +; CHECK: vmov d0, r0, r1 + %1 = call double asm "mov ${0:R}, ${1:Q}", "=&r,0"(double %d) + ret double %1 +} + +; Check support for returning a float in specific GPR with matching float +; input with hard float ABI +define float @flt_gpr_matching_spec_reg_in_op_hard(float %f) #1 { +; CHECK-LABEL: flt_gpr_matching_spec_reg_in_op_hard +; CHECK: vmov r3, s0 +; CHECK: mov r3, r3 +; CHECK: vmov s0, r3 + %1 = call float asm "mov $0, $1", "=&{r3},0"(float %f) + ret float %1 +} + +; Check support for returning a double in specific GPR with matching double +; input with hard float ABI +define double @dbl_gpr_matching_spec_reg_in_op_hard(double %d) #1 { +; CHECK-LABEL: dbl_gpr_matching_spec_reg_in_op_hard +; CHECK: vmov r2, r3, d0 +; CHECK: mov r3, r2 +; CHECK: vmov d0, r2, r3 + %1 = call double asm "mov ${0:R}, ${1:Q}", "=&{r2},0"(double %d) + ret double %1 +} + +; Check support for returning several float in GPR +define %struct.twofloat @zerobits_float_convoluted_hard() #1 { +; CHECK-LABEL: zerobits_float_convoluted_hard +; CHECK: mov r0, #0 +; CHECK-NEXT: mov r1, #0 +; CHECK: vmov s0, r0 +; CHECK-NEXT: vmov s1, r1 + %1 = call { float, float } asm "mov $0, #0; mov $1, #0", "=r,=r"() + %asmresult1 = extractvalue { float, float } %1, 0 + %asmresult2 = extractvalue { float, float } %1, 1 + %partialres = insertvalue %struct.twofloat undef, float %asmresult1, 0 + %res = insertvalue %struct.twofloat %partialres, float %asmresult2, 1 + ret %struct.twofloat %res +} + +; Check support for returning several double in GPR +define %struct.twodouble @zerobits_double_convoluted_hard() #1 { +; CHECK-LABEL: zerobits_double_convoluted_hard +; CHECK: mov r0, #0 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: mov r3, #0 +; CHECK: vmov d0, r0, r1 +; CHECK-NEXT: vmov d1, r2, r3 + %1 = call { double, 
double } asm "mov ${0:Q}, #0; mov ${0:R}, #0; mov ${1:Q}, #0; mov ${1:R}, #0", "=r,=r"() + %asmresult1 = extractvalue { double, double } %1, 0 + %asmresult2 = extractvalue { double, double } %1, 1 + %partialres = insertvalue %struct.twodouble undef, double %asmresult1, 0 + %res = insertvalue %struct.twodouble %partialres, double %asmresult2, 1 + ret %struct.twodouble %res +} + +; Check support for returning several floats in GPRs with matching float inputs +; with hard float ABI +define %struct.twofloat @flt_gprs_matching_in_op_hard(float %f1, float %f2) #1 { +; CHECK-LABEL: flt_gprs_matching_in_op_hard +; CHECK: vmov r0, s0 +; CHECK-NEXT: vmov r1, s1 +; CHECK: mov r0, r0 +; CHECK-NEXT: mov r1, r1 +; CHECK: vmov s0, r0 +; CHECK-NEXT: vmov s1, r1 + %1 = call { float, float } asm "mov $0, $2; mov $1, $3", "=&r,=&r,0,1"(float %f1, float %f2) + %asmresult1 = extractvalue { float, float } %1, 0 + %asmresult2 = extractvalue { float, float } %1, 1 + %partialres = insertvalue %struct.twofloat undef, float %asmresult1, 0 + %res = insertvalue %struct.twofloat %partialres, float %asmresult2, 1 + ret %struct.twofloat %res +} + +; Check support for returning several double in GPRs with matching double input +; with hard float ABI +define %struct.twodouble @dbl_gprs_matching_in_op_hard(double %d1, double %d2) #1 { +; CHECK-LABEL: dbl_gprs_matching_in_op_hard +; CHECK: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK: mov r1, r0 +; CHECK-NEXT: mov r3, r2 +; CHECK: vmov d0, r0, r1 +; CHECK-NEXT: vmov d1, r2, r3 + %1 = call { double, double } asm "mov ${0:R}, ${2:Q}; mov ${1:R}, ${3:Q}", "=&r,=&r,0,1"(double %d1, double %d2) + %asmresult1 = extractvalue { double, double } %1, 0 + %asmresult2 = extractvalue { double, double } %1, 1 + %partialres = insertvalue %struct.twodouble undef, double %asmresult1, 0 + %res = insertvalue %struct.twodouble %partialres, double %asmresult2, 1 + ret %struct.twodouble %res +} + +; Check support for returning several float in specific GPRs with matching +; float input with hard float ABI +define %struct.twofloat @flt_gprs_matching_spec_reg_in_op_hard(float %f1, float %f2) #1 { +; CHECK-LABEL: flt_gprs_matching_spec_reg_in_op_hard +; CHECK: vmov r3, s0 +; CHECK-NEXT: vmov r4, s1 +; CHECK: mov r3, r3 +; CHECK-NEXT: mov r4, r4 +; CHECK: vmov s0, r3 +; CHECK-NEXT: vmov s1, r4 + %1 = call { float, float } asm "mov $0, $2; mov $1, $3", "=&{r3},=&{r4},0,1"(float %f1, float %f2) + %asmresult1 = extractvalue { float, float } %1, 0 + %asmresult2 = extractvalue { float, float } %1, 1 + %partialres = insertvalue %struct.twofloat undef, float %asmresult1, 0 + %res = insertvalue %struct.twofloat %partialres, float %asmresult2, 1 + ret %struct.twofloat %res +} + +; Check support for returning several double in specific GPRs with matching +; double input with hard float ABI +define %struct.twodouble @dbl_gprs_matching_spec_reg_in_op_hard(double %d1, double %d2) #1 { +; CHECK-LABEL: dbl_gprs_matching_spec_reg_in_op_hard +; CHECK: vmov r2, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK: mov r3, r2 +; CHECK-NEXT: mov r5, r4 +; CHECK: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r4, r5 + %1 = call { double, double } asm "mov ${0:R}, ${2:Q}; mov ${1:R}, ${3:Q}", "=&{r2},=&{r4},0,1"(double %d1, double %d2) + %asmresult1 = extractvalue { double, double } %1, 0 + %asmresult2 = extractvalue { double, double } %1, 1 + %partialres = insertvalue %struct.twodouble undef, double %asmresult1, 0 + %res = insertvalue %struct.twodouble %partialres, double %asmresult2, 1 + ret %struct.twodouble %res +} + 
+attributes #1 = { nounwind "target-features"="+d16,+vfp2,+vfp3,-fp-only-sp" "use-soft-float"="false" } Index: vendor/llvm/dist-release_70/test/CodeGen/PowerPC/pr38087.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/PowerPC/pr38087.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/CodeGen/PowerPC/pr38087.ll (revision 338000) @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc64le-unknown-unknown -ppc-asm-full-reg-names < %s | \ +; RUN: FileCheck %s +; Function Attrs: nounwind readnone speculatable +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 + +; Function Attrs: nounwind readnone speculatable +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0 + +define void @draw_llvm_vs_variant0() { +; CHECK-LABEL: draw_llvm_vs_variant0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ldx r3, 0, r3 +; CHECK-NEXT: mtvsrd f0, r3 +; CHECK-NEXT: xxswapd v2, vs0 +; CHECK-NEXT: vmrglh v2, v2, v2 +; CHECK-NEXT: vextsh2w v2, v2 +; CHECK-NEXT: xvcvsxwsp vs0, v2 +; CHECK-NEXT: xxspltw vs0, vs0, 2 +; CHECK-NEXT: xvmaddasp vs0, vs0, vs0 +; CHECK-NEXT: stxvx vs0, 0, r3 +; CHECK-NEXT: blr +entry: + %.size = load i32, i32* undef + %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %.size, i32 7) + %1 = extractvalue { i32, i1 } %0, 0 + %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %1, i32 0) + %3 = extractvalue { i32, i1 } %2, 0 + %4 = select i1 false, i32 0, i32 %3 + %5 = xor i1 false, true + %6 = sext i1 %5 to i32 + %7 = load <4 x i16>, <4 x i16>* undef, align 2 + %8 = extractelement <4 x i16> %7, i32 0 + %9 = sext i16 %8 to i32 + %10 = insertelement <4 x i32> undef, i32 %9, i32 0 + %11 = extractelement <4 x i16> %7, i32 1 + %12 = sext i16 %11 to i32 + %13 = insertelement <4 x i32> %10, i32 %12, i32 1 + %14 = extractelement <4 x i16> %7, i32 2 + %15 = sext i16 %14 to i32 + %16 = insertelement <4 x i32> %13, i32 %15, i32 2 + %17 = extractelement <4 x i16> %7, i32 3 + %18 = sext i16 %17 to i32 + %19 = insertelement <4 x i32> %16, i32 %18, i32 3 + %20 = sitofp <4 x i32> %19 to <4 x float> + %21 = insertelement <4 x i32> undef, i32 %6, i32 0 + %22 = shufflevector <4 x i32> %21, <4 x i32> undef, <4 x i32> zeroinitializer + %23 = bitcast <4 x float> %20 to <4 x i32> + %24 = and <4 x i32> %23, %22 + %25 = bitcast <4 x i32> %24 to <4 x float> + %26 = shufflevector <4 x float> %25, <4 x float> undef, <4 x i32> + %27 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> %26) + store <4 x float> %27, <4 x float>* undef + ret void +} Index: vendor/llvm/dist-release_70/test/CodeGen/X86/flags-copy-lowering.mir =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/X86/flags-copy-lowering.mir (revision 337999) +++ vendor/llvm/dist-release_70/test/CodeGen/X86/flags-copy-lowering.mir (revision 338000) @@ -1,1057 +1,1057 @@ # RUN: llc -run-pass x86-flags-copy-lowering -verify-machineinstrs -o - %s | FileCheck %s # # Lower various interesting copy patterns of EFLAGS without using LAHF/SAHF. 
--- | target triple = "x86_64-unknown-unknown" declare void @foo() define i32 @test_branch(i64 %a, i64 %b) { entry: call void @foo() ret i32 0 } define i32 @test_branch_fallthrough(i64 %a, i64 %b) { entry: call void @foo() ret i32 0 } define void @test_setcc(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_cmov(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_adc(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_sbb(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_adcx(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_adox(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_rcl(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_rcr(i64 %a, i64 %b) { entry: call void @foo() ret void } define void @test_setb_c(i64 %a, i64 %b) { entry: call void @foo() ret void } define i64 @test_branch_with_livein_and_kill(i64 %a, i64 %b) { entry: call void @foo() ret i64 0 } define i64 @test_branch_with_interleaved_livein_and_kill(i64 %a, i64 %b) { entry: call void @foo() ret i64 0 } define i64 @test_mid_cycle_copies(i64 %a, i64 %b) { entry: call void @foo() ret i64 0 } define i32 @test_existing_setcc(i64 %a, i64 %b) { entry: call void @foo() ret i32 0 } define i32 @test_existing_setcc_memory(i64 %a, i64 %b) { entry: call void @foo() ret i32 0 } ... --- name: test_branch # CHECK-LABEL: name: test_branch liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1, %bb.2, %bb.3 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 JA_1 %bb.1, implicit $eflags JB_1 %bb.2, implicit $eflags JMP_1 %bb.3 ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: {{.*$}} ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.3 bb.1: %3:gr32 = MOV32ri64 42 $eax = COPY %3 RET 0, $eax bb.2: %4:gr32 = MOV32ri64 43 $eax = COPY %4 RET 0, $eax bb.3: %5:gr32 = MOV32r0 implicit-def dead $eflags $eax = COPY %5 RET 0, $eax ... 
--- name: test_branch_fallthrough # CHECK-LABEL: name: test_branch_fallthrough liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1, %bb.2, %bb.3 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 JA_1 %bb.2, implicit $eflags JB_1 %bb.3, implicit $eflags ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: {{.*$}} ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.3, implicit killed $eflags ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: bb.1: bb.1: %5:gr32 = MOV32r0 implicit-def dead $eflags $eax = COPY %5 RET 0, $eax bb.2: %3:gr32 = MOV32ri64 42 $eax = COPY %3 RET 0, $eax bb.3: %4:gr32 = MOV32ri64 43 $eax = COPY %4 RET 0, $eax ... --- name: test_setcc # CHECK-LABEL: name: test_setcc liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags ; CHECK-NEXT: %[[NE_REG:[^:]*]]:gr8 = SETNEr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 %3:gr8 = SETAr implicit $eflags %4:gr8 = SETBr implicit $eflags %5:gr8 = SETEr implicit $eflags SETNEm $rsp, 1, $noreg, -16, $noreg, implicit killed $eflags MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %3 MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %4 MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %5 ; CHECK-NOT: $eflags = ; CHECK-NOT: = SET{{.*}} ; CHECK: MOV8mr {{.*}}, killed %[[A_REG]] ; CHECK-NEXT: MOV8mr {{.*}}, killed %[[B_REG]] ; CHECK-NEXT: MOV8mr {{.*}}, killed %[[E_REG]] ; CHECK-NOT: MOV8mr {{.*}}, killed %[[NE_REG]] RET 0 ... 
--- name: test_cmov # CHECK-LABEL: name: test_cmov liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 %3:gr64 = CMOVA64rr %0, %1, implicit $eflags %4:gr64 = CMOVB64rr %0, %1, implicit $eflags %5:gr64 = CMOVE64rr %0, %1, implicit $eflags %6:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: %3:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NEXT: TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NEXT: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NEXT: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: %6:gr64 = CMOVE64rr %0, %1, implicit killed $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3 MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4 MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5 MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6 RET 0 ... --- name: test_adc # CHECK-LABEL: name: test_adc liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = ADD64rr %0, %1, implicit-def $eflags %3:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr64 = ADC64ri32 %2:gr64, 42, implicit-def $eflags, implicit $eflags %5:gr64 = ADC64ri32 %4:gr64, 42, implicit-def $eflags, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[CF_REG]], 255, implicit-def $eflags ; CHECK-NEXT: %4:gr64 = ADC64ri32 %2, 42, implicit-def $eflags, implicit killed $eflags ; CHECK-NEXT: %5:gr64 = ADC64ri32 %4, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5 RET 0 ... 
--- name: test_sbb # CHECK-LABEL: name: test_sbb liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = SUB64rr %0, %1, implicit-def $eflags %3:gr64 = COPY killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr64 = SBB64ri32 %2:gr64, 42, implicit-def $eflags, implicit killed $eflags %5:gr64 = SBB64ri32 %4:gr64, 42, implicit-def dead $eflags, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[CF_REG]], 255, implicit-def $eflags ; CHECK-NEXT: %4:gr64 = SBB64ri32 %2, 42, implicit-def $eflags, implicit killed $eflags ; CHECK-NEXT: %5:gr64 = SBB64ri32 %4, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5 RET 0 ... --- name: test_adcx # CHECK-LABEL: name: test_adcx liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = ADD64rr %0, %1, implicit-def $eflags %3:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags ; CHECK-NEXT: %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr64 = CMOVE64rr %0, %1, implicit $eflags %5:gr64 = MOV64ri32 42 %6:gr64 = ADCX64rr %2, %5, implicit-def $eflags, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NEXT: %5:gr64 = MOV64ri32 42 ; CHECK-NEXT: dead %{{[^:]*}}:gr8 = ADD8ri %[[CF_REG]], 255, implicit-def $eflags ; CHECK-NEXT: %6:gr64 = ADCX64rr %2, %5, implicit-def{{( dead)?}} $eflags, implicit killed $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4 MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6 RET 0 ... 
--- name: test_adox # CHECK-LABEL: name: test_adox liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = ADD64rr %0, %1, implicit-def $eflags %3:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags ; CHECK-NEXT: %[[OF_REG:[^:]*]]:gr8 = SETOr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr64 = CMOVE64rr %0, %1, implicit $eflags %5:gr64 = MOV64ri32 42 %6:gr64 = ADOX64rr %2, %5, implicit-def $eflags, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NEXT: %5:gr64 = MOV64ri32 42 ; CHECK-NEXT: dead %{{[^:]*}}:gr8 = ADD8ri %[[OF_REG]], 127, implicit-def $eflags ; CHECK-NEXT: %6:gr64 = ADOX64rr %2, %5, implicit-def{{( dead)?}} $eflags, implicit killed $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4 MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6 RET 0 ... --- name: test_rcl # CHECK-LABEL: name: test_rcl liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = ADD64rr %0, %1, implicit-def $eflags %3:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr64 = RCL64r1 %2:gr64, implicit-def $eflags, implicit $eflags %5:gr64 = RCL64r1 %4:gr64, implicit-def $eflags, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[CF_REG]], 255, implicit-def $eflags ; CHECK-NEXT: %4:gr64 = RCL64r1 %2, implicit-def $eflags, implicit killed $eflags ; CHECK-NEXT: %5:gr64 = RCL64r1 %4, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5 RET 0 ... 
--- name: test_rcr # CHECK-LABEL: name: test_rcr liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = ADD64rr %0, %1, implicit-def $eflags %3:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr64 = RCR64r1 %2:gr64, implicit-def $eflags, implicit $eflags %5:gr64 = RCR64r1 %4:gr64, implicit-def $eflags, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[CF_REG]], 255, implicit-def $eflags ; CHECK-NEXT: %4:gr64 = RCR64r1 %2, implicit-def $eflags, implicit killed $eflags ; CHECK-NEXT: %5:gr64 = RCR64r1 %4, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5 RET 0 ... --- name: test_setb_c # CHECK-LABEL: name: test_setb_c liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi %2:gr64 = ADD64rr %0, %1, implicit-def $eflags %3:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[CF_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %3 %4:gr8 = SETB_C8r implicit-def $eflags, implicit $eflags MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %4 ; CHECK-NOT: $eflags = ; CHECK: %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags - ; CHECK-NEXT: %[[ZERO_SUBREG:[^:]*]]:gr8 = EXTRACT_SUBREG %[[ZERO]], %subreg.sub_8bit + ; CHECK-NEXT: %[[ZERO_SUBREG:[^:]*]]:gr8 = COPY %[[ZERO]].sub_8bit ; CHECK-NEXT: %[[REPLACEMENT:[^:]*]]:gr8 = SUB8rr %[[ZERO_SUBREG]], %[[CF_REG]] ; CHECK-NEXT: MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]] $eflags = COPY %3 %5:gr16 = SETB_C16r implicit-def $eflags, implicit $eflags MOV16mr $rsp, 1, $noreg, -16, $noreg, killed %5 ; CHECK-NOT: $eflags = ; CHECK: %[[CF_EXT:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]] - ; CHECK-NEXT: %[[CF_TRUNC:[^:]*]]:gr16 = EXTRACT_SUBREG %[[CF_EXT]], %subreg.sub_16bit + ; CHECK-NEXT: %[[CF_TRUNC:[^:]*]]:gr16 = COPY %[[CF_EXT]].sub_16bit ; CHECK-NEXT: %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags - ; CHECK-NEXT: %[[ZERO_SUBREG:[^:]*]]:gr16 = EXTRACT_SUBREG %[[ZERO]], %subreg.sub_16bit + ; CHECK-NEXT: %[[ZERO_SUBREG:[^:]*]]:gr16 = COPY %[[ZERO]].sub_16bit ; CHECK-NEXT: %[[REPLACEMENT:[^:]*]]:gr16 = SUB16rr %[[ZERO_SUBREG]], %[[CF_TRUNC]] ; CHECK-NEXT: MOV16mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]] $eflags = COPY %3 %6:gr32 = SETB_C32r implicit-def $eflags, implicit $eflags MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %6 ; CHECK-NOT: $eflags = ; CHECK: %[[CF_EXT:[^:]*]]:gr32 = 
MOVZX32rr8 %[[CF_REG]] ; CHECK-NEXT: %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags ; CHECK-NEXT: %[[REPLACEMENT:[^:]*]]:gr32 = SUB32rr %[[ZERO]], %[[CF_EXT]] ; CHECK-NEXT: MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]] $eflags = COPY %3 %7:gr64 = SETB_C64r implicit-def $eflags, implicit $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7 ; CHECK-NOT: $eflags = ; CHECK: %[[CF_EXT1:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]] ; CHECK-NEXT: %[[CF_EXT2:[^:]*]]:gr64 = SUBREG_TO_REG 0, %[[CF_EXT1]], %subreg.sub_32bit ; CHECK-NEXT: %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags ; CHECK-NEXT: %[[ZERO_EXT:[^:]*]]:gr64 = SUBREG_TO_REG 0, %[[ZERO]], %subreg.sub_32bit ; CHECK-NEXT: %[[REPLACEMENT:[^:]*]]:gr64 = SUB64rr %[[ZERO_EXT]], %[[CF_EXT2]] ; CHECK-NEXT: MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]] RET 0 ... --- name: test_branch_with_livein_and_kill # CHECK-LABEL: name: test_branch_with_livein_and_kill liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1, %bb.2, %bb.3 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[S_REG:[^:]*]]:gr8 = SETSr implicit $eflags ; CHECK-NEXT: %[[NE_REG:[^:]*]]:gr8 = SETNEr implicit $eflags ; CHECK-NEXT: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 JA_1 %bb.1, implicit $eflags JB_1 %bb.2, implicit $eflags JMP_1 %bb.3 ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: {{.*$}} ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.3 bb.1: liveins: $eflags %3:gr64 = CMOVE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[NE_REG]], %[[NE_REG]], implicit-def $eflags ; CHECK-NEXT: %3:gr64 = CMOVE64rr %0, %1, implicit killed $eflags $rax = COPY %3 RET 0, $rax bb.2: liveins: $eflags %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[NE_REG]], %[[NE_REG]], implicit-def $eflags ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags $rax = COPY %4 RET 0, $rax bb.3: liveins: $eflags %5:gr64 = CMOVS64rr %0, %1, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[S_REG]], %[[S_REG]], implicit-def $eflags ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags $rax = COPY %5 RET 0, $rax ... 
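# (Editorial reader note on the branch rewrites checked above: each saved
# condition byte holds 0 or 1, so `TEST8rr %b, %b` sets ZF exactly when the
# byte is 0. The original JA/JB/CMOVcc users are therefore rewritten to
# JNE/CMOVNE on that recomputed ZF, so the branch or select fires precisely
# when the originally captured condition held.)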
--- name: test_branch_with_interleaved_livein_and_kill # CHECK-LABEL: name: test_branch_with_interleaved_livein_and_kill liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1, %bb.2, %bb.5 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr64 = COPY $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[S_REG:[^:]*]]:gr8 = SETSr implicit $eflags ; CHECK-NEXT: %[[P_REG:[^:]*]]:gr8 = SETPr implicit $eflags ; CHECK-NEXT: %[[NE_REG:[^:]*]]:gr8 = SETNEr implicit $eflags ; CHECK-NEXT: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NEXT: %[[O_REG:[^:]*]]:gr8 = SETOr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 JA_1 %bb.1, implicit $eflags JB_1 %bb.2, implicit $eflags JMP_1 %bb.5 ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: {{.*$}} ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.5 bb.1: liveins: $eflags %3:gr64 = CMOVE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[NE_REG]], %[[NE_REG]], implicit-def $eflags ; CHECK-NEXT: %3:gr64 = CMOVE64rr %0, %1, implicit killed $eflags $rax = COPY %3 RET 0, $rax bb.2: ; The goal is to have another batch of successors discovered in a block ; between two successors which kill $eflags. This ensures that neither of ; the surrounding kills impact recursing through this block. successors: %bb.3, %bb.4 liveins: $eflags JO_1 %bb.3, implicit $eflags JMP_1 %bb.4 ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[O_REG]], %[[O_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.3, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.4 bb.3: liveins: $eflags %4:gr64 = CMOVNE64rr %0, %1, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[NE_REG]], %[[NE_REG]], implicit-def $eflags ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags $rax = COPY %4 RET 0, $rax bb.4: liveins: $eflags %5:gr64 = CMOVP64rr %0, %1, implicit $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[P_REG]], %[[P_REG]], implicit-def $eflags ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags $rax = COPY %5 RET 0, $rax bb.5: liveins: $eflags %6:gr64 = CMOVS64rr %0, %1, implicit killed $eflags ; CHECK-NOT: $eflags = ; CHECK: TEST8rr %[[S_REG]], %[[S_REG]], implicit-def $eflags ; CHECK-NEXT: %6:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags $rax = COPY %6 RET 0, $rax ... --- # This test case is designed to exercise a particularly challenging situation: # when the flags are copied and restored *inside* of a complex and cyclic CFG # all of which have live-in flags. To correctly handle this case we have to walk # up the dominator tree and locate a viable reaching definition location, # checking for clobbers along any path. 
The CFG for this function looks like the # following diagram, control flowing out the bottom of blocks and in the top: # # bb.0 # | __________________ # |/ \ # bb.1 | # |\_________ | # | __ \ ____ | # |/ \ |/ \ | # bb.2 | bb.4 | | # |\__/ / \ | | # | / \ | | # bb.3 bb.5 bb.6 | | # | \ / | | # | \ / | | # | bb.7 | | # | ________/ \____/ | # |/ | # bb.8 | # |\__________________/ # | # bb.9 # # We set EFLAGS in bb.0, clobber them in bb.3, and copy them in bb.2 and bb.6. # Because of the cycles this requires hoisting the `SETcc` instructions to # capture the flags for the bb.6 copy to bb.1 and using them for the copy in # `bb.2` as well despite the clobber in `bb.3`. The clobber in `bb.3` also # prevents hoisting the `SETcc`s up to `bb.0`. # # Throughout the test we use branch instructions that are totally bogus (as the # flags are obviously not changing!) but this is just to allow us to send # a small but complex CFG structure through the backend and force it to choose # plausible lowering decisions based on the core CFG presented, regardless of # the futility of the actual branches. name: test_mid_cycle_copies # CHECK-LABEL: name: test_mid_cycle_copies liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags ; CHECK: bb.0: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: CMP64rr %0, %1, implicit-def $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags JMP_1 %bb.1 bb.1: successors: %bb.2, %bb.4 liveins: $eflags ; Outer loop header, target for one set of hoisting. JE_1 %bb.2, implicit $eflags JMP_1 %bb.4 ; CHECK: bb.1: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags bb.2: successors: %bb.2, %bb.3 liveins: $eflags ; Inner loop with a local copy. We should eliminate this but can't hoist. %2:gr64 = COPY $eflags $eflags = COPY %2 JE_1 %bb.2, implicit $eflags JMP_1 %bb.3 ; CHECK: bb.2: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags bb.3: successors: %bb.8 liveins: $eflags ; Use and then clobber $eflags. Then hop to the outer loop latch. %3:gr64 = ADC64ri32 %0, 42, implicit-def dead $eflags, implicit $eflags ; CHECK: bb.3: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[B_REG]], 255, implicit-def $eflags ; CHECK-NEXT: %3:gr64 = ADC64ri32 %0, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3 JMP_1 %bb.8 bb.4: successors: %bb.5, %bb.6 liveins: $eflags ; Another inner loop, this one with a diamond. JE_1 %bb.5, implicit $eflags JMP_1 %bb.6 ; CHECK: bb.4: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.5, implicit killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags bb.5: successors: %bb.7 liveins: $eflags ; Just use $eflags on this side of the diamond. 
%4:gr64 = CMOVA64rr %0, %1, implicit $eflags ; CHECK: bb.5: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4 JMP_1 %bb.7 bb.6: successors: %bb.7 liveins: $eflags ; Use, copy, and then use $eflags again. %5:gr64 = CMOVA64rr %0, %1, implicit $eflags ; CHECK: bb.6: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5 %6:gr64 = COPY $eflags $eflags = COPY %6:gr64 %7:gr64 = CMOVA64rr %0, %1, implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: %7:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7 JMP_1 %bb.7 bb.7: successors: %bb.4, %bb.8 liveins: $eflags ; Inner loop latch. JE_1 %bb.4, implicit $eflags JMP_1 %bb.8 ; CHECK: bb.7: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.4, implicit killed $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags bb.8: successors: %bb.1, %bb.9 ; Outer loop latch. Note that we cannot have EFLAGS live-in here as that ; immediately require PHIs. CMP64rr %0, %1, implicit-def $eflags JE_1 %bb.1, implicit $eflags JMP_1 %bb.9 ; CHECK: bb.8: ; CHECK-NOT: COPY{{( killed)?}} $eflags ; CHECK: CMP64rr %0, %1, implicit-def $eflags ; CHECK-NEXT: JE_1 %bb.1, implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags bb.9: liveins: $eflags ; And we're done. %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags $rax = COPY %8 RET 0, $rax ; CHECK: bb.9: ; CHECK-NOT: $eflags ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags ... 
--- name: test_existing_setcc # CHECK-LABEL: name: test_existing_setcc liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1, %bb.2, %bb.3 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags %2:gr8 = SETAr implicit $eflags %3:gr8 = SETAEr implicit $eflags %4:gr64 = COPY $eflags ; CHECK: CMP64rr %0, %1, implicit-def $eflags ; CHECK-NEXT: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags ; CHECK-NEXT: %[[AE_REG:[^:]*]]:gr8 = SETAEr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %4 JA_1 %bb.1, implicit $eflags JB_1 %bb.2, implicit $eflags JMP_1 %bb.3 ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: {{.*$}} ; CHECK-SAME: {{$[[:space:]]}} ; CHECK-NEXT: TEST8rr %[[AE_REG]], %[[AE_REG]], implicit-def $eflags ; CHECK-NEXT: JE_1 %bb.2, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.3 bb.1: %5:gr32 = MOV32ri64 42 $eax = COPY %5 RET 0, $eax bb.2: %6:gr32 = MOV32ri64 43 $eax = COPY %6 RET 0, $eax bb.3: %7:gr32 = MOV32r0 implicit-def dead $eflags $eax = COPY %7 RET 0, $eax ... --- name: test_existing_setcc_memory # CHECK-LABEL: name: test_existing_setcc_memory liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } body: | bb.0: successors: %bb.1, %bb.2 liveins: $rdi, $rsi %0:gr64 = COPY $rdi %1:gr64 = COPY $rsi CMP64rr %0, %1, implicit-def $eflags SETEm %0, 1, $noreg, -16, $noreg, implicit $eflags %2:gr64 = COPY $eflags ; CHECK: CMP64rr %0, %1, implicit-def $eflags ; We cannot reuse this SETE because it stores the flag directly to memory, ; so we have two SETEs here. FIXME: It'd be great if something could fold ; these automatically. If not, maybe we want to unfold SETcc instructions ; writing to memory so we can reuse them. ; CHECK-NEXT: SETEm {{.*}} implicit $eflags ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags ; CHECK-NOT: COPY{{( killed)?}} $eflags ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $eflags = COPY %2 JE_1 %bb.1, implicit $eflags JMP_1 %bb.2 ; CHECK-NOT: $eflags = ; ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.2 bb.1: %3:gr32 = MOV32ri64 42 $eax = COPY %3 RET 0, $eax bb.2: %4:gr32 = MOV32ri64 43 $eax = COPY %4 RET 0, $eax ... 
Index: vendor/llvm/dist-release_70/test/CodeGen/X86/pr38533.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/X86/pr38533.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/CodeGen/X86/pr38533.ll (revision 338000) @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 + +; This test makes sure that a vector that needs to be promoted that is bitcasted to fp16 is legalized correctly without causing a width mismatch. +define void @constant_fold_vector_to_half() { +; CHECK-LABEL: constant_fold_vector_to_half: +; CHECK: # %bb.0: +; CHECK-NEXT: movw $16384, (%rax) # imm = 0x4000 +; CHECK-NEXT: retq + store volatile half bitcast (<4 x i4> to half), half* undef + ret void +} + +; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing. +define void @pr38533_2(half %x) { +; SSE-LABEL: pr38533_2: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rax +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movw %ax, (%rax) +; SSE-NEXT: popq %rax +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX512-LABEL: pr38533_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: movw %ax, (%rax) +; AVX512-NEXT: retq + %a = bitcast half %x to <4 x i4> + store volatile <4 x i4> %a, <4 x i4>* undef + ret void +} + +; This case is a bitcast from fp16 to a 16-bit wide legal vector type. In this case the result type is legal when the bitcast gets type legalized. +define void @pr38533_3(half %x) { +; SSE-LABEL: pr38533_3: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rax +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: movw %ax, (%rsp) +; SSE-NEXT: movzwl (%rsp), %eax +; SSE-NEXT: movw %ax, (%rax) +; SSE-NEXT: popq %rax +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX512-LABEL: pr38533_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: movw %ax, (%rax) +; AVX512-NEXT: retq + %a = bitcast half %x to <16 x i1> + store volatile <16 x i1> %a, <16 x i1>* undef + ret void +} Index: vendor/llvm/dist-release_70/test/CodeGen/X86/pr38539.ll =================================================================== --- vendor/llvm/dist-release_70/test/CodeGen/X86/pr38539.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/CodeGen/X86/pr38539.ll (revision 338000) @@ -0,0 +1,314 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown -verify-machineinstrs | FileCheck %s --check-prefix=X86 + +; This test is targeted at 64-bit mode. It used to crash due to the creation of an EXTRACT_SUBREG after the peephole pass had ran. 
+define void @f() { +; X64-LABEL: f: +; X64: # %bb.0: # %BB +; X64-NEXT: pushq %rbp +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: pushq %r14 +; X64-NEXT: .cfi_def_cfa_offset 24 +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: subq $16, %rsp +; X64-NEXT: .cfi_def_cfa_offset 48 +; X64-NEXT: .cfi_offset %rbx, -32 +; X64-NEXT: .cfi_offset %r14, -24 +; X64-NEXT: .cfi_offset %rbp, -16 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: shlq $62, %rcx +; X64-NEXT: sarq $62, %rcx +; X64-NEXT: movq (%rsp), %r14 +; X64-NEXT: movb (%rax), %bpl +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: movq %r14, %rdx +; X64-NEXT: callq __modti3 +; X64-NEXT: andl $3, %edx +; X64-NEXT: cmpq %rax, %r14 +; X64-NEXT: sbbq %rdx, %rbx +; X64-NEXT: setb %sil +; X64-NEXT: setae %bl +; X64-NEXT: testb %al, %al +; X64-NEXT: setne %dl +; X64-NEXT: setne (%rax) +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: subb %sil, %cl +; X64-NEXT: # kill: def $eax killed $eax def $ax +; X64-NEXT: divb %al +; X64-NEXT: negb %bl +; X64-NEXT: cmpb %al, %al +; X64-NEXT: setle %al +; X64-NEXT: negb %al +; X64-NEXT: cbtw +; X64-NEXT: idivb %bl +; X64-NEXT: movsbl %ah, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: andl $1, %eax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: negq %rax +; X64-NEXT: negb %dl +; X64-NEXT: leaq -16(%rsp,%rax), %rax +; X64-NEXT: movq %rax, (%rax) +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cbtw +; X64-NEXT: idivb %dl +; X64-NEXT: movsbl %ah, %eax +; X64-NEXT: andb $1, %al +; X64-NEXT: movb %al, (%rax) +; X64-NEXT: addq $16, %rsp +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 24 +; X64-NEXT: popq %r14 +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %rbp +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X86-LABEL: f: +; X86: # %bb.0: # %BB +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $48, %esp +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll $30, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarl $30, %ecx +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movb (%eax), %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %edx +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $3, %eax +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: setae %dl +; X86-NEXT: sbbb %cl, %cl +; X86-NEXT: testb %al, %al +; X86-NEXT: setne %ch +; X86-NEXT: setne (%eax) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload +; X86-NEXT: movzbl %dh, %eax +; X86-NEXT: # kill: def $eax killed $eax def $ax +; X86-NEXT: divb %dh +; X86-NEXT: negb %ch +; X86-NEXT: negb %dl +; X86-NEXT: cmpb 
%al, %al +; X86-NEXT: setle %al +; X86-NEXT: negb %al +; X86-NEXT: cbtw +; X86-NEXT: idivb %dl +; X86-NEXT: movsbl %ah, %eax +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: negl %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: leal -4(%esp,%eax,4), %eax +; X86-NEXT: movl %eax, (%eax) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cbtw +; X86-NEXT: idivb %ch +; X86-NEXT: movsbl %ah, %eax +; X86-NEXT: andb $1, %al +; X86-NEXT: movb %al, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +BB: + %A30 = alloca i66 + %L17 = load i66, i66* %A30 + %B20 = and i66 %L17, -1 + %G2 = getelementptr i66, i66* %A30, i1 true + %L10 = load i8, i8* undef + %B6 = udiv i8 %L10, %L10 + %C15 = icmp eq i8 undef, 0 + %B8 = srem i66 0, %B20 + %C2 = icmp ule i66 %B8, %B20 + %B5 = or i8 0, %B6 + %C19 = icmp uge i1 false, %C2 + %C1 = icmp sle i8 undef, %B5 + %B37 = srem i1 %C1, %C2 + %C7 = icmp uge i1 false, %C15 + store i1 %C7, i1* undef + %G6 = getelementptr i66, i66* %G2, i1 %B37 + store i66* %G6, i66** undef + %B30 = srem i1 %C19, %C7 + store i1 %B30, i1* undef + ret void +} + +; Similar to above, but bitwidth adjusted to target 32-bit mode. This also shows that we didn't constrain the register class when extracting a subreg. +define void @g() { +; X64-LABEL: g: +; X64: # %bb.0: # %BB +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; X64-NEXT: shlq $32, %rsi +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: shlq $30, %rdi +; X64-NEXT: sarq $30, %rdi +; X64-NEXT: movb (%rax), %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: # kill: def $eax killed $eax def $ax +; X64-NEXT: divb %al +; X64-NEXT: movl %eax, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivq %rdi +; X64-NEXT: movabsq $17179869183, %rax # imm = 0x3FFFFFFFF +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: testb %al, %al +; X64-NEXT: setne %dil +; X64-NEXT: setne (%rax) +; X64-NEXT: cmpq %rsi, %rax +; X64-NEXT: seta %dl +; X64-NEXT: setbe %cl +; X64-NEXT: negb %cl +; X64-NEXT: cmpb %r8b, %al +; X64-NEXT: setle %al +; X64-NEXT: negb %al +; X64-NEXT: cbtw +; X64-NEXT: idivb %cl +; X64-NEXT: movsbl %ah, %eax +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: andl $1, %eax +; X64-NEXT: shlq $3, %rax +; X64-NEXT: negq %rax +; X64-NEXT: negb %dil +; X64-NEXT: negb %dl +; X64-NEXT: leaq -16(%rsp,%rax), %rax +; X64-NEXT: movq %rax, (%rax) +; X64-NEXT: movl %edx, %eax +; X64-NEXT: cbtw +; X64-NEXT: idivb %dil +; X64-NEXT: movsbl %ah, %eax +; X64-NEXT: andb $1, %al +; X64-NEXT: movb %al, (%rax) +; X64-NEXT: retq +; +; X86-LABEL: g: +; X86: # %bb.0: # %BB +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll $30, %eax +; X86-NEXT: sarl $30, %eax +; X86-NEXT: movl (%esp), %edi +; X86-NEXT: movb (%eax), %bl +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $0 +; X86-NEXT: calll __moddi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: andl $3, %edx +; X86-NEXT: cmpl %eax, %edi +; 
X86-NEXT: sbbl %edx, %esi +; X86-NEXT: setb %dl +; X86-NEXT: setae %dh +; X86-NEXT: testb %al, %al +; X86-NEXT: setne %bh +; X86-NEXT: setne (%eax) +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subb %dl, %cl +; X86-NEXT: # kill: def $eax killed $eax def $ax +; X86-NEXT: divb %bl +; X86-NEXT: negb %dh +; X86-NEXT: cmpb %al, %al +; X86-NEXT: setle %al +; X86-NEXT: negb %al +; X86-NEXT: cbtw +; X86-NEXT: idivb %dh +; X86-NEXT: movsbl %ah, %eax +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: shll $3, %eax +; X86-NEXT: negl %eax +; X86-NEXT: negb %bh +; X86-NEXT: leal -8(%esp,%eax), %eax +; X86-NEXT: movl %eax, (%eax) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cbtw +; X86-NEXT: idivb %bh +; X86-NEXT: movsbl %ah, %eax +; X86-NEXT: andb $1, %al +; X86-NEXT: movb %al, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +BB: + %A30 = alloca i34 + %L17 = load i34, i34* %A30 + %B20 = and i34 %L17, -1 + %G2 = getelementptr i34, i34* %A30, i1 true + %L10 = load i8, i8* undef + %B6 = udiv i8 %L10, %L10 + %C15 = icmp eq i8 undef, 0 + %B8 = srem i34 0, %B20 + %C2 = icmp ule i34 %B8, %B20 + %B5 = or i8 0, %B6 + %C19 = icmp uge i1 false, %C2 + %C1 = icmp sle i8 undef, %B5 + %B37 = srem i1 %C1, %C2 + %C7 = icmp uge i1 false, %C15 + store i1 %C7, i1* undef + %G6 = getelementptr i34, i34* %G2, i1 %B37 + store i34* %G6, i34** undef + %B30 = srem i1 %C19, %C7 + store i1 %B30, i1* undef + ret void +} Index: vendor/llvm/dist-release_70/test/Other/opt-bisect-legacy-pass-manager.ll =================================================================== --- vendor/llvm/dist-release_70/test/Other/opt-bisect-legacy-pass-manager.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/Other/opt-bisect-legacy-pass-manager.ll (revision 338000) @@ -1,178 +1,178 @@ ; This file verifies the behavior of the OptBisect class, which is used to ; diagnose optimization related failures. The tests check various ; invocations that result in different sets of optimization passes that ; are run in different ways. ; ; This set of tests exercises the legacy pass manager interface to the OptBisect ; class. Because the exact set of optimizations that will be run may ; change over time, these tests are written in a more general manner than the ; corresponding tests for the new pass manager. ; ; Don't use NEXT checks or hard-code pass numbering so that this won't fail if ; new passes are inserted. ; Verify that the file can be compiled to an object file at -O3 with all ; skippable passes skipped. ; REQUIRES: default_triple ; RUN: opt -O3 -opt-bisect-limit=0 < %s | llc -O3 -opt-bisect-limit=0 ; Verify that no skippable passes are run with -opt-bisect-limit=0. ; RUN: opt -disable-output -disable-verify -O3 -opt-bisect-limit=0 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-SKIP-ALL ; CHECK-SKIP-ALL: BISECT: NOT running pass ({{[0-9]+}}) ; CHECK-SKIP-ALL-NOT: BISECT: running pass ({{[0-9]+}}) ; Verify that no passes run at -O0 are skipped ; RUN: opt -opt-bisect-limit=0 < %s 2>&1 | FileCheck %s --check-prefix=OPTBISECT-O0 ; OPTBISECT-O0-NOT: BISECT: NOT running ; FIXME: There are still some AMDGPU passes being skipped that run at -O0. ; XFAIL: r600, amdgcn ; Verify that we can use the opt-bisect-helper.py script (derived from ; utils/bisect) to locate the optimization that inlines the call to ; f2() in f3(). 
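; (Editorial sketch, not a RUN line: the same search can be done by hand by
; bisecting on N in `opt -O3 -opt-bisect-limit=N ...` and checking whether the
; `call i32 @f2()` below is still present in the output; the helper script
; simply automates that binary search. The exact limit at which inlining
; happens depends on the pass pipeline, which is why the script is used here
; instead of a hard-coded number.)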
-; RUN: '%python' %S/opt-bisect-helper.py --start=0 --end=256 --optcmd=opt \ +; RUN: %python %S/opt-bisect-helper.py --start=0 --end=256 --optcmd=opt \ ; RUN: --filecheckcmd=FileCheck --test=%s \ ; RUN: --prefix=CHECK-BISECT-INLINE-HELPER \ ; RUN: | FileCheck %s --check-prefix=CHECK-BISECT-INLINE-RESULT ; The helper script uses this to find the optimization that inlines the call. ; CHECK-BISECT-INLINE-HELPER: call i32 @f2() ; These checks verifies that the optimization was found. ; CHECK-BISECT-INLINE-RESULT-NOT: Last good count: 0 ; CHECK-BISECT-INLINE-RESULT: Last good count: {{[0-9]+}} ; Test a module pass. ; RUN: opt -disable-output -disable-verify -deadargelim -opt-bisect-limit=-1 %s \ ; RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-DEADARG ; CHECK-DEADARG: BISECT: running pass ({{[0-9]+}}) Dead Argument Elimination on module ; RUN: opt -disable-output -disable-verify -deadargelim -opt-bisect-limit=0 %s \ ; RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-NOT-DEADARG ; CHECK-NOT-DEADARG: BISECT: NOT running pass ({{[0-9]+}}) Dead Argument Elimination on module ; Test an SCC pass. ; RUN: opt -disable-output -disable-verify -inline -opt-bisect-limit=-1 %s \ ; RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE ; CHECK-INLINE: BISECT: running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (<>) ; CHECK-INLINE: BISECT: running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (g) ; CHECK-INLINE: BISECT: running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (f1) ; CHECK-INLINE: BISECT: running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (f2) ; CHECK-INLINE: BISECT: running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (f3) ; CHECK-INLINE: BISECT: running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (<>) ; RUN: opt -disable-output -disable-verify -inline -opt-bisect-limit=0 %s \ ; RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-NOT-INLINE ; CHECK-NOT-INLINE: BISECT: NOT running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (<>) ; CHECK-NOT-INLINE: BISECT: NOT running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (g) ; CHECK-NOT-INLINE: BISECT: NOT running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (f1) ; CHECK-NOT-INLINE: BISECT: NOT running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (f2) ; CHECK-NOT-INLINE: BISECT: NOT running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (f3) ; CHECK-NOT-INLINE: BISECT: NOT running pass ({{[0-9]+}}) Function Integration/Inlining on SCC (<>) ; Test a function pass. ; RUN: opt -disable-output -disable-verify -early-cse -opt-bisect-limit=-1 \ ; RUN: %s 2>&1 | FileCheck %s --check-prefix=CHECK-EARLY-CSE ; CHECK-EARLY-CSE: BISECT: running pass ({{[0-9]+}}) Early CSE on function (f1) ; CHECK-EARLY-CSE: BISECT: running pass ({{[0-9]+}}) Early CSE on function (f2) ; CHECK-EARLY-CSE: BISECT: running pass ({{[0-9]+}}) Early CSE on function (f3) ; RUN: opt -disable-output -disable-verify -early-cse -opt-bisect-limit=0 %s \ ; RUN: 2>&1 | FileCheck %s --check-prefix=CHECK-NOT-EARLY-CSE ; CHECK-NOT-EARLY-CSE: BISECT: NOT running pass ({{[0-9]+}}) Early CSE on function (f1) ; CHECK-NOT-EARLY-CSE: BISECT: NOT running pass ({{[0-9]+}}) Early CSE on function (f2) ; CHECK-NOT-EARLY-CSE: BISECT: NOT running pass ({{[0-9]+}}) Early CSE on function (f3) ; Test a loop pass. 
; RUN: opt -disable-output -disable-verify -loop-reduce -opt-bisect-limit=-1 \ ; RUN: %s 2>&1 | FileCheck %s --check-prefix=CHECK-LOOP-REDUCE ; CHECK-LOOP-REDUCE: BISECT: running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-LOOP-REDUCE: BISECT: running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-LOOP-REDUCE: BISECT: running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-LOOP-REDUCE: BISECT: running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-LOOP-REDUCE: BISECT: running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; RUN: opt -disable-output -disable-verify -loop-reduce -opt-bisect-limit=0 \ ; RUN: %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOT-LOOP-REDUCE ; CHECK-NOT-LOOP-REDUCE: BISECT: NOT running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-NOT-LOOP-REDUCE: BISECT: NOT running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-NOT-LOOP-REDUCE: BISECT: NOT running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-NOT-LOOP-REDUCE: BISECT: NOT running pass ({{[0-9]+}}) Loop Strength Reduction on loop ; CHECK-NOT-LOOP-REDUCE: BISECT: NOT running pass ({{[0-9]+}}) Loop Strength Reduction on loop declare i32 @g() define void @f1() { entry: br label %loop.0 loop.0: br i1 undef, label %loop.0.0, label %loop.1 loop.0.0: br i1 undef, label %loop.0.0, label %loop.0.1 loop.0.1: br i1 undef, label %loop.0.1, label %loop.0 loop.1: br i1 undef, label %loop.1, label %loop.1.bb1 loop.1.bb1: br i1 undef, label %loop.1, label %loop.1.bb2 loop.1.bb2: br i1 undef, label %end, label %loop.1.0 loop.1.0: br i1 undef, label %loop.1.0, label %loop.1 end: ret void } define i32 @f2() { entry: ret i32 0 } define i32 @f3() { entry: %temp = call i32 @g() %icmp = icmp ugt i32 %temp, 2 br i1 %icmp, label %bb.true, label %bb.false bb.true: %temp2 = call i32 @f2() ret i32 %temp2 bb.false: ret i32 0 } ; This function is here to verify that opt-bisect can skip all passes for ; functions that contain lifetime intrinsics. 
define void @f4() { entry: %i = alloca i32, align 4 %tmp = bitcast i32* %i to i8* call void @llvm.lifetime.start(i64 4, i8* %tmp) br label %for.cond for.cond: br i1 undef, label %for.body, label %for.end for.body: br label %for.cond for.end: ret void } declare void @llvm.lifetime.start(i64, i8* nocapture) Index: vendor/llvm/dist-release_70/test/TableGen/JSON.td =================================================================== --- vendor/llvm/dist-release_70/test/TableGen/JSON.td (revision 337999) +++ vendor/llvm/dist-release_70/test/TableGen/JSON.td (revision 338000) @@ -1,146 +1,146 @@ -// RUN: llvm-tblgen -dump-json %s | '%python' %S/JSON-check.py %s +// RUN: llvm-tblgen -dump-json %s | %python %S/JSON-check.py %s // CHECK: data['!tablegen_json_version'] == 1 // CHECK: all(data[s]['!name'] == s for s in data if not s.startswith("!")) class Base {} class Intermediate : Base {} class Derived : Intermediate {} def D : Intermediate {} // CHECK: 'D' in data['!instanceof']['Base'] // CHECK: 'D' in data['!instanceof']['Intermediate'] // CHECK: 'D' not in data['!instanceof']['Derived'] // CHECK: 'Base' in data['D']['!superclasses'] // CHECK: 'Intermediate' in data['D']['!superclasses'] // CHECK: 'Derived' not in data['D']['!superclasses'] def ExampleDagOp; def FieldKeywordTest { int a; field int b; // CHECK: 'a' not in data['FieldKeywordTest']['!fields'] // CHECK: 'b' in data['FieldKeywordTest']['!fields'] } class Variables { int i; string s; bit b; bits<8> bs; code c; list li; Base base; dag d; } def VarNull : Variables { // A variable not filled in at all has its value set to JSON // 'null', which translates to Python None // CHECK: data['VarNull']['i'] is None } def VarPrim : Variables { // Test initializers that map to primitive JSON types int i = 3; // CHECK: data['VarPrim']['i'] == 3 // Integer literals should be emitted in the JSON at full 64-bit // precision, for the benefit of JSON readers that preserve that // much information. Python's is one such. int enormous_pos = 9123456789123456789; int enormous_neg = -9123456789123456789; // CHECK: data['VarPrim']['enormous_pos'] == 9123456789123456789 // CHECK: data['VarPrim']['enormous_neg'] == -9123456789123456789 string s = "hello, world"; // CHECK: data['VarPrim']['s'] == 'hello, world' bit b = 0; // CHECK: data['VarPrim']['b'] == 0 // bits<> arrays are stored in logical order (array[i] is the same // bit identified in .td files as bs{i}), which means the _visual_ // order of the list (in default rendering) is reversed. 
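// (Reader note: in the declaration just below, the rightmost literal is bs{0}
// and the leftmost is bs{7}, so bs{0..2} = 1, bs{3} = 0, bs{4} = 1 and
// bs{5..7} = 0. Emitted in logical order, the JSON array therefore reads
// [1,1,1,0,1,0,0,0] even though the .td source spells it { 0,0,0,1,0,1,1,1 },
// exactly as the CHECK line verifies.)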
bits<8> bs = { 0,0,0,1,0,1,1,1 }; // CHECK: data['VarPrim']['bs'] == [ 1,1,1,0,1,0,0,0 ] code c = [{ \" }]; // CHECK: data['VarPrim']['c'] == r' \" ' list li = [ 1, 2, 3, 4 ]; // CHECK: data['VarPrim']['li'] == [ 1, 2, 3, 4 ] } def VarObj : Variables { // Test initializers that map to JSON objects containing a 'kind' // discriminator Base base = D; // CHECK: data['VarObj']['base']['kind'] == 'def' // CHECK: data['VarObj']['base']['def'] == 'D' // CHECK: data['VarObj']['base']['printable'] == 'D' dag d = (ExampleDagOp 22, "hello":$foo); // CHECK: data['VarObj']['d']['kind'] == 'dag' // CHECK: data['VarObj']['d']['operator']['kind'] == 'def' // CHECK: data['VarObj']['d']['operator']['def'] == 'ExampleDagOp' // CHECK: data['VarObj']['d']['operator']['printable'] == 'ExampleDagOp' // CHECK: data['VarObj']['d']['args'] == [[22, None], ["hello", "foo"]] // CHECK: data['VarObj']['d']['printable'] == '(ExampleDagOp 22, "hello":$foo)' int undef_int; field int ref_int = undef_int; // CHECK: data['VarObj']['ref_int']['kind'] == 'var' // CHECK: data['VarObj']['ref_int']['var'] == 'undef_int' // CHECK: data['VarObj']['ref_int']['printable'] == 'undef_int' bits<2> undef_bits; bits<4> ref_bits; let ref_bits{3-2} = 0b10; let ref_bits{1-0} = undef_bits{1-0}; // CHECK: data['VarObj']['ref_bits'][3] == 1 // CHECK: data['VarObj']['ref_bits'][2] == 0 // CHECK: data['VarObj']['ref_bits'][1]['kind'] == 'varbit' // CHECK: data['VarObj']['ref_bits'][1]['var'] == 'undef_bits' // CHECK: data['VarObj']['ref_bits'][1]['index'] == 1 // CHECK: data['VarObj']['ref_bits'][1]['printable'] == 'undef_bits{1}' // CHECK: data['VarObj']['ref_bits'][0]['kind'] == 'varbit' // CHECK: data['VarObj']['ref_bits'][0]['var'] == 'undef_bits' // CHECK: data['VarObj']['ref_bits'][0]['index'] == 0 // CHECK: data['VarObj']['ref_bits'][0]['printable'] == 'undef_bits{0}' field int complex_ref_int = !add(undef_int, 2); // CHECK: data['VarObj']['complex_ref_int']['kind'] == 'complex' // CHECK: data['VarObj']['complex_ref_int']['printable'] == '!add(undef_int, 2)' } // Test the !anonymous member. This is tricky because when a def is // anonymous, almost by definition, the test can't reliably predict // the name it will be stored under! So we have to search all the defs // in the JSON output looking for the one that has the test integer // field set to the right value. def Named { int AnonTestField = 1; } // CHECK: data['Named']['AnonTestField'] == 1 // CHECK: data['Named']['!anonymous'] is False def { int AnonTestField = 2; } // CHECK: next(rec for rec in data.values() if isinstance(rec, dict) and rec.get('AnonTestField') == 2)['!anonymous'] is True multiclass AnonTestMulticlass { def _plus_one { int AnonTestField = !add(base,1); } def { int AnonTestField = !add(base,2); } } defm NamedDefm : AnonTestMulticlass<10>; // CHECK: data['NamedDefm_plus_one']['!anonymous'] is False // CHECK: data['NamedDefm_plus_one']['AnonTestField'] == 11 // CHECK: next(rec for rec in data.values() if isinstance(rec, dict) and rec.get('AnonTestField') == 12)['!anonymous'] is True // D47431 clarifies that a named def inside a multiclass gives a // *non*-anonymous output record, even if the defm that instantiates // that multiclass is anonymous. 
defm : AnonTestMulticlass<20>; // CHECK: next(rec for rec in data.values() if isinstance(rec, dict) and rec.get('AnonTestField') == 21)['!anonymous'] is False // CHECK: next(rec for rec in data.values() if isinstance(rec, dict) and rec.get('AnonTestField') == 22)['!anonymous'] is True Index: vendor/llvm/dist-release_70/test/ThinLTO/X86/cache.ll =================================================================== --- vendor/llvm/dist-release_70/test/ThinLTO/X86/cache.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/ThinLTO/X86/cache.ll (revision 338000) @@ -1,144 +1,144 @@ ; Verify first that *without* hash, we don't use the cache. ; RUN: opt -module-summary %s -o %t.bc ; RUN: opt -module-summary %p/Inputs/cache.ll -o %t2.bc ; Verify that enabling caching is ignoring module without hash ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: ls %t.cache/llvmcache.timestamp ; RUN: ls %t.cache | count 1 ; Verify that enabling caching is ignoring module without hash with llvm-lto2 ; RUN: rm -Rf %t.cache ; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \ ; RUN: -r=%t2.bc,_main,plx \ ; RUN: -r=%t2.bc,_globalfunc,lx \ ; RUN: -r=%t.bc,_globalfunc,plx ; RUN: ls %t.cache | count 0 ; Repeat again, *with* hash this time. ; RUN: opt -module-hash -module-summary %s -o %t.bc ; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o %t2.bc ; Verify that enabling caching is working, and that the pruner only removes ; files matching the pattern "llvmcache-*". ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: touch -t 197001011200 %t.cache/llvmcache-foo %t.cache/foo ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: ls %t.cache | count 4 ; RUN: ls %t.cache/llvmcache.timestamp ; RUN: ls %t.cache/foo ; RUN: not ls %t.cache/llvmcache-foo ; RUN: ls %t.cache/llvmcache-* | count 2 ; Verify that enabling caching is working with llvm-lto2 ; RUN: rm -Rf %t.cache ; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \ ; RUN: -r=%t2.bc,_main,plx \ ; RUN: -r=%t2.bc,_globalfunc,lx \ ; RUN: -r=%t.bc,_globalfunc,plx ; RUN: ls %t.cache | count 2 ; RUN: ls %t.cache/llvmcache-* | count 2 ; Verify that caches with a timestamp older than the pruning interval ; will be pruned ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: touch -t 197001011200 %t.cache/llvmcache-foo ; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: not ls %t.cache/llvmcache-foo ; Verify that specifying a negative number for the pruning interval ; effectively disables the pruning ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: touch -t 197001011200 %t.cache/llvmcache-foo ; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-pruning-interval -1 ; RUN: ls %t.cache/llvmcache-foo ; Verify that the pruner doesn't run and a cache file is not deleted when: ; default values for pruning interval and cache expiration are used, ; llvmcache.timestamp is current, ; cache file is older than default cache expiration value. 
; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: touch -t 197001011200 %t.cache/llvmcache-foo ; RUN: touch %t.cache/llvmcache.timestamp ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: ls %t.cache/llvmcache-foo ; Verify that the pruner runs and a cache file is deleted when: ; pruning interval has value 0 (i.e. run garbage collector now) ; default value for cache expiration is used, ; llvmcache.timestamp is current, ; cache file is older than default cache expiration value. ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: touch -t 197001011200 %t.cache/llvmcache-foo ; RUN: touch %t.cache/llvmcache.timestamp ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-pruning-interval 0 ; RUN: not ls %t.cache/llvmcache-foo ; Populate the cache with files with "old" access times, then check llvm-lto updates these file times ; A negative pruning interval is used to avoid removing cache entries ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: touch -a -t 197001011200 %t.cache/llvmcache-* ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-pruning-interval -1 ; RUN: ls -ltu %t.cache/* | not grep 1970-01-01 ; Populate the cache with files with "old" access times, then check llvm-lto2 updates these file times ; RUN: rm -Rf %t.cache ; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \ ; RUN: -r=%t2.bc,_main,plx \ ; RUN: -r=%t2.bc,_globalfunc,lx \ ; RUN: -r=%t.bc,_globalfunc,plx ; RUN: touch -a -t 197001011200 %t.cache/llvmcache-* ; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \ ; RUN: -r=%t2.bc,_main,plx \ ; RUN: -r=%t2.bc,_globalfunc,lx \ ; RUN: -r=%t.bc,_globalfunc,plx ; RUN: ls -ltu %t.cache/* | not grep 1970-01-01 ; Verify that specifying max size for the cache directory prunes it to this ; size, removing the largest files first. ; RUN: rm -Rf %t.cache && mkdir %t.cache ; Create cache files with different sizes. ; Only 8B, 16B and 76B files should stay after pruning. 
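; (Reader note: with --thinlto-cache-max-size-bytes 100 below, pruning removes
; the largest entries first, so the 1024B and 77B files are deleted and
; 8 + 16 + 76 = 100 bytes remain, exactly at the limit. The later
; --thinlto-cache-max-size-files 2 run keeps only the two smallest files,
; 8B and 16B, by the same largest-first rule.)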
-; RUN: "%python" -c "with open(r'%t.cache/llvmcache-foo-1024', 'w') as file: file.truncate(1024)" -; RUN: "%python" -c "with open(r'%t.cache/llvmcache-foo-16', 'w') as file: file.truncate(16)" -; RUN: "%python" -c "with open(r'%t.cache/llvmcache-foo-8', 'w') as file: file.truncate(8)" -; RUN: "%python" -c "with open(r'%t.cache/llvmcache-foo-76', 'w') as file: file.truncate(76)" -; RUN: "%python" -c "with open(r'%t.cache/llvmcache-foo-77', 'w') as file: file.truncate(77)" +; RUN: %python -c "with open(r'%t.cache/llvmcache-foo-1024', 'w') as file: file.truncate(1024)" +; RUN: %python -c "with open(r'%t.cache/llvmcache-foo-16', 'w') as file: file.truncate(16)" +; RUN: %python -c "with open(r'%t.cache/llvmcache-foo-8', 'w') as file: file.truncate(8)" +; RUN: %python -c "with open(r'%t.cache/llvmcache-foo-76', 'w') as file: file.truncate(76)" +; RUN: %python -c "with open(r'%t.cache/llvmcache-foo-77', 'w') as file: file.truncate(77)" ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-max-size-bytes 100 ; RUN: ls %t.cache/llvmcache-foo-16 ; RUN: ls %t.cache/llvmcache-foo-8 ; RUN: ls %t.cache/llvmcache-foo-76 ; RUN: not ls %t.cache/llvmcache-foo-1024 ; RUN: not ls %t.cache/llvmcache-foo-77 ; Verify that specifying max number of files in the cache directory prunes ; it to this amount, removing the largest files first. ; RUN: rm -Rf %t.cache && mkdir %t.cache ; Create cache files with different sizes. ; Only 8B and 16B files should stay after pruning. -; RUN: "%python" -c "print(' ' * 1023)" > %t.cache/llvmcache-foo-1024 -; RUN: "%python" -c "print(' ' * 15)" > %t.cache/llvmcache-foo-16 -; RUN: "%python" -c "print(' ' * 7)" > %t.cache/llvmcache-foo-8 -; RUN: "%python" -c "print(' ' * 75)" > %t.cache/llvmcache-foo-76 -; RUN: "%python" -c "print(' ' * 76)" > %t.cache/llvmcache-foo-77 +; RUN: %python -c "print(' ' * 1023)" > %t.cache/llvmcache-foo-1024 +; RUN: %python -c "print(' ' * 15)" > %t.cache/llvmcache-foo-16 +; RUN: %python -c "print(' ' * 7)" > %t.cache/llvmcache-foo-8 +; RUN: %python -c "print(' ' * 75)" > %t.cache/llvmcache-foo-76 +; RUN: %python -c "print(' ' * 76)" > %t.cache/llvmcache-foo-77 ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-max-size-files 2 ; RUN: ls %t.cache/llvmcache-foo-16 ; RUN: ls %t.cache/llvmcache-foo-8 ; RUN: not ls %t.cache/llvmcache-foo-76 ; RUN: not ls %t.cache/llvmcache-foo-1024 ; RUN: not ls %t.cache/llvmcache-foo-77 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.11.0" define void @globalfunc() #0 { entry: ret void } Index: vendor/llvm/dist-release_70/test/Transforms/DeadStoreElimination/tail-byval.ll =================================================================== --- vendor/llvm/dist-release_70/test/Transforms/DeadStoreElimination/tail-byval.ll (nonexistent) +++ vendor/llvm/dist-release_70/test/Transforms/DeadStoreElimination/tail-byval.ll (revision 338000) @@ -0,0 +1,23 @@ +; RUN: opt -dse -S < %s | FileCheck %s + +; Don't eliminate stores to allocas before tail calls to functions that use +; byval. It's correct to mark calls like these as 'tail'. To implement this tail +; call, the backend should copy the bytes from the alloca into the argument area +; before clearing the stack. 
+ +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +declare void @g(i32* byval %p) + +define void @f(i32* byval %x) { +entry: + %p = alloca i32 + %v = load i32, i32* %x + store i32 %v, i32* %p + tail call void @g(i32* byval %p) + ret void +} +; CHECK-LABEL: define void @f(i32* byval %x) +; CHECK: store i32 %v, i32* %p +; CHECK: tail call void @g(i32* byval %p) Index: vendor/llvm/dist-release_70/test/Transforms/EarlyCSE/memoryssa.ll =================================================================== --- vendor/llvm/dist-release_70/test/Transforms/EarlyCSE/memoryssa.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/Transforms/EarlyCSE/memoryssa.ll (revision 338000) @@ -1,108 +1,150 @@ ; RUN: opt < %s -S -early-cse | FileCheck %s --check-prefix=CHECK-NOMEMSSA ; RUN: opt < %s -S -basicaa -early-cse-memssa | FileCheck %s ; RUN: opt < %s -S -passes='early-cse' | FileCheck %s --check-prefix=CHECK-NOMEMSSA ; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='early-cse-memssa' | FileCheck %s @G1 = global i32 zeroinitializer @G2 = global i32 zeroinitializer @G3 = global i32 zeroinitializer ;; Simple load value numbering across non-clobbering store. ; CHECK-LABEL: @test1( ; CHECK-NOMEMSSA-LABEL: @test1( define i32 @test1() { %V1 = load i32, i32* @G1 store i32 0, i32* @G2 %V2 = load i32, i32* @G1 ; CHECK-NOMEMSSA: sub i32 %V1, %V2 %Diff = sub i32 %V1, %V2 ret i32 %Diff ; CHECK: ret i32 0 } ;; Simple dead store elimination across non-clobbering store. ; CHECK-LABEL: @test2( ; CHECK-NOMEMSSA-LABEL: @test2( define void @test2() { entry: %V1 = load i32, i32* @G1 ; CHECK: store i32 0, i32* @G2 store i32 0, i32* @G2 ; CHECK-NOT: store ; CHECK-NOMEMSSA: store i32 %V1, i32* @G1 store i32 %V1, i32* @G1 ret void } ;; Check that memoryphi optimization happens during EarlyCSE, enabling ;; more load CSE opportunities. ; CHECK-LABEL: @test_memphiopt( ; CHECK-NOMEMSSA-LABEL: @test_memphiopt( define void @test_memphiopt(i1 %c, i32* %p) { ; CHECK-LABEL: entry: ; CHECK-NOMEMSSA-LABEL: entry: entry: ; CHECK: load ; CHECK-NOMEMSSA: load %v1 = load i32, i32* @G1 br i1 %c, label %then, label %end ; CHECK-LABEL: then: ; CHECK-NOMEMSSA-LABEL: then: then: ; CHECK: load ; CHECK-NOMEMSSA: load %pv = load i32, i32* %p ; CHECK-NOT: store ; CHECK-NOMEMSSA-NOT: store store i32 %pv, i32* %p br label %end ; CHECK-LABEL: end: ; CHECK-NOMEMSSA-LABEL: end: end: ; CHECK-NOT: load ; CHECK-NOMEMSSA: load %v2 = load i32, i32* @G1 %sum = add i32 %v1, %v2 store i32 %sum, i32* @G2 ret void } ;; Check that MemoryPhi optimization and MemoryUse re-optimization ;; happens during EarlyCSE, enabling more load CSE opportunities. ; CHECK-LABEL: @test_memphiopt2( ; CHECK-NOMEMSSA-LABEL: @test_memphiopt2( define void @test_memphiopt2(i1 %c, i32* %p) { ; CHECK-LABEL: entry: ; CHECK-NOMEMSSA-LABEL: entry: entry: ; CHECK: load ; CHECK-NOMEMSSA: load %v1 = load i32, i32* @G1 ; CHECK: store ; CHECK-NOMEMSSA: store store i32 %v1, i32* @G2 br i1 %c, label %then, label %end ; CHECK-LABEL: then: ; CHECK-NOMEMSSA-LABEL: then: then: ; CHECK: load ; CHECK-NOMEMSSA: load %pv = load i32, i32* %p ; CHECK-NOT: store ; CHECK-NOMEMSSA-NOT: store store i32 %pv, i32* %p br label %end ; CHECK-LABEL: end: ; CHECK-NOMEMSSA-LABEL: end: end: ; CHECK-NOT: load ; CHECK-NOMEMSSA: load %v2 = load i32, i32* @G1 store i32 %v2, i32* @G3 ret void } + +;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting +;; stores that, without the lifetime calls, would be writebacks. 
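+;; (Editorial reader note, not part of the test: a store that writes a value
+;; straight back to the address it was just loaded from is normally a
+;; removable no-op, but the lifetime.end/lifetime.start pair between the load
+;; and the store below invalidates the old contents, so EarlyCSE must keep
+;; the store.)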
+; CHECK-LABEL: @test_writeback_lifetimes( +; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes( +define void @test_writeback_lifetimes(i32* %p) { +entry: + %q = getelementptr i32, i32* %p, i64 1 + %pv = load i32, i32* %p + %qv = load i32, i32* %q + call void @llvm.lifetime.end.p0i8(i64 8, i32* %p) + call void @llvm.lifetime.start.p0i8(i64 8, i32* %p) + ; CHECK: store i32 %pv + ; CHECK-NOMEMSSA-LABEL: store i32 %pv + store i32 %pv, i32* %p + ; CHECK: store i32 %qv, i32* %q + ; CHECK-NOMEMSSA-LABEL: store i32 %qv, i32* %q + store i32 %qv, i32* %q + ret void +} + +;; Check that we respect lifetime.start/lifetime.end intrinsics when deleting +;; stores that, without the lifetime calls, would be writebacks. +; CHECK-LABEL: @test_writeback_lifetimes_multi_arg( +; CHECK-NOMEMSSA-LABEL: @test_writeback_lifetimes_multi_arg( +define void @test_writeback_lifetimes_multi_arg(i32* %p, i32* %q) { +entry: + %pv = load i32, i32* %p + %qv = load i32, i32* %q + call void @llvm.lifetime.end.p0i8(i64 8, i32* %p) + call void @llvm.lifetime.start.p0i8(i64 8, i32* %p) + ; CHECK: store i32 %pv + ; CHECK-NOMEMSSA-LABEL: store i32 %pv + store i32 %pv, i32* %p + ; CHECK: store i32 %qv, i32* %q + ; CHECK-NOMEMSSA-LABEL: store i32 %qv, i32* %q + store i32 %qv, i32* %q + ret void +} + +declare void @llvm.lifetime.end.p0i8(i64, i32*) +declare void @llvm.lifetime.start.p0i8(i64, i32*) Index: vendor/llvm/dist-release_70/test/Transforms/SLPVectorizer/AArch64/PR38339.ll =================================================================== --- vendor/llvm/dist-release_70/test/Transforms/SLPVectorizer/AArch64/PR38339.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/Transforms/SLPVectorizer/AArch64/PR38339.ll (revision 338000) @@ -1,29 +1,124 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -slp-vectorizer -S -mtriple=aarch64-apple-ios -mcpu=cyclone -o - %s | FileCheck %s define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 ; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 ; CHECK-NEXT: ret void ; %t2 = extractelement <2 x i16> %x, i32 0 %t3 = extractelement <2 x i16> %x, i32 1 %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 store i16 %t2, i16* %a store i16 %t2, i16* %ptr0 store i16 %t3, i16* %ptr1 store i16 %t3, i16* %ptr2 store i16 %t2, i16* %ptr3 ret void } + +define void @f2(<2 x i16> %x, i16* %a) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[CONT:%.*]] +; CHECK: cont: +; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] +; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] 
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* +; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %cont + +cont: ; preds = %entry, %cont + %xx = phi <2 x i16> [ %x, %entry ], [ undef, %cont ] + %aa = phi i16* [ %a, %entry ], [ undef, %cont ] + %t2 = extractelement <2 x i16> %xx, i32 0 + %t3 = extractelement <2 x i16> %xx, i32 1 + %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 + %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 + %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 + %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 + store i16 %t2, i16* %a + store i16 %t2, i16* %ptr0 + store i16 %t3, i16* %ptr1 + store i16 %t3, i16* %ptr2 + store i16 %t2, i16* %ptr3 + %a_val = load i16, i16* %a, align 2 + %cmp = icmp eq i16 %a_val, 0 + br i1 %cmp, label %cont, label %exit + +exit: ; preds = %cont + ret void +} + +define void @f3(<2 x i16> %x, i16* %a) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[CONT:%.*]] +; CHECK: cont: +; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] +; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* +; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %cont + +cont: ; preds = %entry, %cont + %xx = phi <2 x i16> [ %x, %entry ], [ undef, %cont ] + %aa = phi i16* [ %a, %entry ], [ undef, %cont ] + %t2 = extractelement <2 x i16> %xx, i32 0 + %t3 = extractelement <2 x i16> %xx, i32 1 + %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 + %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* 
undef, i16 0, i16 1 + %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 + %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 + store i16 %t3, i16* %a + store i16 %t3, i16* %ptr0 + store i16 %t2, i16* %ptr1 + store i16 %t2, i16* %ptr2 + store i16 %t3, i16* %ptr3 + %a_val = load i16, i16* %a, align 2 + %cmp = icmp eq i16 %a_val, 0 + br i1 %cmp, label %cont, label %exit + +exit: ; preds = %cont + ret void +} Index: vendor/llvm/dist-release_70/test/tools/gold/X86/common.ll =================================================================== --- vendor/llvm/dist-release_70/test/tools/gold/X86/common.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/tools/gold/X86/common.ll (revision 338000) @@ -1,50 +1,50 @@ ; RUN: llvm-as %s -o %t1.o ; RUN: llvm-as %p/Inputs/common.ll -o %t2.o ; RUN: llvm-as %p/Inputs/common2.ll -o %t2b.o ; RUN: llvm-as %p/Inputs/common3.ll -o %t2c.o target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @a = common global i16 0, align 8 -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=emit-llvm \ ; RUN: -shared %t1.o %t2.o -o %t3.o ; RUN: llvm-dis %t3.o -o - | FileCheck %s --check-prefix=A ; Shared library case, we merge @a as common and keep it for the symbol table. ; A: @a = common global [4 x i8] zeroinitializer, align 8 -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=emit-llvm \ ; RUN: -shared %t1.o %t2b.o -o %t3.o ; RUN: llvm-dis %t3.o -o - | FileCheck %s --check-prefix=B ; (i16 align 8) + (i8 align 16) = i16 align 16 ; B: @a = common global i16 0, align 16 -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=emit-llvm \ ; RUN: -shared %t1.o %t2c.o -o %t3.o ; RUN: llvm-dis %t3.o -o - | FileCheck %s --check-prefix=C ; (i16 align 8) + (i8 align 1) = i16 align 8. ; C: @a = common global i16 0, align 8 -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=emit-llvm \ ; RUN: %t1.o %t2.o -o %t3.o ; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=EXEC %s ; All IR case, we internalize a after merging. ; EXEC: @a = internal global [4 x i8] zeroinitializer, align 8 ; RUN: llc %p/Inputs/common.ll -o %t2native.o -filetype=obj -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=emit-llvm \ ; RUN: %t1.o %t2native.o -o %t3.o ; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=MIXED %s ; Mixed ELF and IR. We keep ours as common so the linker will finish the merge. 
; MIXED: @a = common dso_local global i16 0, align 8 Index: vendor/llvm/dist-release_70/test/tools/gold/X86/v1.16/wrap-1.ll =================================================================== --- vendor/llvm/dist-release_70/test/tools/gold/X86/v1.16/wrap-1.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/tools/gold/X86/v1.16/wrap-1.ll (revision 338000) @@ -1,42 +1,42 @@ ; LTO ; RUN: llvm-as %s -o %t.o -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext %t.o -o %t.out -wrap=bar -plugin-opt=save-temps +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext %t.o -o %t.out -wrap=bar -plugin-opt=save-temps ; RUN: llvm-readobj -t %t.out | FileCheck %s ; RUN: cat %t.out.resolution.txt | FileCheck -check-prefix=RESOLS %s ; ThinLTO ; RUN: opt -module-summary %s -o %t.o -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext %t.o -o %t.out -wrap=bar -plugin-opt=save-temps +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext %t.o -o %t.out -wrap=bar -plugin-opt=save-temps ; RUN: llvm-readobj -t %t.out | FileCheck %s ; RUN: cat %t.out.resolution.txt | FileCheck -check-prefix=RESOLS %s ; CHECK: Name: __wrap_bar ; CHECK-NEXT: Value: ; CHECK-NEXT: Size: ; CHECK-NEXT: Binding: Global ; CHECK-NEXT: Type: Function ; Make sure that the 'r' (linker redefined) bit is set for bar and __real_bar ; in the resolutions file, and that the 'x' (visible to regular obj) bit is set ; for bar and __wrap_bar. ; RESOLS: ,bar,lxr ; RESOLS: ,__wrap_bar,plx ; RESOLS: ,__real_bar,plr target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" declare void @bar() define void @_start() { call void @bar() ret void } define void @__wrap_bar() { ret void } define void @__real_bar() { ret void } Index: vendor/llvm/dist-release_70/test/tools/gold/X86/v1.16/wrap-2.ll =================================================================== --- vendor/llvm/dist-release_70/test/tools/gold/X86/v1.16/wrap-2.ll (revision 337999) +++ vendor/llvm/dist-release_70/test/tools/gold/X86/v1.16/wrap-2.ll (revision 338000) @@ -1,55 +1,55 @@ ; LTO ; This doesn't currently work with gold, because it does not apply defsym ; renaming to symbols in the same module (apparently by design for consistency ; with GNU ld). Because regular LTO hands back a single object file to gold, ; it doesn't perform the desired defsym renaming. This isn't an issue with ; ThinLTO which hands back multiple native objects to gold. For regular ; LTO defsym handling, gold will need a fix (not the gold plugin). ; RUN-TODO: llvm-as %s -o %t.o ; RUN-TODO: llvm-as %S/Inputs/wrap-bar.ll -o %t1.o -; RUN-TODO: %gold -plugin %llvmshlibdir/LLVMgold%shlibext %t.o %t1.o -shared -o %t.so -wrap=bar +; RUN-TODO: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext %t.o %t1.o -shared -o %t.so -wrap=bar ; RUN-TODO: llvm-objdump -d %t.so | FileCheck %s ; RUN-TODO: llvm-readobj -t %t.so | FileCheck -check-prefix=BIND %s ; ThinLTO ; RUN: opt -module-summary %s -o %t.o ; RUN: opt -module-summary %S/Inputs/wrap-bar.ll -o %t1.o -; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext %t.o %t1.o -shared -o %t.so -wrap=bar +; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext %t.o %t1.o -shared -o %t.so -wrap=bar ; RUN: llvm-objdump -d %t.so | FileCheck %s -check-prefix=THIN ; RUN: llvm-readobj -t %t.so | FileCheck -check-prefix=BIND %s ; Make sure that calls in foo() are not eliminated and that bar is ; routed to __wrap_bar and __real_bar is routed to bar. 
; CHECK: foo: ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: callq{{.*}}<__wrap_bar> ; CHECK-NEXT: callq{{.*}} ; THIN: foo: ; THIN-NEXT: pushq %rax ; THIN-NEXT: callq{{.*}}<__wrap_bar> ; THIN-NEXT: popq %rax ; THIN-NEXT: jmp{{.*}} ; Check that bar and __wrap_bar retain their original binding. ; BIND: Name: bar ; BIND-NEXT: Value: ; BIND-NEXT: Size: ; BIND-NEXT: Binding: Local ; BIND: Name: __wrap_bar ; BIND-NEXT: Value: ; BIND-NEXT: Size: ; BIND-NEXT: Binding: Local target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" declare void @bar() declare void @__real_bar() define void @foo() { call void @bar() call void @__real_bar() ret void } Index: vendor/llvm/dist-release_70/test/tools/llvm-cov/showLineExecutionCounts.cpp =================================================================== --- vendor/llvm/dist-release_70/test/tools/llvm-cov/showLineExecutionCounts.cpp (revision 337999) +++ vendor/llvm/dist-release_70/test/tools/llvm-cov/showLineExecutionCounts.cpp (revision 338000) @@ -1,77 +1,77 @@ // Basic handling of line counts. // RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata // before any coverage // WHOLE-FILE: [[@LINE]]| |// before // FILTER-NOT: [[@LINE-1]]| |// before int main() { // TEXT: [[@LINE]]| 161|int main( int x = 0; // TEXT: [[@LINE]]| 161| int x // TEXT: [[@LINE]]| 161| if (x) { // TEXT: [[@LINE]]| 161| if (x) x = 0; // TEXT: [[@LINE]]| 0| x = 0 } else { // TEXT: [[@LINE]]| 161| } else x = 1; // TEXT: [[@LINE]]| 161| x = 1 } // TEXT: [[@LINE]]| 161| } // TEXT: [[@LINE]]| 161| for (int i = 0; i < 100; ++i) { // TEXT: [[@LINE]]| 16.2k| for ( x = 1; // TEXT: [[@LINE]]| 16.1k| x = 1 } // TEXT: [[@LINE]]| 16.1k| } // TEXT: [[@LINE]]| 161| x = x < 10 ? x + 1 : x - 1; // TEXT: [[@LINE]]| 161| x = x = x > 10 ? // TEXT: [[@LINE]]| 161| x = x - 1: // TEXT: [[@LINE]]| 0| x x + 1; // TEXT: [[@LINE]]| 161| x // TEXT: [[@LINE]]| 161| return 0; // TEXT: [[@LINE]]| 161| return } // TEXT: [[@LINE]]| 161|} // after coverage // WHOLE-FILE: [[@LINE]]| |// after // FILTER-NOT: [[@LINE-1]]| |// after // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck -check-prefixes=TEXT,WHOLE-FILE %s // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S -name=main %s | FileCheck -check-prefixes=TEXT,FILTER %s // Test -output-dir. 
// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -o %t.dir -instr-profile %t.profdata -path-equivalence=/tmp,%S %s // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -output-dir %t.filtered.dir -instr-profile %t.profdata -path-equivalence=/tmp,%S -name=main %s // RUN: FileCheck -check-prefixes=TEXT,WHOLE-FILE -input-file %t.dir/coverage/tmp/showLineExecutionCounts.cpp.txt %s // RUN: FileCheck -check-prefixes=TEXT,FILTER -input-file %t.filtered.dir/coverage/tmp/showLineExecutionCounts.cpp.txt %s // // RUN: llvm-cov export %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -name=main 2>/dev/null > %t.export.json // RUN: FileCheck -input-file %t.export.json %S/Inputs/lineExecutionCounts.json -// RUN: cat %t.export.json | "%python" -c "import json, sys; json.loads(sys.stdin.read())" +// RUN: cat %t.export.json | %python -c "import json, sys; json.loads(sys.stdin.read())" // // RUN: llvm-cov export %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata 2>/dev/null -summary-only > %t.export-summary.json // RUN: not grep '"name":"main"' %t.export-summary.json // // Test html output. // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.html.dir -instr-profile %t.profdata -path-equivalence=/tmp,%S %s // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.html.filtered.dir -instr-profile %t.profdata -path-equivalence=/tmp,%S -name=main %s // RUN: FileCheck -check-prefixes=HTML,HTML-WHOLE-FILE -input-file %t.html.dir/coverage/tmp/showLineExecutionCounts.cpp.html %s // RUN: FileCheck -check-prefixes=HTML,HTML-FILTER -input-file %t.html.filtered.dir/coverage/tmp/showLineExecutionCounts.cpp.html %s // // HTML-WHOLE-FILE:
4
// before
 // HTML-FILTER-NOT: 
4
// before
 // HTML: 
6
161
int main() {
 // HTML-WHOLE-FILE: 
26
// after
 // HTML-FILTER-NOT: 
26
// after
 //
 // Test index creation.
 // RUN: FileCheck -check-prefix=TEXT-INDEX -input-file %t.dir/index.txt %s
 // TEXT-INDEX: Filename
 // TEXT-INDEX-NEXT: ---
 // TEXT-INDEX-NEXT: {{.*}}showLineExecutionCounts.cpp
 //
 // RUN: FileCheck -check-prefix HTML-INDEX -input-file %t.html.dir/index.html %s
 // HTML-INDEX-LABEL: 
 // HTML-INDEX: 
 // HTML-INDEX: 
 // HTML-INDEX: 
 // HTML-INDEX: 
 // HTML-INDEX: 
 // HTML-INDEX: 
 // HTML-INDEX: Totals
Index: vendor/llvm/dist-release_70/test/tools/llvm-objcopy/auto-remove-shndx.test
===================================================================
--- vendor/llvm/dist-release_70/test/tools/llvm-objcopy/auto-remove-shndx.test	(revision 337999)
+++ vendor/llvm/dist-release_70/test/tools/llvm-objcopy/auto-remove-shndx.test	(revision 338000)
@@ -1,5 +1,5 @@
-# RUN: '%python' %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t
+# RUN: %python %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t
 # RUN: llvm-objcopy -R .text -R s0 -R s1 -R s2 -R s3 -R s4 -R s5 -R s6 %t %t2
 # RUN: llvm-readobj -sections %t2 | FileCheck --check-prefix=SECS %s
 
 # SECS-NOT: Name: .symtab_shndx
Index: vendor/llvm/dist-release_70/test/tools/llvm-objcopy/many-sections.test
===================================================================
--- vendor/llvm/dist-release_70/test/tools/llvm-objcopy/many-sections.test	(revision 337999)
+++ vendor/llvm/dist-release_70/test/tools/llvm-objcopy/many-sections.test	(revision 338000)
@@ -1,53 +1,53 @@
-RUN: '%python' %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t
+RUN: %python %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t
 RUN: llvm-objcopy %t %t2
 RUN: llvm-readobj -file-headers %t2 | FileCheck --check-prefix=EHDR %s
 RUN: llvm-readobj -sections %t2 | FileCheck --check-prefix=SECS %s
 RUN: llvm-readobj -symbols %t2 | grep "Symbol {" | wc -l | FileCheck --check-prefix=SYMS %s
 
 EHDR:      Format: ELF64-x86-64
 EHDR-NEXT: Arch: x86_64
 EHDR-NEXT: AddressSize: 64bit
 EHDR-NEXT: LoadName:
 EHDR-NEXT: ElfHeader {
 EHDR-NEXT:   Ident {
 EHDR-NEXT:     Magic: (7F 45 4C 46)
 EHDR-NEXT:     Class: 64-bit (0x2)
 EHDR-NEXT:     DataEncoding: LittleEndian (0x1)
 EHDR-NEXT:     FileVersion: 1
 EHDR-NEXT:     OS/ABI: SystemV (0x0)
 EHDR-NEXT:     ABIVersion: 0
 EHDR-NEXT:     Unused: (00 00 00 00 00 00 00)
 EHDR-NEXT:   }
 EHDR-NEXT:   Type: Relocatable (0x1)
 EHDR-NEXT:   Machine: EM_X86_64 (0x3E)
 EHDR-NEXT:   Version: 1
 EHDR-NEXT:   Entry: 0x0
 EHDR-NEXT:   ProgramHeaderOffset: 0x40
 EHDR-NEXT:   SectionHeaderOffset:
 EHDR-NEXT:   Flags [ (0x0)
 EHDR-NEXT:   ]
 EHDR-NEXT:   HeaderSize: 64
 EHDR-NEXT:   ProgramHeaderEntrySize: 56
 EHDR-NEXT:   ProgramHeaderCount: 0
 EHDR-NEXT:   SectionHeaderEntrySize: 64
 EHDR-NEXT:   SectionHeaderCount: 0
 EHDR-NEXT:   StringTableSectionIndex: 65535
 EHDR-NEXT: }
 
 SECS: Index: 65285
 SECS-NEXT: Name: .symtab
 SECS-NEXT: Type: SHT_SYMTAB
 SECS: Name: .symtab_shndx
 SECS-NEXT: Type: SHT_SYMTAB_SHNDX
 SECS-NEXT: Flags [ (0x0)
 SECS-NEXT: ]
 SECS-NEXT: Address: 0x0
 SECS-NEXT: Offset:
 # There should be #syms * EntrySize bytes.
 SECS-NEXT: Size: 261136
 SECS-NEXT: Link: 65285
 SECS-NEXT: Info:
 SECS-NEXT: AddressAlignment: 4
 SECS-NEXT: EntrySize: 4
 SECS: Index: 65287
 SYMS: 65284
Index: vendor/llvm/dist-release_70/test/tools/llvm-objcopy/remove-shndx.test
===================================================================
--- vendor/llvm/dist-release_70/test/tools/llvm-objcopy/remove-shndx.test	(revision 337999)
+++ vendor/llvm/dist-release_70/test/tools/llvm-objcopy/remove-shndx.test	(revision 338000)
@@ -1,7 +1,7 @@
 # This test checks to see that a .symtab_shndx section is added to any binary
 # that needs it, even if the original was removed.
-RUN: '%python' %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t
+RUN: %python %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t
 RUN: llvm-objcopy -R .symtab_shndx %t %t2
 RUN: llvm-readobj -sections %t2 | FileCheck %s
 
 CHECK: Name: .symtab_shndx (
Index: vendor/llvm/dist-release_70/test/tools/llvm-objcopy/strict-no-add.test
===================================================================
--- vendor/llvm/dist-release_70/test/tools/llvm-objcopy/strict-no-add.test	(revision 337999)
+++ vendor/llvm/dist-release_70/test/tools/llvm-objcopy/strict-no-add.test	(revision 338000)
@@ -1,10 +1,10 @@
 # This test makes sure that sections added at the end that don't have symbols
 # defined in them don't trigger the creation of a large index table.
 
-RUN: '%python' %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t.0
+RUN: %python %p/Inputs/ungzip.py %p/Inputs/many-sections.o.gz > %t.0
 RUN: cat %p/Inputs/alloc-symtab.o > %t
 RUN: llvm-objcopy -R .text -R s0 -R s1 -R s2 -R s3 -R s4 -R s5 -R s6 %t.0 %t2
 RUN: llvm-objcopy -add-section=.s0=%t -add-section=.s1=%t -add-section=.s2=%t %t2 %t2
 RUN: llvm-readobj -sections %t2 | FileCheck --check-prefix=SECS %s
 
 SECS-NOT: Name: .symtab_shndx
Index: vendor/llvm/dist-release_70/test/tools/llvm-symbolizer/pdb/pdb.test
===================================================================
--- vendor/llvm/dist-release_70/test/tools/llvm-symbolizer/pdb/pdb.test	(revision 337999)
+++ vendor/llvm/dist-release_70/test/tools/llvm-symbolizer/pdb/pdb.test	(revision 338000)
@@ -1,49 +1,49 @@
 RUN: grep '^ADDR:' %s | sed -s 's/ADDR: //' \
 RUN: 	| llvm-symbolizer -obj="%p/Inputs/test.exe" \
 RUN:    | FileCheck %s
 RUN: grep '^ADDR:' %s | sed -s 's/ADDR: //' \
 RUN: 	| llvm-symbolizer -obj="%p/Inputs/test.exe" -demangle=false \
 RUN: 	| FileCheck %s --check-prefix=CHECK-NO-DEMANGLE
 
 Subtract ImageBase from all the offsets and run the test again with
 --relative-address.
 
 RUN: grep '^ADDR:' %s | sed -s 's/ADDR: //' \
-RUN: 	| "%python" -c 'import sys;print("\n".join([hex(int(x, 16) - 0x400000) for x in sys.stdin]))' \
+RUN: 	| %python -c 'import sys;print("\n".join([hex(int(x, 16) - 0x400000) for x in sys.stdin]))' \
 RUN:	| llvm-symbolizer -obj="%p/Inputs/test.exe" -demangle=false --relative-address \
 RUN:    | FileCheck %s --check-prefix=CHECK-NO-DEMANGLE
 
 ADDR: 0x401380
 ADDR: 0x401390
 ADDR: 0x4013A0
 ADDR: 0x4013C0
 ADDR: 0x4013D0
 ADDR: 0x4013E0
 ADDR: 0x4013F0
 ADDR: 0x401420
 
 CHECK: foo(void)
 CHECK-NEXT: test.cpp:10
 CHECK: {{^private_symbol$}}
 CHECK-NEXT: test.cpp:13:0
 CHECK: {{^main}}
 CHECK-NEXT: test.cpp:16:0
 CHECK: {{^foo_cdecl$}}
 CHECK: {{^foo_stdcall$}}
 CHECK: {{^foo_fastcall$}}
 CHECK: {{^foo_vectorcall$}}
 CHECK: NS::Foo::bar(void)
 CHECK-NEXT: test.cpp:6:0
 
 CHECK-NO-DEMANGLE: ?foo@@YAXXZ
 CHECK-NO-DEMANGLE-NEXT: test.cpp:10
 CHECK-NO-DEMANGLE: private_symbol
 CHECK-NO-DEMANGLE-NEXT: test.cpp:13
 CHECK-NO-DEMANGLE: _main
 CHECK-NO-DEMANGLE-NEXT: test.cpp:16
 CHECK-NO-DEMANGLE: _foo_cdecl
 CHECK-NO-DEMANGLE: _foo_stdcall@0
 CHECK-NO-DEMANGLE: @foo_fastcall@0
 CHECK-NO-DEMANGLE: foo_vectorcall@@0
 CHECK-NO-DEMANGLE: ?bar@Foo@NS@@QAEXXZ
 CHECK-NO-DEMANGLE-NEXT: test.cpp:6
Index: vendor/llvm/dist-release_70/test/tools/llvm-symbolizer/ppc64.test
===================================================================
--- vendor/llvm/dist-release_70/test/tools/llvm-symbolizer/ppc64.test	(revision 337999)
+++ vendor/llvm/dist-release_70/test/tools/llvm-symbolizer/ppc64.test	(revision 338000)
@@ -1,11 +1,11 @@
 // ppc64 was compiled from this source on a big-endian 64-bit PowerPC box
 // with just "clang -nostdlib":
 int foo() { return 0; }
 int bar() { return foo(); }
 int _start() { return bar(); }
 
-RUN: "%python" -c "print('0x1000014c\n0x1000018c\n0x100001cc')" | llvm-symbolizer -obj=%p/Inputs/ppc64 | FileCheck %s
+RUN: %python -c "print('0x1000014c\n0x1000018c\n0x100001cc')" | llvm-symbolizer -obj=%p/Inputs/ppc64 | FileCheck %s
 
 CHECK: foo
 CHECK: bar
 CHECK: _start
Index: vendor/llvm/dist-release_70/unittests/Analysis/MemorySSA.cpp
===================================================================
--- vendor/llvm/dist-release_70/unittests/Analysis/MemorySSA.cpp	(revision 337999)
+++ vendor/llvm/dist-release_70/unittests/Analysis/MemorySSA.cpp	(revision 338000)
@@ -1,1201 +1,1267 @@
 //===- MemorySSA.cpp - Unit tests for MemorySSA ---------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
 
 const static char DLString[] = "e-i64:64-f80:128-n8:16:32:64-S128";
 
 /// There's a lot of common setup between these tests. This fixture helps reduce
 /// that. Tests should mock up a function, store it in F, and then call
 /// setupAnalyses().
 class MemorySSATest : public testing::Test {
 protected:
   // N.B. Many of these members depend on each other (e.g. the Module depends on
   // the Context, etc.). So, order matters here (and in TestAnalyses).
   LLVMContext C;
   Module M;
   IRBuilder<> B;
   DataLayout DL;
   TargetLibraryInfoImpl TLII;
   TargetLibraryInfo TLI;
   Function *F;
 
   // Things that we need to build after the function is created.
   struct TestAnalyses {
     DominatorTree DT;
     AssumptionCache AC;
     AAResults AA;
     BasicAAResult BAA;
     // We need to defer MSSA construction until AA is *entirely* set up, which
     // requires calling addAAResult. Hence, we just use a pointer here.
     std::unique_ptr<MemorySSA> MSSA;
     MemorySSAWalker *Walker;
 
     TestAnalyses(MemorySSATest &Test)
         : DT(*Test.F), AC(*Test.F), AA(Test.TLI),
           BAA(Test.DL, *Test.F, Test.TLI, AC, &DT) {
       AA.addAAResult(BAA);
       MSSA = make_unique<MemorySSA>(*Test.F, &AA, &DT);
       Walker = MSSA->getWalker();
     }
   };
 
   std::unique_ptr<TestAnalyses> Analyses;
 
   void setupAnalyses() {
     assert(F);
     Analyses.reset(new TestAnalyses(*this));
   }
 
 public:
   MemorySSATest()
       : M("MemorySSATest", C), B(C), DL(DLString), TLI(TLII), F(nullptr) {}
 };
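 
 // A typical test built on this fixture follows one shape; this is only a
 // sketch of that pattern, and the concrete TEST_F bodies below are the
 // authoritative examples:
 //   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
 //                        GlobalValue::ExternalLinkage, "F", &M);
 //   B.SetInsertPoint(BasicBlock::Create(C, "", F));
 //   ... build the IR under test with B ...
 //   setupAnalyses();
 //   MemorySSA &MSSA = *Analyses->MSSA;
 //   MemorySSAWalker *Walker = Analyses->Walker;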
 
 TEST_F(MemorySSATest, CreateALoad) {
   // We create a diamond where there is a store on one side, and then after
   // building MemorySSA, create a load after the merge point, and use it to test
   // updating by creating an access for the load.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   Argument *PointerArg = &*F->arg_begin();
   B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
   // Add the load
   B.SetInsertPoint(Merge);
   LoadInst *LoadInst = B.CreateLoad(PointerArg);
 
   // MemoryPHI should already exist.
   MemoryPhi *MP = MSSA.getMemoryAccess(Merge);
   EXPECT_NE(MP, nullptr);
 
   // Create the load memory access
   MemoryUse *LoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       LoadInst, MP, Merge, MemorySSA::Beginning));
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
   EXPECT_TRUE(isa<MemoryPhi>(DefiningAccess));
   MSSA.verifyMemorySSA();
 }
 TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
   // We create a diamond, then build memoryssa with no memory accesses, and
   // incrementally update it by inserting a store in the entry, a load in the
   // merge point, then a store in the branch, another load in the merge point,
   // and then a store in the entry.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left, Left->begin());
   Argument *PointerArg = &*F->arg_begin();
   B.SetInsertPoint(Left);
   B.CreateBr(Merge);
   B.SetInsertPoint(Right);
   B.CreateBr(Merge);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
   // Add the store
   B.SetInsertPoint(Entry, Entry->begin());
   StoreInst *EntryStore = B.CreateStore(B.getInt8(16), PointerArg);
   MemoryAccess *EntryStoreAccess = Updater.createMemoryAccessInBB(
       EntryStore, nullptr, Entry, MemorySSA::Beginning);
   Updater.insertDef(cast<MemoryDef>(EntryStoreAccess));
 
   // Add the load
   B.SetInsertPoint(Merge, Merge->begin());
   LoadInst *FirstLoad = B.CreateLoad(PointerArg);
 
   // MemoryPHI should not already exist.
   MemoryPhi *MP = MSSA.getMemoryAccess(Merge);
   EXPECT_EQ(MP, nullptr);
 
   // Create the load memory access
   MemoryUse *FirstLoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       FirstLoad, nullptr, Merge, MemorySSA::Beginning));
   Updater.insertUse(FirstLoadAccess);
   // Should just have a load using the entry access, because it should discover
   // the phi is trivial
   EXPECT_EQ(FirstLoadAccess->getDefiningAccess(), EntryStoreAccess);
 
   // Create a store on the left
   // Add the store
   B.SetInsertPoint(Left, Left->begin());
   StoreInst *LeftStore = B.CreateStore(B.getInt8(16), PointerArg);
   MemoryAccess *LeftStoreAccess = Updater.createMemoryAccessInBB(
       LeftStore, nullptr, Left, MemorySSA::Beginning);
   Updater.insertDef(cast<MemoryDef>(LeftStoreAccess), false);
   // We don't touch existing loads, so we need to create a new one to get a phi
   // Add the second load
   B.SetInsertPoint(Merge, Merge->begin());
   LoadInst *SecondLoad = B.CreateLoad(PointerArg);
 
   // MemoryPHI should not already exist.
   MP = MSSA.getMemoryAccess(Merge);
   EXPECT_EQ(MP, nullptr);
 
   // Create the load memory access
   MemoryUse *SecondLoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       SecondLoad, nullptr, Merge, MemorySSA::Beginning));
   Updater.insertUse(SecondLoadAccess);
   // Now the load should be a phi of the entry store and the left store
   MemoryPhi *MergePhi =
       dyn_cast<MemoryPhi>(SecondLoadAccess->getDefiningAccess());
   EXPECT_NE(MergePhi, nullptr);
   EXPECT_EQ(MergePhi->getIncomingValue(0), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), LeftStoreAccess);
   // Now create a store below the existing one in the entry
   B.SetInsertPoint(Entry, --Entry->end());
   StoreInst *SecondEntryStore = B.CreateStore(B.getInt8(16), PointerArg);
   MemoryAccess *SecondEntryStoreAccess = Updater.createMemoryAccessInBB(
       SecondEntryStore, nullptr, Entry, MemorySSA::End);
   // Insert it twice just to test renaming
   Updater.insertDef(cast<MemoryDef>(SecondEntryStoreAccess), false);
   EXPECT_NE(FirstLoadAccess->getDefiningAccess(), MergePhi);
   Updater.insertDef(cast<MemoryDef>(SecondEntryStoreAccess), true);
   EXPECT_EQ(FirstLoadAccess->getDefiningAccess(), MergePhi);
   // and make sure the phi below it got updated, despite being blocks away
   MergePhi = dyn_cast<MemoryPhi>(SecondLoadAccess->getDefiningAccess());
   EXPECT_NE(MergePhi, nullptr);
   EXPECT_EQ(MergePhi->getIncomingValue(0), SecondEntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), LeftStoreAccess);
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, CreateALoadUpdater) {
   // We create a diamond, then build memoryssa with no memory accesses, and
   // incrementally update it by inserting a store in one of the branches, and a
   // load in the merge point
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left, Left->begin());
   Argument *PointerArg = &*F->arg_begin();
   B.SetInsertPoint(Left);
   B.CreateBr(Merge);
   B.SetInsertPoint(Right);
   B.CreateBr(Merge);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
   B.SetInsertPoint(Left, Left->begin());
   // Add the store
   StoreInst *SI = B.CreateStore(B.getInt8(16), PointerArg);
   MemoryAccess *StoreAccess =
       Updater.createMemoryAccessInBB(SI, nullptr, Left, MemorySSA::Beginning);
   Updater.insertDef(cast<MemoryDef>(StoreAccess));
 
   // Add the load
   B.SetInsertPoint(Merge, Merge->begin());
   LoadInst *LoadInst = B.CreateLoad(PointerArg);
 
   // MemoryPHI should not already exist.
   MemoryPhi *MP = MSSA.getMemoryAccess(Merge);
   EXPECT_EQ(MP, nullptr);
 
   // Create the load memory access
   MemoryUse *LoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       LoadInst, nullptr, Merge, MemorySSA::Beginning));
   Updater.insertUse(LoadAccess);
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
   EXPECT_TRUE(isa<MemoryPhi>(DefiningAccess));
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, SinkLoad) {
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left, Left->begin());
   Argument *PointerArg = &*F->arg_begin();
   B.SetInsertPoint(Left);
   B.CreateBr(Merge);
   B.SetInsertPoint(Right);
   B.CreateBr(Merge);
 
   // Load in left block
   B.SetInsertPoint(Left, Left->begin());
   LoadInst *LoadInst1 = B.CreateLoad(PointerArg);
   // Store in merge block
   B.SetInsertPoint(Merge, Merge->begin());
   B.CreateStore(B.getInt8(16), PointerArg);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
 
   // Mimic sinking of a load:
   // - clone load
   // - insert in "exit" block
   // - insert in mssa
   // - remove from original block
 
   LoadInst *LoadInstClone = cast<LoadInst>(LoadInst1->clone());
   Merge->getInstList().insert(Merge->begin(), LoadInstClone);
   MemoryAccess * NewLoadAccess =
       Updater.createMemoryAccessInBB(LoadInstClone, nullptr,
                                      LoadInstClone->getParent(),
                                      MemorySSA::Beginning);
   Updater.insertUse(cast<MemoryUse>(NewLoadAccess));
   MSSA.verifyMemorySSA();
   Updater.removeMemoryAccess(MSSA.getMemoryAccess(LoadInst1));
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, MoveAStore) {
   // We create a diamond where there is a store in the entry, a store on one side, and
   // a load at the end.  After building MemorySSA, we test updating by moving
   // the store from the side block to the entry block. This destroys the old
   // access.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   Argument *PointerArg = &*F->arg_begin();
   StoreInst *EntryStore = B.CreateStore(B.getInt8(16), PointerArg);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   StoreInst *SideStore = B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
   B.SetInsertPoint(Merge);
   B.CreateLoad(PointerArg);
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
   // Move the store
   SideStore->moveBefore(Entry->getTerminator());
   MemoryAccess *EntryStoreAccess = MSSA.getMemoryAccess(EntryStore);
   MemoryAccess *SideStoreAccess = MSSA.getMemoryAccess(SideStore);
   MemoryAccess *NewStoreAccess = Updater.createMemoryAccessAfter(
       SideStore, EntryStoreAccess, EntryStoreAccess);
   EntryStoreAccess->replaceAllUsesWith(NewStoreAccess);
   Updater.removeMemoryAccess(SideStoreAccess);
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, MoveAStoreUpdater) {
   // We create a diamond where there is a store in the entry, a store on one side, and
   // a load at the end.  After building MemorySSA, we test updating by moving
   // the store from the side block to the entry block.  This destroys the old
   // access.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   Argument *PointerArg = &*F->arg_begin();
   StoreInst *EntryStore = B.CreateStore(B.getInt8(16), PointerArg);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   auto *SideStore = B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
   B.SetInsertPoint(Merge);
   auto *MergeLoad = B.CreateLoad(PointerArg);
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
 
   // Move the store
   SideStore->moveBefore(Entry->getTerminator());
   auto *EntryStoreAccess = MSSA.getMemoryAccess(EntryStore);
   auto *SideStoreAccess = MSSA.getMemoryAccess(SideStore);
   auto *NewStoreAccess = Updater.createMemoryAccessAfter(
       SideStore, EntryStoreAccess, EntryStoreAccess);
   // Before, the load will point to a phi of the EntryStore and SideStore.
   auto *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(MergeLoad));
   EXPECT_TRUE(isa<MemoryPhi>(LoadAccess->getDefiningAccess()));
   MemoryPhi *MergePhi = cast<MemoryPhi>(LoadAccess->getDefiningAccess());
   EXPECT_EQ(MergePhi->getIncomingValue(1), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(0), SideStoreAccess);
   Updater.removeMemoryAccess(SideStoreAccess);
   Updater.insertDef(cast<MemoryDef>(NewStoreAccess));
   // After it's a phi of the new side store access.
   EXPECT_EQ(MergePhi->getIncomingValue(0), NewStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), NewStoreAccess);
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, MoveAStoreUpdaterMove) {
   // We create a diamond where there is a store in the entry, a store on one side, and
   // a load at the end.  After building MemorySSA, we test updating by moving
   // the store from the side block to the entry block.  This does not destroy
   // the old access.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   Argument *PointerArg = &*F->arg_begin();
   StoreInst *EntryStore = B.CreateStore(B.getInt8(16), PointerArg);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   auto *SideStore = B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
   B.SetInsertPoint(Merge);
   auto *MergeLoad = B.CreateLoad(PointerArg);
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
 
   // Move the store
   auto *EntryStoreAccess = MSSA.getMemoryAccess(EntryStore);
   auto *SideStoreAccess = MSSA.getMemoryAccess(SideStore);
   // Before, the load will point to a phi of the EntryStore and SideStore.
   auto *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(MergeLoad));
   EXPECT_TRUE(isa<MemoryPhi>(LoadAccess->getDefiningAccess()));
   MemoryPhi *MergePhi = cast<MemoryPhi>(LoadAccess->getDefiningAccess());
   EXPECT_EQ(MergePhi->getIncomingValue(1), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(0), SideStoreAccess);
   SideStore->moveBefore(*EntryStore->getParent(), ++EntryStore->getIterator());
   Updater.moveAfter(SideStoreAccess, EntryStoreAccess);
   // After, it's a phi of the side store.
   EXPECT_EQ(MergePhi->getIncomingValue(0), SideStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), SideStoreAccess);
 
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, MoveAStoreAllAround) {
   // We create a diamond where there is a store in the entry, a store on one side, and
   // a load at the end.  After building MemorySSA, we test updating by moving
   // the store from the side block to the entry block, then to the other side
   // block, then to before the load.  This does not destroy the old access.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   Argument *PointerArg = &*F->arg_begin();
   StoreInst *EntryStore = B.CreateStore(B.getInt8(16), PointerArg);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   auto *SideStore = B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
   B.SetInsertPoint(Merge);
   auto *MergeLoad = B.CreateLoad(PointerArg);
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
 
   // Move the store
   auto *EntryStoreAccess = MSSA.getMemoryAccess(EntryStore);
   auto *SideStoreAccess = MSSA.getMemoryAccess(SideStore);
   // Before, the load will point to a phi of the EntryStore and SideStore.
   auto *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(MergeLoad));
   EXPECT_TRUE(isa<MemoryPhi>(LoadAccess->getDefiningAccess()));
   MemoryPhi *MergePhi = cast<MemoryPhi>(LoadAccess->getDefiningAccess());
   EXPECT_EQ(MergePhi->getIncomingValue(1), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(0), SideStoreAccess);
   // Move the store before the entry store
   SideStore->moveBefore(*EntryStore->getParent(), EntryStore->getIterator());
   Updater.moveBefore(SideStoreAccess, EntryStoreAccess);
   // After, it's a phi of the entry store.
   EXPECT_EQ(MergePhi->getIncomingValue(0), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), EntryStoreAccess);
   MSSA.verifyMemorySSA();
   // Now move the store to the right branch
   SideStore->moveBefore(*Right, Right->begin());
   Updater.moveToPlace(SideStoreAccess, Right, MemorySSA::Beginning);
   MSSA.verifyMemorySSA();
   EXPECT_EQ(MergePhi->getIncomingValue(0), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), SideStoreAccess);
   // Now move it before the load
   SideStore->moveBefore(MergeLoad);
   Updater.moveBefore(SideStoreAccess, LoadAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(0), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(1), EntryStoreAccess);
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, RemoveAPhi) {
   // We create a diamond where there is a store on one side, and then a load
   // after the merge point.  This enables us to test a bunch of different
   // removal cases.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   Argument *PointerArg = &*F->arg_begin();
   StoreInst *StoreInst = B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
   B.SetInsertPoint(Merge);
   LoadInst *LoadInst = B.CreateLoad(PointerArg);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
 
   // Before, the load will be a use of a phi.
   MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(LoadInst));
   MemoryDef *StoreAccess = cast<MemoryDef>(MSSA.getMemoryAccess(StoreInst));
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
   EXPECT_TRUE(isa<MemoryPhi>(DefiningAccess));
   // Kill the store
   Updater.removeMemoryAccess(StoreAccess);
   MemoryPhi *MP = cast<MemoryPhi>(DefiningAccess);
   // Verify the phi ended up as liveonentry, liveonentry
   for (auto &Op : MP->incoming_values())
     EXPECT_TRUE(MSSA.isLiveOnEntryDef(cast<MemoryAccess>(Op.get())));
   // Replace the phi uses with the live on entry def
   MP->replaceAllUsesWith(MSSA.getLiveOnEntryDef());
   // Verify the load is now defined by liveOnEntryDef
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(LoadAccess->getDefiningAccess()));
   // Remove the PHI
   Updater.removeMemoryAccess(MP);
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, RemoveMemoryAccess) {
   // We create a diamond where there is a store on one side, and then a load
   // after the merge point.  This enables us to test a bunch of different
   // removal cases.
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
   BasicBlock *Entry(BasicBlock::Create(C, "", F));
   BasicBlock *Left(BasicBlock::Create(C, "", F));
   BasicBlock *Right(BasicBlock::Create(C, "", F));
   BasicBlock *Merge(BasicBlock::Create(C, "", F));
   B.SetInsertPoint(Entry);
   B.CreateCondBr(B.getTrue(), Left, Right);
   B.SetInsertPoint(Left);
   Argument *PointerArg = &*F->arg_begin();
   StoreInst *StoreInst = B.CreateStore(B.getInt8(16), PointerArg);
   BranchInst::Create(Merge, Left);
   BranchInst::Create(Merge, Right);
   B.SetInsertPoint(Merge);
   LoadInst *LoadInst = B.CreateLoad(PointerArg);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
   MemorySSAUpdater Updater(&MSSA);
 
   // Before, the load will be a use of a phi. It should be
   // the same after.
   MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(LoadInst));
   MemoryDef *StoreAccess = cast<MemoryDef>(MSSA.getMemoryAccess(StoreInst));
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
   EXPECT_TRUE(isa<MemoryPhi>(DefiningAccess));
   // The load is currently clobbered by one of the phi arguments, so the walker
   // should determine the clobbering access as the phi.
   EXPECT_EQ(DefiningAccess, Walker->getClobberingMemoryAccess(LoadInst));
   Updater.removeMemoryAccess(StoreAccess);
   MSSA.verifyMemorySSA();
   // After removeMemoryAccess, let's see if we got the right accesses
   // The load should still point to the phi ...
   EXPECT_EQ(DefiningAccess, LoadAccess->getDefiningAccess());
   // but we should now get live on entry for the clobbering definition of the
   // load, since the walker will look past the phi node because every argument
   // is the same.
   // XXX: This currently requires either removing the phi or resetting optimized
   // on the load
 
   EXPECT_FALSE(
       MSSA.isLiveOnEntryDef(Walker->getClobberingMemoryAccess(LoadInst)));
   // If we reset optimized, we get live on entry.
   LoadAccess->resetOptimized();
   EXPECT_TRUE(
       MSSA.isLiveOnEntryDef(Walker->getClobberingMemoryAccess(LoadInst)));
   // The phi should now be a two entry phi with two live on entry defs.
   for (const auto &Op : DefiningAccess->operands()) {
     MemoryAccess *Operand = cast<MemoryAccess>(&*Op);
     EXPECT_TRUE(MSSA.isLiveOnEntryDef(Operand));
   }
 
   // Now we try to remove the single valued phi
   Updater.removeMemoryAccess(DefiningAccess);
   MSSA.verifyMemorySSA();
   // Now the load should be a load of live on entry.
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(LoadAccess->getDefiningAccess()));
 }
 
 // We had a bug with caching where the walker would report MemoryDef#3's clobber
 // (below) was MemoryDef#1.
 //
 // define void @F(i8*) {
 //   %A = alloca i8, i8 1
 // ; 1 = MemoryDef(liveOnEntry)
 //   store i8 0, i8* %A
 // ; 2 = MemoryDef(1)
 //   store i8 1, i8* %A
 // ; 3 = MemoryDef(2)
 //   store i8 2, i8* %A
 // }
 TEST_F(MemorySSATest, TestTripleStore) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Value *Alloca = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   StoreInst *S1 = B.CreateStore(ConstantInt::get(Int8, 0), Alloca);
   StoreInst *S2 = B.CreateStore(ConstantInt::get(Int8, 1), Alloca);
   StoreInst *S3 = B.CreateStore(ConstantInt::get(Int8, 2), Alloca);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
 
   unsigned I = 0;
   for (StoreInst *V : {S1, S2, S3}) {
     // Everything should be clobbered by its defining access
     MemoryAccess *DefiningAccess = MSSA.getMemoryAccess(V)->getDefiningAccess();
     MemoryAccess *WalkerClobber = Walker->getClobberingMemoryAccess(V);
     EXPECT_EQ(DefiningAccess, WalkerClobber)
         << "Store " << I << " doesn't have the correct clobbering access";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
 }
 
 // ...And fixing the above bug made it obvious that, when walking, MemorySSA's
 // walker was caching the initial node it walked. This was fine (albeit
 // mostly redundant) unless the initial node being walked is a clobber for the
 // query. In that case, we'd cache that the node clobbered itself.
 TEST_F(MemorySSATest, TestStoreAndLoad) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Value *Alloca = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   Instruction *SI = B.CreateStore(ConstantInt::get(Int8, 0), Alloca);
   Instruction *LI = B.CreateLoad(Alloca);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
 
   MemoryAccess *LoadClobber = Walker->getClobberingMemoryAccess(LI);
   EXPECT_EQ(LoadClobber, MSSA.getMemoryAccess(SI));
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(Walker->getClobberingMemoryAccess(SI)));
 }
 
 // Another bug (related to the above two fixes): It was noted that, given the
 // following code:
 // ; 1 = MemoryDef(liveOnEntry)
 // store i8 0, i8* %1
 //
 // ...A query to getClobberingMemoryAccess(MemoryAccess*, MemoryLocation) would
 // hand back the store (correctly). A later call to
 // getClobberingMemoryAccess(const Instruction*) would also hand back the store
 // (incorrectly; it should return liveOnEntry).
 //
 // This test checks that repeated calls to either function return what they're
 // meant to.
 TEST_F(MemorySSATest, TestStoreDoubleQuery) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Value *Alloca = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   StoreInst *SI = B.CreateStore(ConstantInt::get(Int8, 0), Alloca);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
 
   MemoryAccess *StoreAccess = MSSA.getMemoryAccess(SI);
   MemoryLocation StoreLoc = MemoryLocation::get(SI);
   MemoryAccess *Clobber =
       Walker->getClobberingMemoryAccess(StoreAccess, StoreLoc);
   MemoryAccess *LiveOnEntry = Walker->getClobberingMemoryAccess(SI);
 
   EXPECT_EQ(Clobber, StoreAccess);
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(LiveOnEntry));
 
   // Try again (with entries in the cache already) for good measure...
   Clobber = Walker->getClobberingMemoryAccess(StoreAccess, StoreLoc);
   LiveOnEntry = Walker->getClobberingMemoryAccess(SI);
   EXPECT_EQ(Clobber, StoreAccess);
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(LiveOnEntry));
 }
 
 // Bug: During phi optimization, the walker wouldn't cache to the proper result
 // in the farthest-walked BB.
 //
 // Specifically, it would assume that whatever we walked to was a clobber.
 // "Whatever we walked to" isn't a clobber if we hit a cache entry.
 //
 // ...So, we need a test case that looks like:
 //    A
 //   / \
 //  B   |
 //   \ /
 //    C
 //
 // Where, when we try to optimize a thing in 'C', a blocker is found in 'B'.
 // The walk must determine that the blocker exists by using cache entries *while
 // walking* 'B'.
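 //
 // In the test below (a rough mapping only, nothing beyond what the code
 // does): 'A' is the entry block holding the allocas, 'B' is IfThen with the
 // stores and loads to %a and %b, and 'C' is IfEnd, where BelowPhi is the
 // store whose clobber we query.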
 TEST_F(MemorySSATest, PartialWalkerCacheWithPhis) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "A", F));
   Type *Int8 = Type::getInt8Ty(C);
   Constant *One = ConstantInt::get(Int8, 1);
   Constant *Zero = ConstantInt::get(Int8, 0);
   Value *AllocA = B.CreateAlloca(Int8, One, "a");
   Value *AllocB = B.CreateAlloca(Int8, One, "b");
   BasicBlock *IfThen = BasicBlock::Create(C, "B", F);
   BasicBlock *IfEnd = BasicBlock::Create(C, "C", F);
 
   B.CreateCondBr(UndefValue::get(Type::getInt1Ty(C)), IfThen, IfEnd);
 
   B.SetInsertPoint(IfThen);
   Instruction *FirstStore = B.CreateStore(Zero, AllocA);
   B.CreateStore(Zero, AllocB);
   Instruction *ALoad0 = B.CreateLoad(AllocA, "");
   Instruction *BStore = B.CreateStore(Zero, AllocB);
   // Due to use optimization/etc. we make a store to A, which is removed after
   // we build MSSA. This helps keep the test case simple-ish.
   Instruction *KillStore = B.CreateStore(Zero, AllocA);
   Instruction *ALoad = B.CreateLoad(AllocA, "");
   B.CreateBr(IfEnd);
 
   B.SetInsertPoint(IfEnd);
   Instruction *BelowPhi = B.CreateStore(Zero, AllocA);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
   MemorySSAUpdater Updater(&MSSA);
 
   // Kill `KillStore`; it exists solely so that the load after it won't be
   // optimized to FirstStore.
   Updater.removeMemoryAccess(MSSA.getMemoryAccess(KillStore));
   KillStore->eraseFromParent();
   auto *ALoadMA = cast<MemoryUse>(MSSA.getMemoryAccess(ALoad));
   EXPECT_EQ(ALoadMA->getDefiningAccess(), MSSA.getMemoryAccess(BStore));
 
   // Populate the cache for the store to AllocB directly after FirstStore. It
   // should point to something in block B (so something in D can't be optimized
   // to it).
   MemoryAccess *Load0Clobber = Walker->getClobberingMemoryAccess(ALoad0);
   EXPECT_EQ(MSSA.getMemoryAccess(FirstStore), Load0Clobber);
 
   // If the bug exists, this will introduce a bad cache entry for %a on BStore.
   // It will point to the store to %b after FirstStore. This only happens during
   // phi optimization.
   MemoryAccess *BottomClobber = Walker->getClobberingMemoryAccess(BelowPhi);
   MemoryAccess *Phi = MSSA.getMemoryAccess(IfEnd);
   EXPECT_EQ(BottomClobber, Phi);
 
   // This query will first check the cache for {%a, BStore}. It should point to
   // FirstStore, not to the store after FirstStore.
   MemoryAccess *UseClobber = Walker->getClobberingMemoryAccess(ALoad);
   EXPECT_EQ(UseClobber, MSSA.getMemoryAccess(FirstStore));
 }
 
 // Test that our walker properly handles loads with the invariant group
 // attribute. It's a bit hacky, since we add the invariant attribute *after*
 // building MSSA. Otherwise, the use optimizer will optimize it for us, which
 // isn't what we want.
 // FIXME: It may be easier/cleaner to just add an 'optimize uses?' flag to MSSA.
 TEST_F(MemorySSATest, WalkerInvariantLoadOpt) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Constant *One = ConstantInt::get(Int8, 1);
   Value *AllocA = B.CreateAlloca(Int8, One, "");
 
   Instruction *Store = B.CreateStore(One, AllocA);
   Instruction *Load = B.CreateLoad(AllocA);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
 
   auto *LoadMA = cast<MemoryUse>(MSSA.getMemoryAccess(Load));
   auto *StoreMA = cast<MemoryDef>(MSSA.getMemoryAccess(Store));
   EXPECT_EQ(LoadMA->getDefiningAccess(), StoreMA);
 
   // ...At the time of writing, no cache should exist for LoadMA. Be a bit
   // flexible to future changes.
   Walker->invalidateInfo(LoadMA);
   Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(C, {}));
 
   MemoryAccess *LoadClobber = Walker->getClobberingMemoryAccess(LoadMA);
   EXPECT_EQ(LoadClobber, MSSA.getLiveOnEntryDef());
 }
 
 // Test loads get reoptimized properly by the walker.
 TEST_F(MemorySSATest, WalkerReopt) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Value *AllocaA = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   Instruction *SIA = B.CreateStore(ConstantInt::get(Int8, 0), AllocaA);
   Value *AllocaB = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "B");
   Instruction *SIB = B.CreateStore(ConstantInt::get(Int8, 0), AllocaB);
   Instruction *LIA = B.CreateLoad(AllocaA);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
   MemorySSAUpdater Updater(&MSSA);
 
   MemoryAccess *LoadClobber = Walker->getClobberingMemoryAccess(LIA);
   MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(LIA));
   EXPECT_EQ(LoadClobber, MSSA.getMemoryAccess(SIA));
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(Walker->getClobberingMemoryAccess(SIA)));
   Updater.removeMemoryAccess(LoadAccess);
 
   // Create the load memory access pointing to an unoptimized place.
  MemoryUse *NewLoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
      LIA, MSSA.getMemoryAccess(SIB), LIA->getParent(), MemorySSA::End));
  // This should cause it to be optimized.
   EXPECT_EQ(Walker->getClobberingMemoryAccess(NewLoadAccess), LoadClobber);
   EXPECT_EQ(NewLoadAccess->getDefiningAccess(), LoadClobber);
 }
 
 // Test out MemorySSAUpdater::moveBefore
 TEST_F(MemorySSATest, MoveAboveMemoryDef) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
 
   Type *Int8 = Type::getInt8Ty(C);
   Value *A = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   Value *B_ = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "B");
   Value *C = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "C");
 
   StoreInst *StoreA0 = B.CreateStore(ConstantInt::get(Int8, 0), A);
   StoreInst *StoreB = B.CreateStore(ConstantInt::get(Int8, 0), B_);
   LoadInst *LoadB = B.CreateLoad(B_);
   StoreInst *StoreA1 = B.CreateStore(ConstantInt::get(Int8, 4), A);
   StoreInst *StoreC = B.CreateStore(ConstantInt::get(Int8, 4), C);
   StoreInst *StoreA2 = B.CreateStore(ConstantInt::get(Int8, 4), A);
   LoadInst *LoadC = B.CreateLoad(C);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker &Walker = *Analyses->Walker;
 
   MemorySSAUpdater Updater(&MSSA);
   StoreC->moveBefore(StoreB);
  Updater.moveBefore(cast<MemoryDef>(MSSA.getMemoryAccess(StoreC)),
                     cast<MemoryDef>(MSSA.getMemoryAccess(StoreB)));
 
   MSSA.verifyMemorySSA();
 
   EXPECT_EQ(MSSA.getMemoryAccess(StoreB)->getDefiningAccess(),
             MSSA.getMemoryAccess(StoreC));
   EXPECT_EQ(MSSA.getMemoryAccess(StoreC)->getDefiningAccess(),
             MSSA.getMemoryAccess(StoreA0));
   EXPECT_EQ(MSSA.getMemoryAccess(StoreA2)->getDefiningAccess(),
             MSSA.getMemoryAccess(StoreA1));
   EXPECT_EQ(Walker.getClobberingMemoryAccess(LoadB),
             MSSA.getMemoryAccess(StoreB));
   EXPECT_EQ(Walker.getClobberingMemoryAccess(LoadC),
             MSSA.getMemoryAccess(StoreC));
 
   // exercise block numbering
   EXPECT_TRUE(MSSA.locallyDominates(MSSA.getMemoryAccess(StoreC),
                                     MSSA.getMemoryAccess(StoreB)));
   EXPECT_TRUE(MSSA.locallyDominates(MSSA.getMemoryAccess(StoreA1),
                                     MSSA.getMemoryAccess(StoreA2)));
 }
 
 TEST_F(MemorySSATest, Irreducible) {
   // Create the equivalent of
   // x = something
   // if (...)
   //    goto second_loop_entry
   // while (...) {
   // second_loop_entry:
   // }
   // use(x)
 
  SmallVector<PHINode *, 8> Inserted;
   IRBuilder<> B(C);
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
 
   // Make blocks
   BasicBlock *IfBB = BasicBlock::Create(C, "if", F);
   BasicBlock *LoopStartBB = BasicBlock::Create(C, "loopstart", F);
   BasicBlock *LoopMainBB = BasicBlock::Create(C, "loopmain", F);
   BasicBlock *AfterLoopBB = BasicBlock::Create(C, "afterloop", F);
   B.SetInsertPoint(IfBB);
   B.CreateCondBr(B.getTrue(), LoopMainBB, LoopStartBB);
   B.SetInsertPoint(LoopStartBB);
   B.CreateBr(LoopMainBB);
   B.SetInsertPoint(LoopMainBB);
   B.CreateCondBr(B.getTrue(), LoopStartBB, AfterLoopBB);
   B.SetInsertPoint(AfterLoopBB);
   Argument *FirstArg = &*F->arg_begin();
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAUpdater Updater(&MSSA);
  // Create the load memory access
  LoadInst *LoadInst = B.CreateLoad(FirstArg);
  MemoryUse *LoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       LoadInst, nullptr, AfterLoopBB, MemorySSA::Beginning));
   Updater.insertUse(LoadAccess);
   MSSA.verifyMemorySSA();
 }
 
 TEST_F(MemorySSATest, MoveToBeforeLiveOnEntryInvalidatesCache) {
   // Create:
   //   %1 = alloca i8
   //   ; 1 = MemoryDef(liveOnEntry)
   //   store i8 0, i8* %1
   //   ; 2 = MemoryDef(1)
   //   store i8 0, i8* %1
   //
   // ...And be sure that MSSA's caching doesn't give us `1` for the clobber of
   // `2` after `1` is removed.
   IRBuilder<> B(C);
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
 
   BasicBlock *Entry = BasicBlock::Create(C, "if", F);
   B.SetInsertPoint(Entry);
 
   Value *A = B.CreateAlloca(B.getInt8Ty());
   StoreInst *StoreA = B.CreateStore(B.getInt8(0), A);
   StoreInst *StoreB = B.CreateStore(B.getInt8(0), A);
 
   setupAnalyses();
 
   MemorySSA &MSSA = *Analyses->MSSA;
 
  auto *DefA = cast<MemoryDef>(MSSA.getMemoryAccess(StoreA));
  auto *DefB = cast<MemoryDef>(MSSA.getMemoryAccess(StoreB));
 
   MemoryAccess *BClobber = MSSA.getWalker()->getClobberingMemoryAccess(DefB);
   ASSERT_EQ(DefA, BClobber);
 
   MemorySSAUpdater(&MSSA).removeMemoryAccess(DefA);
   StoreA->eraseFromParent();
 
   EXPECT_EQ(DefB->getDefiningAccess(), MSSA.getLiveOnEntryDef());
 
   EXPECT_EQ(MSSA.getWalker()->getClobberingMemoryAccess(DefB),
             MSSA.getLiveOnEntryDef())
       << "(DefA = " << DefA << ")";
 }
 
 TEST_F(MemorySSATest, RemovingDefInvalidatesCache) {
   // Create:
   //   %x = alloca i8
   //   %y = alloca i8
   //   ; 1 = MemoryDef(liveOnEntry)
   //   store i8 0, i8* %x
   //   ; 2 = MemoryDef(1)
   //   store i8 0, i8* %y
   //   ; 3 = MemoryDef(2)
   //   store i8 0, i8* %x
   //
   // And be sure that MSSA's caching handles the removal of def `1`
   // appropriately.
   IRBuilder<> B(C);
   F = Function::Create(
       FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
       GlobalValue::ExternalLinkage, "F", &M);
 
   BasicBlock *Entry = BasicBlock::Create(C, "if", F);
   B.SetInsertPoint(Entry);
 
   Value *X = B.CreateAlloca(B.getInt8Ty());
   Value *Y = B.CreateAlloca(B.getInt8Ty());
   StoreInst *StoreX1 = B.CreateStore(B.getInt8(0), X);
   StoreInst *StoreY = B.CreateStore(B.getInt8(0), Y);
   StoreInst *StoreX2 = B.CreateStore(B.getInt8(0), X);
 
   setupAnalyses();
 
   MemorySSA &MSSA = *Analyses->MSSA;
 
  auto *DefX1 = cast<MemoryDef>(MSSA.getMemoryAccess(StoreX1));
  auto *DefY = cast<MemoryDef>(MSSA.getMemoryAccess(StoreY));
  auto *DefX2 = cast<MemoryDef>(MSSA.getMemoryAccess(StoreX2));
 
   EXPECT_EQ(DefX2->getDefiningAccess(), DefY);
   MemoryAccess *X2Clobber = MSSA.getWalker()->getClobberingMemoryAccess(DefX2);
   ASSERT_EQ(DefX1, X2Clobber);
 
   MemorySSAUpdater(&MSSA).removeMemoryAccess(DefX1);
   StoreX1->eraseFromParent();
 
   EXPECT_EQ(DefX2->getDefiningAccess(), DefY);
   EXPECT_EQ(MSSA.getWalker()->getClobberingMemoryAccess(DefX2),
             MSSA.getLiveOnEntryDef())
       << "(DefX1 = " << DefX1 << ")";
 }
 
 // Test Must alias for optimized uses
 TEST_F(MemorySSATest, TestLoadMustAlias) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Value *AllocaA = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   Value *AllocaB = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "B");
 
   B.CreateStore(ConstantInt::get(Int8, 1), AllocaB);
   // Check load from LOE
   LoadInst *LA1 = B.CreateLoad(AllocaA, "");
   // Check load alias cached for second load
   LoadInst *LA2 = B.CreateLoad(AllocaA, "");
 
   B.CreateStore(ConstantInt::get(Int8, 1), AllocaA);
   // Check load from store/def
   LoadInst *LA3 = B.CreateLoad(AllocaA, "");
   // Check load alias cached for second load
   LoadInst *LA4 = B.CreateLoad(AllocaA, "");
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
 
   unsigned I = 0;
   for (LoadInst *V : {LA1, LA2}) {
    MemoryUse *MemUse = dyn_cast_or_null<MemoryUse>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemUse->getOptimizedAccessType(), None)
         << "Load " << I << " doesn't have the correct alias information";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
   for (LoadInst *V : {LA3, LA4}) {
    MemoryUse *MemUse = dyn_cast_or_null<MemoryUse>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemUse->getOptimizedAccessType(), MustAlias)
         << "Load " << I << " doesn't have the correct alias information";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
 }
 
 // Test Must alias for optimized defs.
 TEST_F(MemorySSATest, TestStoreMustAlias) {
   F = Function::Create(FunctionType::get(B.getVoidTy(), {}, false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   Value *AllocaA = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
   Value *AllocaB = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "B");
   StoreInst *SA1 = B.CreateStore(ConstantInt::get(Int8, 1), AllocaA);
   StoreInst *SB1 = B.CreateStore(ConstantInt::get(Int8, 1), AllocaB);
   StoreInst *SA2 = B.CreateStore(ConstantInt::get(Int8, 2), AllocaA);
   StoreInst *SB2 = B.CreateStore(ConstantInt::get(Int8, 2), AllocaB);
   StoreInst *SA3 = B.CreateStore(ConstantInt::get(Int8, 3), AllocaA);
   StoreInst *SB3 = B.CreateStore(ConstantInt::get(Int8, 3), AllocaB);
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
 
   unsigned I = 0;
   for (StoreInst *V : {SA1, SB1, SA2, SB2, SA3, SB3}) {
    MemoryDef *MemDef = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemDef->isOptimized(), false)
         << "Store " << I << " is optimized from the start?";
     EXPECT_EQ(MemDef->getOptimizedAccessType(), MayAlias)
         << "Store " << I
         << " has correct alias information before being optimized?";
     if (V == SA1)
       Walker->getClobberingMemoryAccess(V);
     else {
       MemoryAccess *Def = MemDef->getDefiningAccess();
       MemoryAccess *Clob = Walker->getClobberingMemoryAccess(V);
       EXPECT_NE(Def, Clob) << "Store " << I
                            << " has Defining Access equal to Clobbering Access";
     }
     EXPECT_EQ(MemDef->isOptimized(), true)
         << "Store " << I << " was not optimized";
     if (I == 0 || I == 1)
       EXPECT_EQ(MemDef->getOptimizedAccessType(), None)
           << "Store " << I << " doesn't have the correct alias information";
     else
       EXPECT_EQ(MemDef->getOptimizedAccessType(), MustAlias)
           << "Store " << I << " doesn't have the correct alias information";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
 }
 
 // Test May alias for optimized uses.
 TEST_F(MemorySSATest, TestLoadMayAlias) {
   F = Function::Create(FunctionType::get(B.getVoidTy(),
                                          {B.getInt8PtrTy(), B.getInt8PtrTy()},
                                          false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   auto *ArgIt = F->arg_begin();
   Argument *PointerA = &*ArgIt;
   Argument *PointerB = &*(++ArgIt);
   B.CreateStore(ConstantInt::get(Int8, 1), PointerB);
   LoadInst *LA1 = B.CreateLoad(PointerA, "");
   B.CreateStore(ConstantInt::get(Int8, 0), PointerA);
   LoadInst *LB1 = B.CreateLoad(PointerB, "");
   B.CreateStore(ConstantInt::get(Int8, 0), PointerA);
   LoadInst *LA2 = B.CreateLoad(PointerA, "");
   B.CreateStore(ConstantInt::get(Int8, 0), PointerB);
   LoadInst *LB2 = B.CreateLoad(PointerB, "");
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
 
   unsigned I = 0;
   for (LoadInst *V : {LA1, LB1}) {
    MemoryUse *MemUse = dyn_cast_or_null<MemoryUse>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemUse->getOptimizedAccessType(), MayAlias)
         << "Load " << I << " doesn't have the correct alias information";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
   for (LoadInst *V : {LA2, LB2}) {
    MemoryUse *MemUse = dyn_cast_or_null<MemoryUse>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemUse->getOptimizedAccessType(), MustAlias)
         << "Load " << I << " doesn't have the correct alias information";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
 }
 
 // Test May alias for optimized defs.
 TEST_F(MemorySSATest, TestStoreMayAlias) {
   F = Function::Create(FunctionType::get(B.getVoidTy(),
                                          {B.getInt8PtrTy(), B.getInt8PtrTy()},
                                          false),
                        GlobalValue::ExternalLinkage, "F", &M);
   B.SetInsertPoint(BasicBlock::Create(C, "", F));
   Type *Int8 = Type::getInt8Ty(C);
   auto *ArgIt = F->arg_begin();
   Argument *PointerA = &*ArgIt;
   Argument *PointerB = &*(++ArgIt);
   Value *AllocaC = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "C");
   // Store into arg1, must alias because it's LOE => must
   StoreInst *SA1 = B.CreateStore(ConstantInt::get(Int8, 0), PointerA);
   // Store into arg2, may alias store to arg1 => may
   StoreInst *SB1 = B.CreateStore(ConstantInt::get(Int8, 1), PointerB);
  // Store into alloca, no alias with args, so must alias LOE => must
   StoreInst *SC1 = B.CreateStore(ConstantInt::get(Int8, 2), AllocaC);
   // Store into arg1, may alias store to arg2 => may
   StoreInst *SA2 = B.CreateStore(ConstantInt::get(Int8, 3), PointerA);
   // Store into arg2, may alias store to arg1 => may
   StoreInst *SB2 = B.CreateStore(ConstantInt::get(Int8, 4), PointerB);
  // Store into alloca, no alias with args, so must alias SC1 => must
   StoreInst *SC2 = B.CreateStore(ConstantInt::get(Int8, 5), AllocaC);
   // Store into arg2, must alias store to arg2 => must
   StoreInst *SB3 = B.CreateStore(ConstantInt::get(Int8, 6), PointerB);
  std::initializer_list<StoreInst *> Sts = {SA1, SB1, SC1, SA2, SB2, SC2, SB3};
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
 
   unsigned I = 0;
   for (StoreInst *V : Sts) {
    MemoryDef *MemDef = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemDef->isOptimized(), false)
         << "Store " << I << " is optimized from the start?";
     EXPECT_EQ(MemDef->getOptimizedAccessType(), MayAlias)
         << "Store " << I
         << " has correct alias information before being optimized?";
     ++I;
   }
 
   for (StoreInst *V : Sts)
     Walker->getClobberingMemoryAccess(V);
 
   I = 0;
   for (StoreInst *V : Sts) {
    MemoryDef *MemDef = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(V));
     EXPECT_EQ(MemDef->isOptimized(), true)
         << "Store " << I << " was not optimized";
     if (I == 1 || I == 3 || I == 4)
       EXPECT_EQ(MemDef->getOptimizedAccessType(), MayAlias)
           << "Store " << I << " doesn't have the correct alias information";
     else if (I == 0 || I == 2)
       EXPECT_EQ(MemDef->getOptimizedAccessType(), None)
           << "Store " << I << " doesn't have the correct alias information";
     else
       EXPECT_EQ(MemDef->getOptimizedAccessType(), MustAlias)
           << "Store " << I << " doesn't have the correct alias information";
     // EXPECT_EQ expands such that if we increment I above, it won't get
     // incremented except when we try to print the error message.
     ++I;
   }
 }
+
+TEST_F(MemorySSATest, LifetimeMarkersAreClobbers) {
+  // Example code:
+  // define void @a(i8* %foo) {
+  //   %bar = getelementptr i8, i8* %foo, i64 1
+  //   store i8 0, i8* %foo
+  //   store i8 0, i8* %bar
+  //   call void @llvm.lifetime.end.p0i8(i64 8, i8* %foo)
+  //   call void @llvm.lifetime.start.p0i8(i64 8, i8* %foo)
+  //   store i8 0, i8* %foo
+  //   store i8 0, i8* %bar
+  //   ret void
+  // }
+  //
+  // Patterns like this are possible after inlining; the stores to %foo and %bar
+  // should both be clobbered by the lifetime.start call if they're dominated by
+  // it.
+
+  IRBuilder<> B(C);
+  F = Function::Create(
+      FunctionType::get(B.getVoidTy(), {B.getInt8PtrTy()}, false),
+      GlobalValue::ExternalLinkage, "F", &M);
+
+  // Make blocks
+  BasicBlock *Entry = BasicBlock::Create(C, "entry", F);
+
+  B.SetInsertPoint(Entry);
+  Value *Foo = &*F->arg_begin();
+
+  Value *Bar = B.CreateGEP(Foo, B.getInt64(1), "bar");
+
+  B.CreateStore(B.getInt8(0), Foo);
+  B.CreateStore(B.getInt8(0), Bar);
+
+  auto GetLifetimeIntrinsic = [&](Intrinsic::ID ID) {
+    return Intrinsic::getDeclaration(&M, ID, {Foo->getType()});
+  };
+
+  B.CreateCall(GetLifetimeIntrinsic(Intrinsic::lifetime_end),
+               {B.getInt64(2), Foo});
+  Instruction *LifetimeStart = B.CreateCall(
+      GetLifetimeIntrinsic(Intrinsic::lifetime_start), {B.getInt64(2), Foo});
+
+  Instruction *FooStore = B.CreateStore(B.getInt8(0), Foo);
+  Instruction *BarStore = B.CreateStore(B.getInt8(0), Bar);
+
+  setupAnalyses();
+  MemorySSA &MSSA = *Analyses->MSSA;
+
+  MemoryAccess *LifetimeStartAccess = MSSA.getMemoryAccess(LifetimeStart);
+  ASSERT_NE(LifetimeStartAccess, nullptr);
+
+  MemoryAccess *FooAccess = MSSA.getMemoryAccess(FooStore);
+  ASSERT_NE(FooAccess, nullptr);
+
+  MemoryAccess *BarAccess = MSSA.getMemoryAccess(BarStore);
+  ASSERT_NE(BarAccess, nullptr);
+
+  MemoryAccess *FooClobber =
+      MSSA.getWalker()->getClobberingMemoryAccess(FooAccess);
+  EXPECT_EQ(FooClobber, LifetimeStartAccess);
+
+  MemoryAccess *BarClobber =
+      MSSA.getWalker()->getClobberingMemoryAccess(BarAccess);
+  EXPECT_EQ(BarClobber, LifetimeStartAccess);
+}
Index: vendor/llvm/dist-release_70/utils/lit/lit/Test.py
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/lit/Test.py	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/lit/Test.py	(revision 338000)
@@ -1,398 +1,403 @@
 import os
 from xml.sax.saxutils import quoteattr
 from json import JSONEncoder
 
 from lit.BooleanExpression import BooleanExpression
 
 # Test result codes.
 
 class ResultCode(object):
     """Test result codes."""
 
     # We override __new__ and __getnewargs__ to ensure that pickling still
     # provides unique ResultCode objects in any particular instance.
     _instances = {}
     def __new__(cls, name, isFailure):
         res = cls._instances.get(name)
         if res is None:
             cls._instances[name] = res = super(ResultCode, cls).__new__(cls)
         return res
     def __getnewargs__(self):
         return (self.name, self.isFailure)
 
     def __init__(self, name, isFailure):
         self.name = name
         self.isFailure = isFailure
 
     def __repr__(self):
         return '%s%r' % (self.__class__.__name__,
                          (self.name, self.isFailure))
 
 PASS        = ResultCode('PASS', False)
 FLAKYPASS   = ResultCode('FLAKYPASS', False)
 XFAIL       = ResultCode('XFAIL', False)
 FAIL        = ResultCode('FAIL', True)
 XPASS       = ResultCode('XPASS', True)
 UNRESOLVED  = ResultCode('UNRESOLVED', True)
 UNSUPPORTED = ResultCode('UNSUPPORTED', False)
 TIMEOUT     = ResultCode('TIMEOUT', True)
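
# A minimal illustrative sketch (not used by lit) of the property the
# __new__/__getnewargs__ pair above provides: with pickle protocol 2+, the
# unpickler calls ResultCode.__new__ with the saved (name, isFailure)
# arguments, which hands back the already-interned instance, so identity is
# preserved across a round trip.
def _example_result_code_pickle_roundtrip():
    import pickle
    restored = pickle.loads(pickle.dumps(PASS, 2))
    assert restored is PASS        # same object, not merely an equal copy
    assert restored.isFailure is False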
 
 # Test metric values.
 
 class MetricValue(object):
     def format(self):
         """
         format() -> str
 
         Convert this metric to a string suitable for displaying as part of the
         console output.
         """
         raise RuntimeError("abstract method")
 
     def todata(self):
         """
         todata() -> json-serializable data
 
         Convert this metric to content suitable for serializing in the JSON test
         output.
         """
         raise RuntimeError("abstract method")
 
 class IntMetricValue(MetricValue):
     def __init__(self, value):
         self.value = value
 
     def format(self):
         return str(self.value)
 
     def todata(self):
         return self.value
 
 class RealMetricValue(MetricValue):
     def __init__(self, value):
         self.value = value
 
     def format(self):
         return '%.4f' % self.value
 
     def todata(self):
         return self.value
 
 class JSONMetricValue(MetricValue):
     """
         JSONMetricValue is used for types that are representable in the output
         but that are otherwise uninterpreted.
     """
     def __init__(self, value):
        # Ensure the value is serializable by trying to encode it.
         # WARNING: The value may change before it is encoded again, and may
         #          not be encodable after the change.
         try:
             e = JSONEncoder()
             e.encode(value)
         except TypeError:
             raise
         self.value = value
 
     def format(self):
         e = JSONEncoder(indent=2, sort_keys=True)
         return e.encode(self.value)
 
     def todata(self):
         return self.value
 
 def toMetricValue(value):
     if isinstance(value, MetricValue):
         return value
     elif isinstance(value, int):
         return IntMetricValue(value)
     elif isinstance(value, float):
         return RealMetricValue(value)
     else:
         # 'long' is only present in python2
         try:
             if isinstance(value, long):
                 return IntMetricValue(value)
         except NameError:
             pass
 
         # Try to create a JSONMetricValue and let the constructor throw
         # if value is not a valid type.
         return JSONMetricValue(value)
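
# An illustrative sketch (not used by lit) of how the helpers above dispatch:
# ints and floats get their dedicated wrappers, while anything else must
# survive a JSONEncoder round trip or toMetricValue() raises TypeError.
def _example_metric_value_dispatch():
    assert isinstance(toMetricValue(42), IntMetricValue)
    assert isinstance(toMetricValue(0.25), RealMetricValue)
    assert isinstance(toMetricValue({'cold': 1.2, 'warm': 0.3}), JSONMetricValue)
    try:
        toMetricValue(object())    # not JSON-serializable
        assert False, "expected TypeError"
    except TypeError:
        pass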
 
 
 # Test results.
 
 class Result(object):
     """Wrapper for the results of executing an individual test."""
 
     def __init__(self, code, output='', elapsed=None):
         # The result code.
         self.code = code
         # The test output.
         self.output = output
         # The wall timing to execute the test, if timing.
         self.elapsed = elapsed
         # The metrics reported by this test.
         self.metrics = {}
         # The micro-test results reported by this test.
         self.microResults = {}
 
     def addMetric(self, name, value):
         """
         addMetric(name, value)
 
         Attach a test metric to the test result, with the given name and list of
         values. It is an error to attempt to attach the metrics with the same
         name multiple times.
 
         Each value must be an instance of a MetricValue subclass.
         """
         if name in self.metrics:
             raise ValueError("result already includes metrics for %r" % (
                     name,))
         if not isinstance(value, MetricValue):
             raise TypeError("unexpected metric value: %r" % (value,))
         self.metrics[name] = value
 
     def addMicroResult(self, name, microResult):
         """
         addMicroResult(microResult)
 
         Attach a micro-test result to the test result, with the given name and
         result.  It is an error to attempt to attach a micro-test with the 
         same name multiple times.
 
         Each micro-test result must be an instance of the Result class.
         """
         if name in self.microResults:
             raise ValueError("Result already includes microResult for %r" % (
                    name,))
         if not isinstance(microResult, Result):
             raise TypeError("unexpected MicroResult value %r" % (microResult,))
         self.microResults[name] = microResult
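
# A brief illustrative sketch (not used by lit) of attaching metrics and
# micro-test results; the names 'compile_time' and 'subtest-a' are made up.
def _example_result_usage():
    res = Result(PASS, output='', elapsed=0.5)
    res.addMetric('compile_time', RealMetricValue(1.25))
    res.addMicroResult('subtest-a', Result(PASS, elapsed=0.1))
    try:
        # A second metric under the same name is rejected.
        res.addMetric('compile_time', IntMetricValue(2))
        assert False, "expected ValueError"
    except ValueError:
        pass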
 
 
 # Test classes.
 
 class TestSuite:
     """TestSuite - Information on a group of tests.
 
     A test suite groups together a set of logically related tests.
     """
 
     def __init__(self, name, source_root, exec_root, config):
         self.name = name
         self.source_root = source_root
         self.exec_root = exec_root
         # The test suite configuration.
         self.config = config
 
     def getSourcePath(self, components):
         return os.path.join(self.source_root, *components)
 
     def getExecPath(self, components):
         return os.path.join(self.exec_root, *components)
 
 class Test:
     """Test - Information on a single test instance."""
 
     def __init__(self, suite, path_in_suite, config, file_path = None):
         self.suite = suite
         self.path_in_suite = path_in_suite
         self.config = config
         self.file_path = file_path
 
         # A list of conditions under which this test is expected to fail.
         # Each condition is a boolean expression of features and target
         # triple parts. These can optionally be provided by test format
         # handlers, and will be honored when the test result is supplied.
         self.xfails = []
 
         # A list of conditions that must be satisfied before running the test.
         # Each condition is a boolean expression of features. All of them
         # must be True for the test to run.
         # FIXME should target triple parts count here too?
         self.requires = []
 
         # A list of conditions that prevent execution of the test.
         # Each condition is a boolean expression of features and target
         # triple parts. All of them must be False for the test to run.
         self.unsupported = []
 
         # The test result, once complete.
         self.result = None
 
     def setResult(self, result):
         if self.result is not None:
             raise ValueError("test result already set")
         if not isinstance(result, Result):
             raise ValueError("unexpected result type")
 
         self.result = result
 
         # Apply the XFAIL handling to resolve the result exit code.
         try:
             if self.isExpectedToFail():
                 if self.result.code == PASS:
                     self.result.code = XPASS
                 elif self.result.code == FAIL:
                     self.result.code = XFAIL
         except ValueError as e:
             # Syntax error in an XFAIL line.
             self.result.code = UNRESOLVED
             self.result.output = str(e)
         
     def getFullName(self):
         return self.suite.config.name + ' :: ' + '/'.join(self.path_in_suite)
 
     def getFilePath(self):
         if self.file_path:
             return self.file_path
         return self.getSourcePath()
 
     def getSourcePath(self):
         return self.suite.getSourcePath(self.path_in_suite)
 
     def getExecPath(self):
         return self.suite.getExecPath(self.path_in_suite)
 
     def isExpectedToFail(self):
         """
         isExpectedToFail() -> bool
 
         Check whether this test is expected to fail in the current
         configuration. This check relies on the test xfails property which by
         some test formats may not be computed until the test has first been
         executed.
         Throws ValueError if an XFAIL line has a syntax error.
         """
 
         features = self.config.available_features
         triple = getattr(self.suite.config, 'target_triple', "")
 
         # Check if any of the xfails match an available feature or the target.
         for item in self.xfails:
             # If this is the wildcard, it always fails.
             if item == '*':
                 return True
 
             # If this is a True expression of features and target triple parts,
             # it fails.
             try:
                 if BooleanExpression.evaluate(item, features, triple):
                     return True
             except ValueError as e:
                 raise ValueError('Error in XFAIL list:\n%s' % str(e))
 
         return False
 
     def isWithinFeatureLimits(self):
         """
         isWithinFeatureLimits() -> bool
 
         A test is within the feature limits set by run_only_tests if
         1. the test's requirements ARE satisfied by the available features
         2. the test's requirements ARE NOT satisfied after the limiting
            features are removed from the available features
 
         Throws ValueError if a REQUIRES line has a syntax error.
         """
 
         if not self.config.limit_to_features:
             return True  # No limits. Run it.
 
         # Check the requirements as-is (#1)
         if self.getMissingRequiredFeatures():
             return False
 
         # Check the requirements after removing the limiting features (#2)
         featuresMinusLimits = [f for f in self.config.available_features
                                if not f in self.config.limit_to_features]
         if not self.getMissingRequiredFeaturesFromList(featuresMinusLimits):
             return False
 
         return True
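
    # An illustrative sketch (not part of lit's API) of the two conditions
    # above; the 'asan'/'shell' feature names are made up.
    def _example_feature_limits(self):
        self.config.limit_to_features = {'asan'}
        self.config.available_features = {'asan', 'shell'}
        self.requires = ['asan']
        # Requires the limiting feature: within the limits, so it runs.
        assert self.isWithinFeatureLimits()
        self.requires = ['shell']
        # Satisfiable even without 'asan': outside the limits, so skipped.
        assert not self.isWithinFeatureLimits()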
 
     def getMissingRequiredFeaturesFromList(self, features):
         try:
             return [item for item in self.requires
                     if not BooleanExpression.evaluate(item, features)]
         except ValueError as e:
             raise ValueError('Error in REQUIRES list:\n%s' % str(e))
 
     def getMissingRequiredFeatures(self):
         """
         getMissingRequiredFeatures() -> list of strings
 
        Returns a list of features from REQUIRES that are not satisfied.
         Throws ValueError if a REQUIRES line has a syntax error.
         """
 
         features = self.config.available_features
         return self.getMissingRequiredFeaturesFromList(features)
 
     def getUnsupportedFeatures(self):
         """
         getUnsupportedFeatures() -> list of strings
 
         Returns a list of features from UNSUPPORTED that are present
         in the test configuration's features or target triple.
         Throws ValueError if an UNSUPPORTED line has a syntax error.
         """
 
         features = self.config.available_features
         triple = getattr(self.suite.config, 'target_triple', "")
 
         try:
             return [item for item in self.unsupported
                     if BooleanExpression.evaluate(item, features, triple)]
         except ValueError as e:
             raise ValueError('Error in UNSUPPORTED list:\n%s' % str(e))
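
    # An illustrative sketch (not part of lit's API) of how the xfails,
    # requires and unsupported lists above are evaluated; the feature names
    # and triple are made up.
    def _example_boolean_expression_queries(self):
        features = {'shell', 'system-linux', 'asan'}
        triple = 'x86_64-unknown-linux-gnu'
        # An XFAIL/UNSUPPORTED entry fires when its expression is true.
        assert BooleanExpression.evaluate('asan && system-linux', features, triple)
        assert not BooleanExpression.evaluate('!asan', features, triple)
        # A REQUIRES entry is reported missing when its expression is false.
        assert not BooleanExpression.evaluate('system-windows', features)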
 
     def isEarlyTest(self):
         """
         isEarlyTest() -> bool
 
         Check whether this test should be executed early in a particular run.
         This can be used for test suites with long running tests to maximize
         parallelism or where it is desirable to surface their failures early.
         """
         return self.suite.config.is_early
 
     def writeJUnitXML(self, fil):
         """Write the test's report xml representation to a file handle."""
         test_name = quoteattr(self.path_in_suite[-1])
         test_path = self.path_in_suite[:-1]
         safe_test_path = [x.replace(".","_") for x in test_path]
         safe_name = self.suite.name.replace(".","-")
 
         if safe_test_path:
             class_name = safe_name + "." + "/".join(safe_test_path) 
         else:
             class_name = safe_name + "." + safe_name
         class_name = quoteattr(class_name)
        testcase_template = '<testcase classname={class_name} name={test_name} time="{time:.2f}"'
        elapsed_time = self.result.elapsed if self.result.elapsed is not None else 0.0
        testcase_xml = testcase_template.format(class_name=class_name, test_name=test_name, time=elapsed_time)
        fil.write(testcase_xml)
        if self.result.code.isFailure:
            fil.write(">\n\t<failure ><![CDATA[")
            # Escape any ']]>' in the output by splitting the CDATA section.
            fil.write(self.result.output.replace("]]>", "]]]]><![CDATA[>"))
            fil.write("]]></failure>\n</testcase>")
         elif self.result.code == UNSUPPORTED:
             unsupported_features = self.getMissingRequiredFeatures()
             if unsupported_features:
                 skip_message = "Skipping because of: " + ", ".join(unsupported_features)
             else:
                 skip_message = "Skipping because of configuration."
 
             fil.write(">\n\t\n\n".format(quoteattr(skip_message)))
         else:
             fil.write("/>")
Index: vendor/llvm/dist-release_70/utils/lit/lit/llvm/config.py
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/lit/llvm/config.py	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/lit/llvm/config.py	(revision 338000)
@@ -1,466 +1,467 @@
 import os
 import platform
 import re
 import subprocess
 import sys
 
 import lit.util
 from lit.llvm.subst import FindTool
 from lit.llvm.subst import ToolSubst
 
 
 def binary_feature(on, feature, off_prefix):
     return feature if on else off_prefix + feature
 
 
 class LLVMConfig(object):
 
     def __init__(self, lit_config, config):
         self.lit_config = lit_config
         self.config = config
 
         features = config.available_features
 
         self.use_lit_shell = False
        # Tweak PATH on Win32 to decide whether to use bash.exe.
         if sys.platform == 'win32':
             # For tests that require Windows to run.
             features.add('system-windows')
 
            # Seek sane tools in the given directories and add them to $PATH.
             path = self.lit_config.getToolsPath(config.lit_tools_dir,
                                                 config.environment['PATH'],
                                                 ['cmp.exe', 'grep.exe', 'sed.exe'])
             if path is not None:
                 self.with_environment('PATH', path, append_path=True)
             self.use_lit_shell = True
 
         # Choose between lit's internal shell pipeline runner and a real shell.  If
         # LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override.
         lit_shell_env = os.environ.get('LIT_USE_INTERNAL_SHELL')
         if lit_shell_env:
             self.use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
         if not self.use_lit_shell:
             features.add('shell')
 
         # Running on Darwin OS
         if platform.system() == 'Darwin':
             # FIXME: lld uses the first, other projects use the second.
             # We should standardize on the former.
             features.add('system-linker-mach-o')
             features.add('system-darwin')
         elif platform.system() == 'Windows':
             # For tests that require Windows to run.
             features.add('system-windows')
         elif platform.system() == "Linux":
             features.add('system-linux')
 
         # Native compilation: host arch == default triple arch
        # Both of these values should probably be in every site config (e.g. as
        # part of the standard header), but currently they aren't.
         host_triple = getattr(config, 'host_triple', None)
         target_triple = getattr(config, 'target_triple', None)
         if host_triple and host_triple == target_triple:
             features.add('native')
 
         # Sanitizers.
         sanitizers = getattr(config, 'llvm_use_sanitizer', '')
         sanitizers = frozenset(x.lower() for x in sanitizers.split(';'))
         features.add(binary_feature('address' in sanitizers, 'asan', 'not_'))
         features.add(binary_feature('memory' in sanitizers, 'msan', 'not_'))
         features.add(binary_feature(
             'undefined' in sanitizers, 'ubsan', 'not_'))
 
         have_zlib = getattr(config, 'have_zlib', None)
         features.add(binary_feature(have_zlib, 'zlib', 'no'))
 
         # Check if we should run long running tests.
         long_tests = lit_config.params.get('run_long_tests', None)
         if lit.util.pythonize_bool(long_tests):
             features.add('long_tests')
 
         if target_triple:
             if re.match(r'^x86_64.*-apple', target_triple):
                 host_cxx = getattr(config, 'host_cxx', None)
                 if 'address' in sanitizers and self.get_clang_has_lsan(host_cxx, target_triple):
                     self.with_environment(
                         'ASAN_OPTIONS', 'detect_leaks=1', append_path=True)
             if re.match(r'^x86_64.*-linux', target_triple):
                 features.add('x86_64-linux')
             if re.match(r'.*-win32$', target_triple):
                 features.add('target-windows')
 
         use_gmalloc = lit_config.params.get('use_gmalloc', None)
         if lit.util.pythonize_bool(use_gmalloc):
             # Allow use of an explicit path for gmalloc library.
             # Will default to '/usr/lib/libgmalloc.dylib' if not set.
             gmalloc_path_str = lit_config.params.get('gmalloc_path',
                                                      '/usr/lib/libgmalloc.dylib')
             if gmalloc_path_str is not None:
                 self.with_environment(
                     'DYLD_INSERT_LIBRARIES', gmalloc_path_str)
 
     def with_environment(self, variable, value, append_path=False):
         if append_path:
             # For paths, we should be able to take a list of them and process all
             # of them.
             paths_to_add = value
             if lit.util.is_string(paths_to_add):
                 paths_to_add = [paths_to_add]
 
             def norm(x):
                 return os.path.normcase(os.path.normpath(x))
 
             current_paths = self.config.environment.get(variable, None)
             if current_paths:
                 current_paths = current_paths.split(os.path.pathsep)
                 paths = [norm(p) for p in current_paths]
             else:
                 paths = []
 
             # If we are passed a list [a b c], then iterating this list forwards
             # and adding each to the beginning would result in b c a.  So we
             # need to iterate in reverse to end up with the original ordering.
             for p in reversed(paths_to_add):
                 # Move it to the front if it already exists, otherwise insert it at the
                 # beginning.
                 p = norm(p)
                 try:
                     paths.remove(p)
                 except ValueError:
                     pass
                 paths = [p] + paths
             value = os.pathsep.join(paths)
         self.config.environment[variable] = value
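
    # An illustrative sketch (not used by lit) of the ordering contract above:
    # prepending ['a', 'b'] to an existing PATH of 'b' + os.pathsep + 'c'
    # yields 'a', 'b', 'c', with the duplicate 'b' moved to the front rather
    # than repeated. The path names are placeholders.
    def _example_with_environment_ordering(self):
        self.config.environment['PATH'] = os.pathsep.join(['b', 'c'])
        self.with_environment('PATH', ['a', 'b'], append_path=True)
        assert self.config.environment['PATH'] == os.pathsep.join(['a', 'b', 'c'])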
 
     def with_system_environment(self, variables, append_path=False):
         if lit.util.is_string(variables):
             variables = [variables]
         for v in variables:
             value = os.environ.get(v)
             if value:
                 self.with_environment(v, value, append_path)
 
     def clear_environment(self, variables):
         for name in variables:
             if name in self.config.environment:
                 del self.config.environment[name]
 
     def get_process_output(self, command):
         try:
             cmd = subprocess.Popen(
                 command, stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE, env=self.config.environment)
             stdout, stderr = cmd.communicate()
             stdout = lit.util.to_string(stdout)
             stderr = lit.util.to_string(stderr)
             return (stdout, stderr)
         except OSError:
             self.lit_config.fatal('Could not run process %s' % command)
 
     def feature_config(self, features):
         # Ask llvm-config about the specified feature.
         arguments = [x for (x, _) in features]
         config_path = os.path.join(self.config.llvm_tools_dir, 'llvm-config')
 
         output, _ = self.get_process_output([config_path] + arguments)
         lines = output.split('\n')
 
         for (feature_line, (_, patterns)) in zip(lines, features):
             # We should have either a callable or a dictionary.  If it's a
             # dictionary, grep each key against the output and use the value if
             # it matches.  If it's a callable, it does the entire translation.
             if callable(patterns):
                 features_to_add = patterns(feature_line)
                 self.config.available_features.update(features_to_add)
             else:
                 for (re_pattern, feature) in patterns.items():
                     if re.search(re_pattern, feature_line):
                         self.config.available_features.add(feature)
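
    # A hypothetical call (queries and feature names are illustrative) showing
    # the two pattern forms feature_config() accepts: a regex-to-feature dict
    # matched against the corresponding llvm-config output line, or a callable
    # that returns the features to add itself.
    def _example_feature_config_call(self):
        self.feature_config(
            [('--assertion-mode', {'ON': 'asserts'}),
             ('--build-mode', lambda line: [line.strip().lower()])])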
 
    # Note that when substituting %clang_cc1, we also fill in the include
    # directory of the builtin headers. Those are part of even a freestanding
    # environment, but Clang relies on the driver to locate them.
     def get_clang_builtin_include_dir(self, clang):
         # FIXME: Rather than just getting the version, we should have clang print
         # out its resource dir here in an easy to scrape form.
         clang_dir, _ = self.get_process_output(
             [clang, '-print-file-name=include'])
 
         if not clang_dir:
             self.lit_config.fatal(
                 "Couldn't find the include dir for Clang ('%s')" % clang)
 
         clang_dir = clang_dir.strip()
         if sys.platform in ['win32'] and not self.use_lit_shell:
             # Don't pass dosish path separator to msys bash.exe.
             clang_dir = clang_dir.replace('\\', '/')
        # Ensure the result is an ASCII string, across Python 2.5+ and Python 3.
         return clang_dir
 
     # On macOS, LSan is only supported on clang versions 5 and higher
     def get_clang_has_lsan(self, clang, triple):
         if not clang:
             self.lit_config.warning(
                 'config.host_cxx is unset but test suite is configured to use sanitizers.')
             return False
 
         clang_binary = clang.split()[0]
         version_string, _ = self.get_process_output(
             [clang_binary, '--version'])
         if not 'clang' in version_string:
             self.lit_config.warning(
                 "compiler '%s' does not appear to be clang, " % clang_binary +
                 'but test suite is configured to use sanitizers.')
             return False
 
         if re.match(r'.*-linux', triple):
             return True
 
         if re.match(r'^x86_64.*-apple', triple):
             version_regex = re.search(r'version ([0-9]+)\.([0-9]+).([0-9]+)', version_string)
             major_version_number = int(version_regex.group(1))
             minor_version_number = int(version_regex.group(2))
             patch_version_number = int(version_regex.group(3))
             if 'Apple LLVM' in version_string:
                 # Apple LLVM doesn't yet support LSan
                 return False
             else:
                 return major_version_number >= 5
 
         return False
 
     def make_itanium_abi_triple(self, triple):
         m = re.match(r'(\w+)-(\w+)-(\w+)', triple)
         if not m:
             self.lit_config.fatal(
                 "Could not turn '%s' into Itanium ABI triple" % triple)
         if m.group(3).lower() != 'win32':
             # All non-win32 triples use the Itanium ABI.
             return triple
         return m.group(1) + '-' + m.group(2) + '-mingw32'
 
     def make_msabi_triple(self, triple):
         m = re.match(r'(\w+)-(\w+)-(\w+)', triple)
         if not m:
             self.lit_config.fatal(
                 "Could not turn '%s' into MS ABI triple" % triple)
         isa = m.group(1).lower()
         vendor = m.group(2).lower()
         os = m.group(3).lower()
         if os == 'win32':
             # If the OS is win32, we're done.
             return triple
         if isa.startswith('x86') or isa == 'amd64' or re.match(r'i\d86', isa):
             # For x86 ISAs, adjust the OS.
             return isa + '-' + vendor + '-win32'
         # -win32 is not supported for non-x86 targets; use a default.
         return 'i686-pc-win32'
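
    # A few worked examples (triples chosen for illustration) that follow the
    # regex logic of the two helpers above.
    def _example_abi_triple_rewrites(self):
        assert self.make_itanium_abi_triple('x86_64-pc-win32') == 'x86_64-pc-mingw32'
        assert self.make_itanium_abi_triple('x86_64-pc-linux') == 'x86_64-pc-linux'
        assert self.make_msabi_triple('i686-pc-linux') == 'i686-pc-win32'
        assert self.make_msabi_triple('aarch64-unknown-linux') == 'i686-pc-win32'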
 
     def add_tool_substitutions(self, tools, search_dirs=None):
         if not search_dirs:
             search_dirs = [self.config.llvm_tools_dir]
 
         if lit.util.is_string(search_dirs):
             search_dirs = [search_dirs]
 
         tools = [x if isinstance(x, ToolSubst) else ToolSubst(x)
                  for x in tools]
 
         search_dirs = os.pathsep.join(search_dirs)
         substitutions = []
 
         for tool in tools:
             match = tool.resolve(self, search_dirs)
 
             # Either no match occurred, or there was an unresolved match that
             # is ignored.
             if not match:
                 continue
 
             subst_key, tool_pipe, command = match
 
             # An unresolved match occurred that can't be ignored.  Fail without
             # adding any of the previously-discovered substitutions.
             if not command:
                 return False
 
             substitutions.append((subst_key, tool_pipe + command))
 
         self.config.substitutions.extend(substitutions)
         return True
 
     def use_default_substitutions(self):
         tool_patterns = [
             ToolSubst('FileCheck', unresolved='fatal'),
             # Handle these specially as they are strings searched for during testing.
             ToolSubst(r'\| \bcount\b', command=FindTool(
                 'count'), verbatim=True, unresolved='fatal'),
             ToolSubst(r'\| \bnot\b', command=FindTool('not'), verbatim=True, unresolved='fatal')]
 
-        self.config.substitutions.append(('%python', sys.executable))
+        self.config.substitutions.append(('%python', '"%s"' % (sys.executable)))
+
         self.add_tool_substitutions(
             tool_patterns, [self.config.llvm_tools_dir])
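
    # An illustrative sketch (a simplified stand-in for lit's real substitution
    # pass, with a made-up interpreter path) of what quoting %python buys:
    # interpreter paths containing spaces survive the shell's word splitting.
    def _example_python_substitution_expansion(self):
        python = r'C:\Program Files\Python36\python.exe'
        substitution = ('%python', '"%s"' % python)
        run_line = '%python %s/check_args.py'
        run_line = run_line.replace(*substitution)
        assert run_line.startswith('"C:\\Program Files')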
 
     def use_llvm_tool(self, name, search_env=None, required=False, quiet=False):
         """Find the executable program 'name', optionally using the specified
         environment variable as an override before searching the
         configuration's PATH."""
         # If the override is specified in the environment, use it without
         # validation.
         if search_env:
             tool = self.config.environment.get(search_env)
             if tool:
                 return tool
 
         # Otherwise look in the path.
         tool = lit.util.which(name, self.config.environment['PATH'])
 
         if required and not tool:
             message = "couldn't find '{}' program".format(name)
             if search_env:
                 message = message + \
                     ', try setting {} in your environment'.format(search_env)
             self.lit_config.fatal(message)
 
         if tool:
             tool = os.path.normpath(tool)
             if not self.lit_config.quiet and not quiet:
                 self.lit_config.note('using {}: {}'.format(name, tool))
         return tool
 
     def use_clang(self, required=True):
         """Configure the test suite to be able to invoke clang.
 
         Sets up some environment variables important to clang, locates a
         just-built or installed clang, and add a set of standard
         substitutions useful to any test suite that makes use of clang.
 
         """
         # Clear some environment variables that might affect Clang.
         #
        # This first set of vars is read by Clang, but shouldn't affect tests
         # that aren't specifically looking for these features, or are required
         # simply to run the tests at all.
         #
         # FIXME: Should we have a tool that enforces this?
 
         # safe_env_vars = ('TMPDIR', 'TEMP', 'TMP', 'USERPROFILE', 'PWD',
         #                  'MACOSX_DEPLOYMENT_TARGET', 'IPHONEOS_DEPLOYMENT_TARGET',
         #                  'VCINSTALLDIR', 'VC100COMNTOOLS', 'VC90COMNTOOLS',
         #                  'VC80COMNTOOLS')
         possibly_dangerous_env_vars = ['COMPILER_PATH', 'RC_DEBUG_OPTIONS',
                                        'CINDEXTEST_PREAMBLE_FILE', 'LIBRARY_PATH',
                                        'CPATH', 'C_INCLUDE_PATH', 'CPLUS_INCLUDE_PATH',
                                        'OBJC_INCLUDE_PATH', 'OBJCPLUS_INCLUDE_PATH',
                                        'LIBCLANG_TIMING', 'LIBCLANG_OBJTRACKING',
                                        'LIBCLANG_LOGGING', 'LIBCLANG_BGPRIO_INDEX',
                                        'LIBCLANG_BGPRIO_EDIT', 'LIBCLANG_NOTHREADS',
                                        'LIBCLANG_RESOURCE_USAGE',
                                        'LIBCLANG_CODE_COMPLETION_LOGGING']
         # Clang/Win32 may refer to %INCLUDE%. vsvarsall.bat sets it.
         if platform.system() != 'Windows':
             possibly_dangerous_env_vars.append('INCLUDE')
 
         self.clear_environment(possibly_dangerous_env_vars)
 
         # Tweak the PATH to include the tools dir and the scripts dir.
        # Put Clang first to keep LLVM from overriding out-of-tree clang builds.
         possible_paths = ['clang_tools_dir', 'llvm_tools_dir']
         paths = [getattr(self.config, pp) for pp in possible_paths
                  if getattr(self.config, pp, None)]
         self.with_environment('PATH', paths, append_path=True)
 
         paths = [self.config.llvm_shlib_dir, self.config.llvm_libs_dir]
         self.with_environment('LD_LIBRARY_PATH', paths, append_path=True)
 
         # Discover the 'clang' and 'clangcc' to use.
 
         self.config.clang = self.use_llvm_tool(
             'clang', search_env='CLANG', required=required)
 
         self.config.substitutions.append(
             ('%llvmshlibdir', self.config.llvm_shlib_dir))
         self.config.substitutions.append(
             ('%pluginext', self.config.llvm_plugin_ext))
 
         builtin_include_dir = self.get_clang_builtin_include_dir(self.config.clang)
         tool_substitutions = [
             ToolSubst('%clang', command=self.config.clang),
             ToolSubst('%clang_analyze_cc1', command='%clang_cc1', extra_args=['-analyze', '%analyze']),
             ToolSubst('%clang_cc1', command=self.config.clang, extra_args=['-cc1', '-internal-isystem', builtin_include_dir, '-nostdsysteminc']),
             ToolSubst('%clang_cpp', command=self.config.clang, extra_args=['--driver-mode=cpp']),
             ToolSubst('%clang_cl', command=self.config.clang, extra_args=['--driver-mode=cl']),
             ToolSubst('%clangxx', command=self.config.clang, extra_args=['--driver-mode=g++']),
             ]
         self.add_tool_substitutions(tool_substitutions)
 
         self.config.substitutions.append(('%itanium_abi_triple',
                                           self.make_itanium_abi_triple(self.config.target_triple)))
         self.config.substitutions.append(('%ms_abi_triple',
                                           self.make_msabi_triple(self.config.target_triple)))
         self.config.substitutions.append(
             ('%resource_dir', builtin_include_dir))
 
         # The host triple might not be set, at least if we're compiling clang from
         # an already installed llvm.
         if self.config.host_triple and self.config.host_triple != '@LLVM_HOST_TRIPLE@':
             self.config.substitutions.append(('%target_itanium_abi_host_triple',
                                               '--target=%s' % self.make_itanium_abi_triple(self.config.host_triple)))
         else:
             self.config.substitutions.append(
                 ('%target_itanium_abi_host_triple', ''))
 
         self.config.substitutions.append(
             ('%src_include_dir', self.config.clang_src_dir + '/include'))
 
         # FIXME: Find nicer way to prohibit this.
         self.config.substitutions.append(
             (' clang ', """*** Do not use 'clang' in tests, use '%clang'. ***"""))
         self.config.substitutions.append(
             (' clang\+\+ ', """*** Do not use 'clang++' in tests, use '%clangxx'. ***"""))
         self.config.substitutions.append(
             (' clang-cc ',
              """*** Do not use 'clang-cc' in tests, use '%clang_cc1'. ***"""))
         self.config.substitutions.append(
             (' clang -cc1 -analyze ',
              """*** Do not use 'clang -cc1 -analyze' in tests, use '%clang_analyze_cc1'. ***"""))
         self.config.substitutions.append(
             (' clang -cc1 ',
              """*** Do not use 'clang -cc1' in tests, use '%clang_cc1'. ***"""))
         self.config.substitutions.append(
             (' %clang-cc1 ',
              """*** invalid substitution, use '%clang_cc1'. ***"""))
         self.config.substitutions.append(
             (' %clang-cpp ',
              """*** invalid substitution, use '%clang_cpp'. ***"""))
         self.config.substitutions.append(
             (' %clang-cl ',
              """*** invalid substitution, use '%clang_cl'. ***"""))
 
     def use_lld(self, required=True):
         """Configure the test suite to be able to invoke lld.
 
         Sets up some environment variables important to lld, locates a
         just-built or installed lld, and add a set of standard
         substitutions useful to any test suite that makes use of lld.
 
         """
         # Tweak the PATH to include the tools dir
         tool_dirs = [self.config.llvm_tools_dir]
         lib_dirs = [self.config.llvm_libs_dir]
         lld_tools_dir = getattr(self.config, 'lld_tools_dir', None)
         lld_libs_dir = getattr(self.config, 'lld_libs_dir', None)
 
         if lld_tools_dir:
             tool_dirs = tool_dirs + [lld_tools_dir]
         if lld_libs_dir:
             lib_dirs = lib_dirs + [lld_libs_dir]
 
         self.with_environment('PATH', tool_dirs, append_path=True)
         self.with_environment('LD_LIBRARY_PATH', lib_dirs, append_path=True)
 
         tool_patterns = ['lld', 'ld.lld', 'lld-link', 'ld64.lld', 'wasm-ld']
 
         self.add_tool_substitutions(tool_patterns, tool_dirs)
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-env/lit.cfg
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-env/lit.cfg	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-env/lit.cfg	(revision 338000)
@@ -1,9 +1,9 @@
 import lit.formats
 config.name = 'shtest-env'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
 config.test_source_root = None
 config.test_exec_root = None
 config.environment['FOO'] = '1'
 config.environment['BAR'] = '2'
-config.substitutions.append(('%{python}', sys.executable))
+config.substitutions.append(('%{python}', '"%s"' % (sys.executable)))
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-format/external_shell/fail_with_bad_encoding.txt
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-format/external_shell/fail_with_bad_encoding.txt	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-format/external_shell/fail_with_bad_encoding.txt	(revision 338000)
@@ -1,5 +1,5 @@
 # Run a command that fails with error on stdout.
 #
-# RUN: "%{python}" %S/write-bad-encoding.py
+# RUN: %{python} %S/write-bad-encoding.py
 # RUN: false
 
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-format/lit.cfg
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-format/lit.cfg	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-format/lit.cfg	(revision 338000)
@@ -1,9 +1,9 @@
 import lit.formats
 config.name = 'shtest-format'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
 config.test_source_root = None
 config.test_exec_root = None
 config.target_triple = 'x86_64-unknown-unknown'
 config.available_features.add('a-present-feature')
-config.substitutions.append(('%{python}', sys.executable))
+config.substitutions.append(('%{python}', "'%s'" % (sys.executable)))
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/dev-null.txt
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/dev-null.txt	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/dev-null.txt	(revision 338000)
@@ -1,14 +1,14 @@
 # Check handling of /dev/null in command line options
 # On windows, it should be redirected to a temp file.
 #
-# RUN: "%{python}" %S/check_args.py --my_arg /dev/null | FileCheck %s --check-prefix=CHECK1
+# RUN: %{python} %S/check_args.py --my_arg /dev/null | FileCheck %s --check-prefix=CHECK1
 # CHECK1: OK
 
-# RUN: "%{python}" %S/check_args.py --my_arg=/dev/null | FileCheck %s --check-prefix=CHECK2
+# RUN: %{python} %S/check_args.py --my_arg=/dev/null | FileCheck %s --check-prefix=CHECK2
 # CHECK2: OK
 
-# RUN: "%{python}" %S/check_args.py -a /dev/null | FileCheck %s --check-prefix=CHECK3
+# RUN: %{python} %S/check_args.py -a /dev/null | FileCheck %s --check-prefix=CHECK3
 # CHECK3: OK
 
-# RUN: "%{python}" %S/check_args.py -a=/dev/null | FileCheck %s --check-prefix=CHECK4
+# RUN: %{python} %S/check_args.py -a=/dev/null | FileCheck %s --check-prefix=CHECK4
 # CHECK4: OK
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/lit.cfg
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/lit.cfg	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/lit.cfg	(revision 338000)
@@ -1,7 +1,7 @@
 import lit.formats
 config.name = 'shtest-shell'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
 config.test_source_root = None
 config.test_exec_root = None
-config.substitutions.append(('%{python}', sys.executable))
+config.substitutions.append(('%{python}', '"%s"' % (sys.executable)))
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/redirects.txt
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/redirects.txt	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/redirects.txt	(revision 338000)
@@ -1,41 +1,41 @@
 # Check stdout redirect (> and >>).
 #
 # RUN: echo "not-present" > %t.stdout-write
 # RUN: echo "is-present" > %t.stdout-write
 # RUN: FileCheck --check-prefix=STDOUT-WRITE < %t.stdout-write %s
 #
 # STDOUT-WRITE-NOT: not-present
 # STDOUT-WRITE: is-present
 #
 # RUN: echo "appended-line" >> %t.stdout-write
 # RUN: FileCheck --check-prefix=STDOUT-APPEND < %t.stdout-write %s
 #
 # STDOUT-APPEND: is-present
 # STDOUT-APPEND: appended-line
 
 
 # Check stderr redirect (2> and 2>>).
 #
 # RUN: echo "not-present" > %t.stderr-write
-# RUN: "%{python}" %S/write-to-stderr.py 2> %t.stderr-write
+# RUN: %{python} %S/write-to-stderr.py 2> %t.stderr-write
 # RUN: FileCheck --check-prefix=STDERR-WRITE < %t.stderr-write %s
 #
 # STDERR-WRITE-NOT: not-present
 # STDERR-WRITE: a line on stderr
 #
-# RUN: "%{python}" %S/write-to-stderr.py 2>> %t.stderr-write
+# RUN: %{python} %S/write-to-stderr.py 2>> %t.stderr-write
 # RUN: FileCheck --check-prefix=STDERR-APPEND < %t.stderr-write %s
 #
 # STDERR-APPEND: a line on stderr
 # STDERR-APPEND: a line on stderr
 
 
 # Check combined redirect (&>).
 #
 # RUN: echo "not-present" > %t.combined
-# RUN: "%{python}" %S/write-to-stdout-and-stderr.py &> %t.combined
+# RUN: %{python} %S/write-to-stdout-and-stderr.py &> %t.combined
 # RUN: FileCheck --check-prefix=COMBINED-WRITE < %t.combined %s
 #
 # COMBINED-WRITE-NOT: not-present
 # COMBINED-WRITE: a line on stdout
 # COMBINED-WRITE: a line on stderr
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt	(revision 338000)
@@ -1,171 +1,171 @@
 # Check rm file operations.
 # Check force remove commands success whether the file does or doesn't exist.
 #
 # RUN: rm -f %t.write
-# RUN: "%{python}" %S/check_path.py file %t.write > %t.out
+# RUN: %{python} %S/check_path.py file %t.write > %t.out
 # RUN: FileCheck --check-prefix=REMOVE-FILE < %t.out %s
 # RUN: echo "create a temp file" > %t.write
-# RUN: "%{python}" %S/check_path.py file %t.write > %t.out
+# RUN: %{python} %S/check_path.py file %t.write > %t.out
 # RUN: FileCheck --check-prefix=FILE-EXIST < %t.out %s
 # RUN: rm -f %t.write
-# RUN: "%{python}" %S/check_path.py file %t.write > %t.out
+# RUN: %{python} %S/check_path.py file %t.write > %t.out
 # RUN: FileCheck --check-prefix=REMOVE-FILE < %t.out %s
 #
 # REMOVE-FILE: False
 # FILE-EXIST: True
 #
 # Check mkdir and rm folder operations.
 # Check force remove commands success whether the directory does or doesn't exist.
 #
 # Check the mkdir command with -p option.
 # RUN: rm -f -r %T/test
-# RUN: "%{python}" %S/check_path.py dir %T/test > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test > %t.out
 # RUN: FileCheck --check-prefix=REMOVE-PARENT-DIR < %t.out %s
 # RUN: mkdir -p %T/test
-# RUN: "%{python}" %S/check_path.py dir %T/test > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test > %t.out
 # RUN: FileCheck --check-prefix=MAKE-PARENT-DIR < %t.out %s
 # RUN: rm -f %T/test || true
 # RUN: rm -f -r %T/test
-# RUN: "%{python}" %S/check_path.py dir %T/test > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test > %t.out
 # RUN: FileCheck --check-prefix=REMOVE-PARENT-DIR < %t.out %s
 #
 # MAKE-PARENT-DIR: True
 # REMOVE-PARENT-DIR: False
 #
 # Check the mkdir command without -p option.
 #
 # RUN: rm -rf %T/test1
 # RUN: mkdir %T/test1
-# RUN: "%{python}" %S/check_path.py dir %T/test1 > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test1 > %t.out
 # RUN: FileCheck --check-prefix=MAKE-DIR < %t.out %s
 # RUN: cd %T/test1 && mkdir foo
-# RUN: "%{python}" %S/check_path.py dir %T/test1 > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test1 > %t.out
 # RUN: FileCheck --check-prefix=MAKE-DIR < %t.out %s
 # RUN: cd %T && rm -rf %T/test1
-# RUN: "%{python}" %S/check_path.py dir %T/test1 > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test1 > %t.out
 # RUN: FileCheck --check-prefix=REMOVE-DIR < %t.out %s
 #
 # MAKE-DIR: True
 # REMOVE-DIR: False
 #
 # Check creating and removing multiple folders and rm * operation.
 #
 # RUN: rm -rf %T/test
 # RUN: mkdir -p %T/test/test1 %T/test/test2
-# RUN: "%{python}" %S/check_path.py dir %T/test %T/test/test1 %T/test/test2 > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test %T/test/test1 %T/test/test2 > %t.out
 # RUN: FileCheck --check-prefix=DIRS-EXIST < %t.out %s
 # RUN: mkdir %T/test || true
 # RUN: echo "create a temp file" > %T/test/temp.write
 # RUN: echo "create a temp1 file" > %T/test/test1/temp1.write
 # RUN: echo "create a temp2 file" > %T/test/test2/temp2.write
-# RUN: "%{python}" %S/check_path.py file %T/test/temp.write %T/test/test1/temp1.write %T/test/test2/temp2.write> %t.out
+# RUN: %{python} %S/check_path.py file %T/test/temp.write %T/test/test1/temp1.write %T/test/test2/temp2.write> %t.out
 # RUN: FileCheck --check-prefix=FILES-EXIST < %t.out %s
 # RUN: rm -r -f %T/*
-# RUN: "%{python}" %S/check_path.py dir %T/test > %t.out
+# RUN: %{python} %S/check_path.py dir %T/test > %t.out
 # RUN: FileCheck --check-prefix=REMOVE-ALL < %t.out %s
 #
 # DIRS-EXIST: True
 # DIRS-EXIST-NEXT: True
 # DIRS-EXIST-NEXT: True
 # FILES-EXIST: True
 # FILES-EXIST-NEXT: True
 # FILES-EXIST-NEXT: True
 # REMOVE-ALL: False
 #
 # Check diff operations.
 #
 # RUN: echo "hello" > %t.stdout
 # RUN: echo "hello" > %t1.stdout
 # RUN: diff %t.stdout %t1.stdout
 # RUN: diff -u %t.stdout %t1.stdout
 # RUN: echo "hello-2" > %t1.stdout
 # RUN: diff %t.stdout %t1.stdout || true
 #
 # RUN: mkdir -p %T/dir1 %T/dir2
 # RUN: cd %T/dir1 && echo "hello" > temp1.txt
 # RUN: cd %T/dir2 && echo "hello" > temp2.txt
 # RUN: diff temp2.txt ../dir1/temp1.txt
 #
 # Check cat command with single file.
 #
 # RUN: rm -rf %T/testCat
 # RUN: mkdir -p %T/testCat
 # RUN: echo "abcdefgh" > %T/testCat/temp.write
 # RUN: cat %T/testCat/temp.write > %T/testCat/tempcat.write
-# RUN: "%{python}" %S/check_path.py file %T/testCat/tempcat.write > %T/testCat/path.out
+# RUN: %{python} %S/check_path.py file %T/testCat/tempcat.write > %T/testCat/path.out
 # RUN: FileCheck --check-prefix=FILE-EXISTS < %T/testCat/path.out %s
 # RUN: FileCheck --check-prefix=CAT-OUTPUT < %T/testCat/tempcat.write %s
 # FILE-EXISTS: True
 # CAT-OUTPUT: abcdefgh
 #
 # Check cat command with multiple files.
 #
 # RUN: rm -rf %T/testCat
 # RUN: mkdir -p %T/testCat
 # RUN: echo "abcdefgh" > %T/testCat/temp1.write
 # RUN: echo "efghijkl" > %T/testCat/temp2.write
 # RUN: echo "mnopqrst" > %T/testCat/temp3.write
 # RUN: cat %T/testCat/temp1.write %T/testCat/temp2.write %T/testCat/temp3.write > %T/testCat/tempmulticat.write
-# RUN: "%{python}" %S/check_path.py file %T/testCat/tempmulticat.write > %T/testCat/path.out
+# RUN: %{python} %S/check_path.py file %T/testCat/tempmulticat.write > %T/testCat/path.out
 # RUN: FileCheck --check-prefix=MULTI-FILE-EXISTS < %T/testCat/path.out %s
 # RUN: FileCheck --check-prefix=MULTI-CAT-OUTPUT < %T/testCat/tempmulticat.write %s
 # MULTI-FILE-EXISTS: True
 # MULTI-CAT-OUTPUT: abcdefgh
 # MULTI-CAT-OUTPUT-NEXT: efghijkl
 # MULTI-CAT-OUTPUT-NEXT: mnopqrst
 #
 # Check cat command with multiple files and piped output to FileCheck.
 #
 # RUN: rm -rf %T/testCat
 # RUN: mkdir -p %T/testCat
 # RUN: echo "abcdefgh" > %T/testCat/temp1.write
 # RUN: echo "efghijkl" > %T/testCat/temp2.write
 # RUN: cat %T/testCat/temp1.write %T/testCat/temp2.write | FileCheck --check-prefix=PIPED-CAT-OUTPUT %s
 # PIPED-CAT-OUTPUT: abcdefgh
 # PIPED-CAT-OUTPUT-NEXT: efghijkl
 #
 # Check cat command with multiple files and glob expressions.
 #
 # RUN: rm -rf %T/testCat
 # RUN: mkdir -p %T/testCat
 # RUN: echo "cvbnm" > %T/testCat/temp1.write
 # RUN: echo "qwerty" > %T/testCat/temp2.write
 # RUN: cat %T/testCat/*.write | FileCheck --check-prefix=GLOB-CAT-OUTPUT %s
 # GLOB-CAT-OUTPUT: cvbnm
 # GLOB-CAT-OUTPUT-NEXT: qwerty
 #
 # Check cat command with -v option
 #
 # RUN: cat -v %S/cat_nonprinting.bin | FileCheck --check-prefix=NP-CAT-OUTPUT %s
 # NP-CAT-OUTPUT: ^@^A^B^C^D^E^F^G	^H
 # NP-CAT-OUTPUT-NEXT: ^K^L^M^N^O^P^Q^R^S
 # NP-CAT-OUTPUT-NEXT: ^T^U^V^W^X^Y^Z^[^\^]^^^_ !"#$%&'
 # NP-CAT-OUTPUT-NEXT: ()*+,-./0123456789:;
 # NP-CAT-OUTPUT-NEXT: <=>?@ABCDEFGHIJKLMNO
 # NP-CAT-OUTPUT-NEXT: PQRSTUVWXYZ[\]^_`abc
 # NP-CAT-OUTPUT-NEXT: defghijklmnopqrstuvw
 # NP-CAT-OUTPUT-NEXT: xyz{|}~^?M-^@M-^AM-^BM-^CM-^DM-^EM-^FM-^GM-^HM-^IM-^JM-^K
 # NP-CAT-OUTPUT-NEXT: M-^LM-^MM-^NM-^OM-^PM-^QM-^RM-^SM-^TM-^UM-^VM-^WM-^XM-^YM-^ZM-^[M-^\M-^]M-^^M-^_
 # NP-CAT-OUTPUT-NEXT: M- M-!M-"M-#M-$M-%M-&M-'M-(M-)M-*M-+M-,M--M-.M-/M-0M-1M-2M-3
 # NP-CAT-OUTPUT-NEXT: M-4M-5M-6M-7M-8M-9M-:M-;M-<M-=M->M-?M-@M-AM-BM-CM-DM-EM-FM-G
 # NP-CAT-OUTPUT-NEXT: M-HM-IM-JM-KM-LM-MM-NM-OM-PM-QM-RM-SM-TM-UM-VM-WM-XM-YM-ZM-[
 # NP-CAT-OUTPUT-NEXT: M-\M-]M-^M-_M-`M-aM-bM-cM-dM-eM-fM-gM-hM-iM-jM-kM-lM-mM-nM-o
 # NP-CAT-OUTPUT-NEXT: M-pM-qM-rM-sM-tM-uM-vM-wM-xM-yM-zM-{M-|M-}M-~M-^?
 #
 # Check cat command with -show-nonprinting option
 #
 # RUN: cat --show-nonprinting %S/cat_nonprinting.bin | FileCheck --check-prefix=NPLONG-CAT-OUTPUT %s
 # NPLONG-CAT-OUTPUT: ^@^A^B^C^D^E^F^G	^H
 # NPLONG-CAT-OUTPUT-NEXT: ^K^L^M^N^O^P^Q^R^S
 # NPLONG-CAT-OUTPUT-NEXT: ^T^U^V^W^X^Y^Z^[^\^]^^^_ !"#$%&'
 # NPLONG-CAT-OUTPUT-NEXT: ()*+,-./0123456789:;
 # NPLONG-CAT-OUTPUT-NEXT: <=>?@ABCDEFGHIJKLMNO
 # NPLONG-CAT-OUTPUT-NEXT: PQRSTUVWXYZ[\]^_`abc
 # NPLONG-CAT-OUTPUT-NEXT: defghijklmnopqrstuvw
 # NPLONG-CAT-OUTPUT-NEXT: xyz{|}~^?M-^@M-^AM-^BM-^CM-^DM-^EM-^FM-^GM-^HM-^IM-^JM-^K
 # NPLONG-CAT-OUTPUT-NEXT: M-^LM-^MM-^NM-^OM-^PM-^QM-^RM-^SM-^TM-^UM-^VM-^WM-^XM-^YM-^ZM-^[M-^\M-^]M-^^M-^_
 # NPLONG-CAT-OUTPUT-NEXT: M- M-!M-"M-#M-$M-%M-&M-'M-(M-)M-*M-+M-,M--M-.M-/M-0M-1M-2M-3
 # NPLONG-CAT-OUTPUT-NEXT: M-4M-5M-6M-7M-8M-9M-:M-;M-<M-=M->M-?M-@M-AM-BM-CM-DM-EM-FM-G
 # NPLONG-CAT-OUTPUT-NEXT: M-HM-IM-JM-KM-LM-MM-NM-OM-PM-QM-RM-SM-TM-UM-VM-WM-XM-YM-ZM-[
 # NPLONG-CAT-OUTPUT-NEXT: M-\M-]M-^M-_M-`M-aM-bM-cM-dM-eM-fM-gM-hM-iM-jM-kM-lM-mM-nM-o
 # NPLONG-CAT-OUTPUT-NEXT: M-pM-qM-rM-sM-tM-uM-vM-wM-xM-yM-zM-{M-|M-}M-~M-^?
Index: vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-timeout/lit.cfg
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-timeout/lit.cfg	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/Inputs/shtest-timeout/lit.cfg	(revision 338000)
@@ -1,32 +1,32 @@
 # -*- Python -*-
 import os
 import sys
 
 import lit.formats
 
 config.name = 'per_test_timeout'
 
 shellType = lit_config.params.get('external', '1')
 
 if shellType == '0':
     lit_config.note('Using internal shell')
     externalShell = False
 else:
     lit_config.note('Using external shell')
     externalShell = True
 
 configSetTimeout = lit_config.params.get('set_timeout', '0')
 
 if configSetTimeout != '0':
     # Try setting the max individual test time in the configuration
     lit_config.maxIndividualTestTime = int(configSetTimeout)
 
 config.test_format = lit.formats.ShTest(execute_external=externalShell)
 config.suffixes = ['.py']
 
 config.test_source_root = os.path.dirname(__file__)
 config.test_exec_root = config.test_source_root
 config.target_triple = '(unused)'
 src_root = os.path.join(config.test_source_root, '..')
 config.environment['PYTHONPATH'] = src_root
-config.substitutions.append(('%{python}', sys.executable))
+config.substitutions.append(('%{python}', '"%s"' % (sys.executable)))
Index: vendor/llvm/dist-release_70/utils/lit/tests/lit.cfg
===================================================================
--- vendor/llvm/dist-release_70/utils/lit/tests/lit.cfg	(revision 337999)
+++ vendor/llvm/dist-release_70/utils/lit/tests/lit.cfg	(revision 338000)
@@ -1,73 +1,74 @@
 # -*- Python -*-
 
 import os
 import sys
 
 import lit.formats
 
 # Configuration file for the 'lit' test runner.
 
 # name: The name of this test suite.
 config.name = 'lit'
 
 # testFormat: The test format to use to interpret tests.
 config.test_format = lit.formats.ShTest(execute_external=False)
 
 # suffixes: A list of file extensions to treat as test files.
 config.suffixes = ['.py']
 
 # excludes: A list of individual files to exclude.
 config.excludes = ['Inputs']
 
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
 config.test_exec_root = config.test_source_root
 
 config.target_triple = '(unused)'
 
 src_root = os.path.join(config.test_source_root, '..')
 llvm_src_root = getattr(config, 'llvm_src_root', None)
 if llvm_src_root != None:
   # ``src_root`` may be in LLVM's binary build directory which does not contain
   # ``lit.py``, so use `llvm_src_root` instead.
   lit_path = os.path.join(llvm_src_root, 'utils', 'lit')
 else:
   lit_path = src_root
 
 config.environment['PYTHONPATH'] = lit_path # Required because some tests import the lit module
 config.substitutions.append(('%{src_root}', src_root))
 config.substitutions.append(('%{inputs}', os.path.join(
             src_root, 'tests', 'Inputs')))
 config.substitutions.append(('%{lit}', "%%{python} %s" % (
             os.path.join(lit_path, 'lit.py'),)))
-config.substitutions.append(('%{python}', sys.executable))
+config.substitutions.append(('%{python}', '"%s"' % (sys.executable)))
+
 
 # Enable coverage.py reporting, assuming the coverage module has been installed
 # and sitecustomize.py in the virtualenv has been modified appropriately.
 if lit_config.params.get('check-coverage', None):
     config.environment['COVERAGE_PROCESS_START'] = os.path.join(
         os.path.dirname(__file__), ".coveragerc")
 
 # Add a feature to detect the Python version.
 config.available_features.add("python%d.%d" % (sys.version_info[0],
                                                   sys.version_info[1]))
 
 # Add a feature to detect if psutil is available
 try:
     import psutil
     lit_config.note('Found python psutil module')
     config.available_features.add("python-psutil")
 except ImportError:
     lit_config.warning('Could not import psutil. Some tests will be skipped and'
                        ' the --timeout command line argument will not work.')
 
 if sys.platform.startswith('win') or sys.platform.startswith('cygwin'):
     config.available_features.add('windows')
 
 # Add llvm and lit tools directories if this config is being loaded indirectly.
 path = config.environment['PATH']
 for attribute in ('llvm_tools_dir', 'lit_tools_dir'):
     directory = getattr(config, attribute, None)
     if directory:
         path = os.path.pathsep.join((directory, path))
 config.environment['PATH'] = path
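The quoting change applied throughout this patch folds double quotes into the %{python} expansion itself, so RUN lines no longer have to quote the substitution by hand, and a Python interpreter installed under a path containing spaces still reaches the shell as a single token. A minimal sketch of that effect, separate from the patch, using a hypothetical Windows-style interpreter path purely for illustration and plain string replacement standing in for lit's substitution machinery:

    # Illustration only; not part of the patch.
    # 'executable' is a made-up path with a space, standing in for sys.executable.
    executable = r'C:\Program Files\Python37\python.exe'

    # Old form: the bare path; a shell would split it into two arguments.
    old_sub = ('%{python}', executable)
    # New form: the expansion carries its own double quotes.
    new_sub = ('%{python}', '"%s"' % (executable,))

    run_line = '%{python} %S/check_args.py --my_arg /dev/null'
    print(run_line.replace(*old_sub))  # C:\Program Files\... (splits at the space)
    print(run_line.replace(*new_sub))  # "C:\Program Files\..." (one quoted token)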